# 01. Загрузка и первичная обработка датасета (EDA)

---

> Импорты и настройки:

In [2]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# Notebook-wide pandas display settings: show every column (no truncation)
# and print up to 100 rows before pandas elides the middle of a frame.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

> Константы:

In [12]:
# Location of the raw dataset file. The default is the author's local path;
# set the GITHUB_USERS_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    "GITHUB_USERS_DATA_PATH",
    "/Users/georgetarasov/Desktop/NIR/github-roles-abm/data/data.json",
)

> Загрузка датасета:

In [4]:
def load_large_json(path, max_records=None):
    """
    Load a JSON array file or a JSON Lines (JSONL) file into a DataFrame.

    The format is detected from the first non-whitespace character of the
    file: ``[`` means a single JSON array; anything else is treated as JSONL
    (one JSON document per line, blank lines ignored).

    Parameters
    ----------
    path : str or os.PathLike
        File to read. Decoded as UTF-8; a leading BOM is tolerated.
    max_records : int, optional
        Upper bound on the number of records to load (e.g. 100000).
        ``None`` (or ``0``) means no limit. For JSONL the file is read only
        up to the limit; for a JSON array the whole file is parsed first and
        then truncated.

    Returns
    -------
    pandas.DataFrame
        One row per loaded record.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the array, or any non-blank JSONL line, is not valid JSON.
    """
    data = []

    # "utf-8-sig" drops a leading BOM if present and otherwise behaves like
    # "utf-8", so BOM-prefixed exports no longer break detection or parsing.
    with open(path, "r", encoding="utf-8-sig") as f:
        # Look past leading whitespace so an array preceded by a newline or
        # indentation is not mistaken for JSONL.
        first_char = f.read(1)
        while first_char and first_char.isspace():
            first_char = f.read(1)
        f.seek(0)

        if first_char != "[":
            print("Detected JSONL format")
            for line in f:
                # Count parsed records, not physical lines, against the limit.
                if max_records and len(data) >= max_records:
                    break
                # Blank lines (e.g. trailing newlines) carry no record.
                if not line.strip():
                    continue
                data.append(json.loads(line))
        else:
            print("Detected single JSON array")
            full_data = json.load(f)
            data = full_data[:max_records] if max_records else full_data

    return pd.DataFrame(data)

> Загружаем первые **100 000 записей** файла (это не случайная выборка) для **EDA**:

In [19]:
# Load at most 100_000 records from DATA_PATH; the loader stops at max_records,
# so these are the leading records of the file rather than a random sample.
github_users_100k_df = load_large_json(DATA_PATH, max_records=100_000)
# Show the first 5 rows as the cell output.
github_users_100k_df.head(5)

Detected JSONL format


Unnamed: 0,hirable,public_repos,is_suspicious,updated_at,id,blog,followers,location,follower_list,type,commit_list,bio,commits,company,following_list,public_gists,name,created_at,email,following,login,repo_list
0,,0,False,2018-02-14 16:37:08,6611157,,0,,[],User,[],,0.0,,[],0,,2014-02-07 01:01:35,,0,lorraine94588,[]
1,,0,False,2017-10-02 17:43:51,32464022,,0,,[],User,[],,0.0,,[],0,,2017-10-02 17:43:51,,0,nourelddinayman,[]
2,,3,True,2016-02-28 05:00:58,12417299,,0,,,User,,,,,,0,,2015-05-12 19:35:16,,0,joseph5swa6rblo,
3,,1,True,2016-05-01 07:19:02,18867538,,0,,,User,,,,,,0,,2016-05-01 07:19:01,,0,aborebyg,
4,,0,False,2016-02-27 10:52:51,5343442,,0,,[],User,[],,0.0,,[],0,,2013-08-30 02:06:20,,0,Frapuchis,[]


> Общая информация:

In [20]:
# Print column names, non-null counts and dtypes of the loaded frame.
github_users_100k_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   hirable         2208 non-null    object 
 1   public_repos    100000 non-null  int64  
 2   is_suspicious   100000 non-null  bool   
 3   updated_at      100000 non-null  object 
 4   id              100000 non-null  int64  
 5   blog            100000 non-null  object 
 6   followers       100000 non-null  int64  
 7   location        8620 non-null    object 
 8   follower_list   78608 non-null   object 
 9   type            100000 non-null  object 
 10  commit_list     78608 non-null   object 
 11  bio             6381 non-null    object 
 12  commits         78608 non-null   float64
 13  company         4523 non-null    object 
 14  following_list  78608 non-null   object 
 15  public_gists    100000 non-null  int64  
 16  name            18595 non-null   object 
 17  created_at 

In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
github_users_100k_df.describe()

Unnamed: 0,public_repos,id,followers,commits,public_gists,following
count,100000.0,100000.0,100000.0,78608.0,100000.0,100000.0
mean,1.97259,17745250.0,0.4669,14.029844,0.16438,0.58078
std,12.210868,10016700.0,14.251944,61.675306,2.074161,12.526793
min,0.0,1120.0,0.0,0.0,0.0,0.0
25%,0.0,9144597.0,0.0,0.0,0.0,0.0
50%,0.0,17842270.0,0.0,0.0,0.0,0.0
75%,1.0,26416950.0,0.0,2.0,0.0,0.0
max,2341.0,34991420.0,3154.0,898.0,183.0,3366.0


> Проверка пропусков:

In [22]:
# Fraction of missing values per column, sorted from most to least missing.
github_users_100k_df.isna().mean().sort_values(ascending=False)


hirable           0.97792
company           0.95477
bio               0.93619
email             0.93354
location          0.91380
name              0.81405
commits           0.21392
commit_list       0.21392
following_list    0.21392
repo_list         0.21392
follower_list     0.21392
type              0.00000
public_repos      0.00000
id                0.00000
is_suspicious     0.00000
public_gists      0.00000
followers         0.00000
created_at        0.00000
blog              0.00000
following         0.00000
login             0.00000
updated_at        0.00000
dtype: float64