# 01. Загрузка и первичная обработка датасета (EDA)

---

> Импорты и настройки:

In [2]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

# Display settings: never truncate columns, show at most 100 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

> Константы:

In [3]:
# Location of the raw dataset. Set the GITHUB_ROLES_DATA_PATH environment
# variable to run the notebook on another machine; when it is not set, the
# original local path is used unchanged.
DATA_PATH = os.environ.get(
    "GITHUB_ROLES_DATA_PATH",
    "/Users/georgetarasov/Desktop/NIR/github-roles-abm/data/raw/data.json",
)

> Загрузка датасета:

In [4]:
def load_large_json(path, max_records=None):
    """
    Load a JSON array file or a JSON Lines (JSONL) file into a DataFrame.

    The format is chosen from the first non-whitespace character of the file:
    "[" means a single JSON array, anything else is read as JSONL (one JSON
    document per line, blank lines are skipped).

    Parameters
    ----------
    path : str or path-like
        File to read. It is decoded as UTF-8; a leading BOM is tolerated.
    max_records : int, optional
        Maximum number of records to keep (e.g. 100000). ``None`` or ``0``
        means no limit.

    Returns
    -------
    pandas.DataFrame
        Frame built from the loaded records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank JSONL line, or the JSON array as a whole, is not valid JSON.
    """
    data = []

    # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM, which
    # would otherwise hide the "[" and make json.load() fail.
    with open(path, "r", encoding="utf-8-sig") as f:
        # Look past leading whitespace so an array that starts after a blank
        # line or indentation is still recognised as an array.
        first_char = ""
        for chunk in iter(lambda: f.read(4096), ""):
            stripped = chunk.lstrip()
            if stripped:
                first_char = stripped[0]
                break
        f.seek(0)

        if first_char != "[":
            print("Detected JSONL format")
            for line in f:
                # The limit counts loaded records, not physical lines.
                if max_records and len(data) >= max_records:
                    break
                # Blank lines (e.g. a trailing newline) are not records.
                if not line.strip():
                    continue
                data.append(json.loads(line))
        else:
            print("Detected single JSON array")
            full_data = json.load(f)
            data = full_data[:max_records] if max_records else full_data

    return pd.DataFrame(data)

> Загружаем **sample 100k** для **EDA**:

In [5]:
# Read only the first 100 000 records so the exploratory steps stay fast,
# then preview the first five rows.
github_users_100k_df = load_large_json(path=DATA_PATH, max_records=100000)
github_users_100k_df.head()

Detected JSONL format


Unnamed: 0,hirable,public_repos,is_suspicious,updated_at,id,blog,followers,location,follower_list,type,commit_list,bio,commits,company,following_list,public_gists,name,created_at,email,following,login,repo_list
0,,0,False,2018-02-14 16:37:08,6611157,,0,,[],User,[],,0.0,,[],0,,2014-02-07 01:01:35,,0,lorraine94588,[]
1,,0,False,2017-10-02 17:43:51,32464022,,0,,[],User,[],,0.0,,[],0,,2017-10-02 17:43:51,,0,nourelddinayman,[]
2,,3,True,2016-02-28 05:00:58,12417299,,0,,,User,,,,,,0,,2015-05-12 19:35:16,,0,joseph5swa6rblo,
3,,1,True,2016-05-01 07:19:02,18867538,,0,,,User,,,,,,0,,2016-05-01 07:19:01,,0,aborebyg,
4,,0,False,2016-02-27 10:52:51,5343442,,0,,[],User,[],,0.0,,[],0,,2013-08-30 02:06:20,,0,Frapuchis,[]


> Общая информация:

In [6]:
# Column names, non-null counts and dtypes of the freshly loaded sample.
github_users_100k_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   hirable         2208 non-null    object 
 1   public_repos    100000 non-null  int64  
 2   is_suspicious   100000 non-null  bool   
 3   updated_at      100000 non-null  object 
 4   id              100000 non-null  int64  
 5   blog            100000 non-null  object 
 6   followers       100000 non-null  int64  
 7   location        8620 non-null    object 
 8   follower_list   78608 non-null   object 
 9   type            100000 non-null  object 
 10  commit_list     78608 non-null   object 
 11  bio             6381 non-null    object 
 12  commits         78608 non-null   float64
 13  company         4523 non-null    object 
 14  following_list  78608 non-null   object 
 15  public_gists    100000 non-null  int64  
 16  name            18595 non-null   object 
 17  created_at 

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) of the sample.
github_users_100k_df.describe()

Unnamed: 0,public_repos,id,followers,commits,public_gists,following
count,100000.0,100000.0,100000.0,78608.0,100000.0,100000.0
mean,1.97259,17745250.0,0.4669,14.029844,0.16438,0.58078
std,12.210868,10016700.0,14.251944,61.675306,2.074161,12.526793
min,0.0,1120.0,0.0,0.0,0.0,0.0
25%,0.0,9144597.0,0.0,0.0,0.0,0.0
50%,0.0,17842270.0,0.0,0.0,0.0,0.0
75%,1.0,26416950.0,0.0,2.0,0.0,0.0
max,2341.0,34991420.0,3154.0,898.0,183.0,3366.0


> Проверка пропусков:

In [8]:
# Share of missing values per column, largest first.
(
    github_users_100k_df
    .isna()
    .mean()
    .sort_values(ascending=False)
)


hirable           0.97792
company           0.95477
bio               0.93619
email             0.93354
location          0.91380
name              0.81405
commits           0.21392
commit_list       0.21392
following_list    0.21392
repo_list         0.21392
follower_list     0.21392
type              0.00000
public_repos      0.00000
id                0.00000
is_suspicious     0.00000
public_gists      0.00000
followers         0.00000
created_at        0.00000
blog              0.00000
following         0.00000
login             0.00000
updated_at        0.00000
dtype: float64

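> Удалим малоинформативные поля: те, где доля пропусков превышает 80%, а также поля **login** и **blog** (пропусков в них нет, но в дальнейшем анализе они не используются):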

In [9]:
# Columns removed from the sample before feature engineering.
drop_cols = ["hirable", "company", "bio", "email", "name", "location", "login", "blog"]
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
github_users_100k_df = github_users_100k_df.drop(columns=drop_cols, errors="ignore")

> Приведение типов:

In [10]:
# Parse both timestamp columns as datetimes; unparseable values become NaT.
for date_col in ("created_at", "updated_at"):
    github_users_100k_df[date_col] = pd.to_datetime(
        github_users_100k_df[date_col], errors="coerce"
    )

> Проверяем структуру **commit_list**:

In [11]:
# Take the first row whose commit_list is present and non-empty, then show
# its first commit record to inspect the available fields.
commit_lists = github_users_100k_df["commit_list"].dropna()
commit_list_is_filled = commit_lists.map(len) > 0
sample_nonempty = commit_lists[commit_list_is_filled].iloc[0]

sample_nonempty[0]

{'repo_id': 98311519,
 'repo_owner_id': 30438508,
 'commit_at': '2017-07-25 10:56:08.000-03:00',
 'committer_id': 30438508,
 'message': 'Set theme jekyll-theme-merlot',
 'repo_description': None,
 'generate_at': '2017-07-25 10:56:08.000-03:00',
 'author_id': 30438508,
 'repo_name': 'MelLobo/Fonte'}

> Проверка структуры **repo_list**:

In [12]:
# Take the first row whose repo_list is present and non-empty, then show
# its first repository record to inspect the available fields.
repo_lists = github_users_100k_df["repo_list"].dropna()
repo_list_is_filled = repo_lists.map(len) > 0
sample_nonempty = repo_lists[repo_list_is_filled].iloc[0]

sample_nonempty[0]

{'fork': False,
 'license': None,
 'has_wiki': True,
 'description': None,
 'language': None,
 'default_branch': 'master',
 'created_at': '2017-08-23 05:03:43',
 'forks_count': 0,
 'updated_at': '2017-08-23 05:03:43',
 'pushed_at': '2017-08-23 05:03:44',
 'full_name': 'turkimama/BOT-ME',
 'open_issues': 0,
 'stargazers_count': 0,
 'owner_id': 31267942,
 'id': 101139652,
 'size': 0}

> Проверка типа аккаунта:

In [13]:
# Number of accounts per value of the "type" column.
github_users_100k_df["type"].value_counts()

type
User            92802
Organization     7181
Bot                17
Name: count, dtype: int64

> Добавим базовые количественные признаки:

In [14]:
# Length of each list-valued column; entries that are not lists (missing
# values) count as 0.
list_size_columns = {
    "n_commits": "commit_list",
    "n_repos": "repo_list",
    "n_followers_list": "follower_list",
    "n_following_list": "following_list",
}
for count_col, list_col in list_size_columns.items():
    github_users_100k_df[count_col] = github_users_100k_df[list_col].apply(
        lambda items: len(items) if isinstance(items, list) else 0
    )


> Приводим числовые поля к int:

In [15]:
# Missing values are replaced with 0 first, because NaN cannot be cast to int.
for col in ("commits",):
    filled = github_users_100k_df[col].fillna(0)
    github_users_100k_df[col] = filled.astype(int)

> Добавим также бинарные флаги ролей:

In [16]:
# One 0/1 indicator column per value of the "type" column.
account_type = github_users_100k_df["type"]
for flag_col, type_name in (
    ("is_bot", "Bot"),
    ("is_org", "Organization"),
    ("is_user", "User"),
):
    github_users_100k_df[flag_col] = account_type.eq(type_name).astype(int)

> Добавим признак возраста аккаунта (число дней между **created_at** и **updated_at**):

In [17]:
# Whole days between created_at and updated_at (not measured against today).
created_to_updated = github_users_100k_df["updated_at"].sub(
    github_users_100k_df["created_at"]
)
github_users_100k_df["account_age_days"] = created_to_updated.dt.days

> Лог-преобразования для **heavy-tailed** распределений:

In [18]:
# log(1 + x) keeps zero counts at 0 while compressing the long right tail.
heavy_tailed_columns = ("n_commits", "n_repos", "followers", "following")
for col in heavy_tailed_columns:
    log_values = np.log1p(github_users_100k_df[col])
    github_users_100k_df["log_" + col] = log_values

### EDA Summary

---

In [19]:
# Headline numbers for the sample. The "pct_*" entries are fractions in
# [0, 1] (means of boolean / 0-1 columns), not percentages.
commit_counts = github_users_100k_df["n_commits"]
repo_counts = github_users_100k_df["n_repos"]

eda_summary = {
    "n_users": github_users_100k_df.shape[0],
    "n_commits_total": commit_counts.sum(),
    "n_repos_total": repo_counts.sum(),
    "pct_with_commits": commit_counts.gt(0).mean(),
    "pct_with_repos": repo_counts.gt(0).mean(),
    "pct_bots": github_users_100k_df["is_bot"].mean(),
    "pct_orgs": github_users_100k_df["is_org"].mean(),
}

eda_summary

{'n_users': 100000,
 'n_commits_total': np.int64(1102858),
 'n_repos_total': np.int64(191567),
 'pct_with_commits': np.float64(0.25421),
 'pct_with_repos': np.float64(0.36682),
 'pct_bots': np.float64(0.00017),
 'pct_orgs': np.float64(0.07181)}

### Промежуточные выводы

---

- **n_users** = 100000 - конечное число GitHub-профилей;
- **n_commits_total ≈ 1.1 млн** - совокупное число коммитов по всем пользователям;
- **n_repos_total ≈ 191k** - суммарное число репозиториев в **repo_list** пользователей выборки;
- **pct_with_commits ≈ 25%** - только 1/4 пользователей имеют хотя бы 1 коммит в датасете;
- **pct_with_repos ≈ 37%** - только 37% имеют собственные репозитории;
- **pct_bots ≈ 0.017%** - крайне малый процент ботов;
- **pct_orgs ≈ 7.2%** - значимая доля организаций;

---

> Повторно выведем сведения о предобработанном датафрейме:

In [20]:
# Column names, non-null counts and dtypes after the preprocessing steps above.
github_users_100k_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 26 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   public_repos      100000 non-null  int64         
 1   is_suspicious     100000 non-null  bool          
 2   updated_at        100000 non-null  datetime64[ns]
 3   id                100000 non-null  int64         
 4   followers         100000 non-null  int64         
 5   follower_list     78608 non-null   object        
 6   type              100000 non-null  object        
 7   commit_list       78608 non-null   object        
 8   commits           100000 non-null  int64         
 9   following_list    78608 non-null   object        
 10  public_gists      100000 non-null  int64         
 11  created_at        100000 non-null  datetime64[ns]
 12  following         100000 non-null  int64         
 13  repo_list         78608 non-null   object        
 14  n_com

In [21]:
# Summary statistics after preprocessing, now including the derived features.
github_users_100k_df.describe()

Unnamed: 0,public_repos,updated_at,id,followers,commits,public_gists,created_at,following,n_commits,n_repos,n_followers_list,n_following_list,is_bot,is_org,is_user,account_age_days,log_n_commits,log_n_repos,log_followers,log_following
count,100000.0,100000,100000.0,100000.0,100000.0,100000.0,100000,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,1.97259,2017-01-29 00:45:00.200839680,17745250.0,0.4669,11.02858,0.16438,2015-10-10 03:13:37.746049792,0.58078,11.02858,1.91567,0.46212,0.53325,0.00017,0.07181,0.92802,476.52474,0.622677,0.488134,0.11561,0.124803
min,0.0,2012-11-23 21:43:38,1120.0,0.0,0.0,0.0,2008-02-27 18:00:37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2016-02-28 09:58:38,9144597.0,0.0,0.0,0.0,2014-10-11 01:42:40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,2017-03-01 00:25:05,17842270.0,0.0,0.0,0.0,2016-03-15 03:30:50.500000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,227.0,0.0,0.0,0.0,0.0
75%,1.0,2017-11-27 07:34:45.500000,26416950.0,0.0,1.0,0.0,2017-03-14 18:32:52.249999872,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,792.0,0.693147,0.693147,0.0,0.0
max,2341.0,2018-07-09 18:00:24,34991420.0,3154.0,898.0,183.0,2017-12-31 23:23:42,3366.0,898.0,2341.0,3154.0,650.0,1.0,1.0,1.0,3700.0,6.801283,7.758761,8.056744,8.121777
std,12.210868,,10016700.0,14.251944,54.983789,2.074161,,12.526793,54.983789,12.042282,14.248767,6.429123,0.013037,0.258174,0.258456,601.044389,1.314793,0.80156,0.420287,0.461981


### Итоги EDA

---
Получили таблицу с 26 признаками, которые включают:

- **временные признаки** (created_at, updated_at);
- **лог-признаки активности** (log_n_commits, log_n_repos…);
- **бинарные типы** (is_bot, is_org, is_user);
- **raw counts** (followers, following, commits);
- **списки артефактов** (commit_list, repo_list);
- **derived counts** (n_repos, n_commits);
- **возраст аккаунта** (account_age_days);

---

Структура данных подготовлена:

- даты очищены;
- пропуски в **commits** заполнены нулями (списковые поля по-прежнему содержат пропуски);
- **heavy-tail** смягчены логами;
- артефакты проверены (commit_list, repo_list);
- малоинформативные признаки удалены;

---

> Сохраняем **sample 100k** для **EDA**:

In [22]:
# Persist the processed sample as JSON Lines (one record per line).
github_users_100k_df.to_json(
    "/Users/georgetarasov/Desktop/NIR/github-roles-abm/data/processed/github_users_100k_clean.json",
    orient="records",
    lines=True,
)
