# 02. Извлечение признаков для кластеризации ролей

---

**Цель**: извлечь дополнительные признаки пользователей GitHub, убрать неиформативные поля, масштабировать признаки для последующей кластеризации ролей.

> Импорты:

In [3]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler


> Загрузка очищенного датасета:

In [4]:
github_users_100k_clean_df = pd.read_json("/Users/georgetarasov/Desktop/NIR/github-roles-abm/data/processed/github_users_100k_clean.json", lines=True)
github_users_100k_clean_df.head()

Unnamed: 0,public_repos,is_suspicious,updated_at,id,followers,follower_list,type,commit_list,commits,following_list,...,n_followers_list,n_following_list,is_bot,is_org,is_user,account_age_days,log_n_commits,log_n_repos,log_followers,log_following
0,0,False,2018-02-14 16:37:08,6611157,0,[],User,[],0,[],...,0,0,0,0,1,1468,0.0,0.0,0.0,0.0
1,0,False,2017-10-02 17:43:51,32464022,0,[],User,[],0,[],...,0,0,0,0,1,0,0.0,0.0,0.0,0.0
2,3,True,2016-02-28 05:00:58,12417299,0,,User,,0,,...,0,0,0,0,1,291,0.0,0.0,0.0,0.0
3,1,True,2016-05-01 07:19:02,18867538,0,,User,,0,,...,0,0,0,0,1,0,0.0,0.0,0.0,0.0
4,0,False,2016-02-27 10:52:51,5343442,0,[],User,[],0,[],...,0,0,0,0,1,911,0.0,0.0,0.0,0.0


> Добавляем дополнительные признаки:

In [5]:
# commits per repo
github_users_100k_clean_df['commits_per_repo'] = github_users_100k_clean_df['n_commits'] / (github_users_100k_clean_df['n_repos'] + 1)

# суммарные фичи по репозиториям с проверкой на None
github_users_100k_clean_df['repo_forks_sum'] = github_users_100k_clean_df['repo_list'].apply(
    lambda repos: sum(r['forks_count'] for r in repos) if isinstance(repos, list) else 0
)
github_users_100k_clean_df['repo_stars_sum'] = github_users_100k_clean_df['repo_list'].apply(
    lambda repos: sum(r['stargazers_count'] for r in repos) if isinstance(repos, list) else 0
)
github_users_100k_clean_df['repo_open_issues_sum'] = github_users_100k_clean_df['repo_list'].apply(
    lambda repos: sum(r['open_issues'] for r in repos) if isinstance(repos, list) else 0
)

# дни с последнего обновления
github_users_100k_clean_df['days_since_last_update'] = (
    github_users_100k_clean_df['updated_at'] - github_users_100k_clean_df['created_at']
).dt.days


> Удаляем сложные объекты (списки JSON):

In [6]:
cols_complex = [
    "follower_list",
    "commit_list",
    "following_list",
    "repo_list"
]

github_users_100k_final_df = github_users_100k_clean_df.drop(columns=cols_complex)
github_users_100k_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   public_repos            100000 non-null  int64         
 1   is_suspicious           100000 non-null  bool          
 2   updated_at              100000 non-null  datetime64[ns]
 3   id                      100000 non-null  int64         
 4   followers               100000 non-null  int64         
 5   type                    100000 non-null  object        
 6   commits                 100000 non-null  int64         
 7   public_gists            100000 non-null  int64         
 8   created_at              100000 non-null  datetime64[ns]
 9   following               100000 non-null  int64         
 10  n_commits               100000 non-null  int64         
 11  n_repos                 100000 non-null  int64         
 12  n_followers_list        100000 

> Удаляем поля, которые больше не нужны:

- **даты** заменили **account_age_days**;
- **тип** заменён на **is_user / is_org / is_bot**;
- **is_suspicious** нельзя использовать в кластеризации;

In [7]:
cols_remove = [
    "updated_at",
    "created_at",
    "type",
    "is_suspicious"
]

github_users_100k_final_df = github_users_100k_final_df.drop(columns=cols_remove)
github_users_100k_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   public_repos            100000 non-null  int64  
 1   id                      100000 non-null  int64  
 2   followers               100000 non-null  int64  
 3   commits                 100000 non-null  int64  
 4   public_gists            100000 non-null  int64  
 5   following               100000 non-null  int64  
 6   n_commits               100000 non-null  int64  
 7   n_repos                 100000 non-null  int64  
 8   n_followers_list        100000 non-null  int64  
 9   n_following_list        100000 non-null  int64  
 10  is_bot                  100000 non-null  int64  
 11  is_org                  100000 non-null  int64  
 12  is_user                 100000 non-null  int64  
 13  account_age_days        100000 non-null  int64  
 14  log_n_commits        

> Формируем финальный список признаков для кластеризации:

In [8]:
features = github_users_100k_final_df.select_dtypes(include=["int64", "float64"]).columns.tolist()
features

['public_repos',
 'id',
 'followers',
 'commits',
 'public_gists',
 'following',
 'n_commits',
 'n_repos',
 'n_followers_list',
 'n_following_list',
 'is_bot',
 'is_org',
 'is_user',
 'account_age_days',
 'log_n_commits',
 'log_n_repos',
 'log_followers',
 'log_following',
 'commits_per_repo',
 'repo_forks_sum',
 'repo_stars_sum',
 'repo_open_issues_sum',
 'days_since_last_update']

> Масштабирование признаков: 

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(github_users_100k_final_df[features])

X_scaled[:5]

array([[-0.16154461, -1.1115589 , -0.03276061, -0.20057976, -0.07925173,
        -0.04636326, -0.20057976, -0.15907945, -0.03243244, -0.08294329,
        -0.01303951, -0.27814677,  0.27850132,  1.64959566, -0.47359526,
        -0.60898349, -0.27507569, -0.27014843, -0.14253145, -0.03611306,
        -0.02978536, -0.04457656,  1.64959566],
       [-0.16154461,  1.469431  , -0.03276061, -0.20057976, -0.07925173,
        -0.04636326, -0.20057976, -0.15907945, -0.03243244, -0.08294329,
        -0.01303951, -0.27814677,  0.27850132, -0.79283183, -0.47359526,
        -0.60898349, -0.27507569, -0.27014843, -0.14253145, -0.03611306,
        -0.02978536, -0.04457656, -0.79283183],
       [ 0.0841394 , -0.53190965, -0.03276061, -0.20057976, -0.07925173,
        -0.04636326, -0.20057976, -0.15907945, -0.03243244, -0.08294329,
        -0.01303951, -0.27814677,  0.27850132, -0.30867216, -0.47359526,
        -0.60898349, -0.27507569, -0.27014843, -0.14253145, -0.03611306,
        -0.02978536, -0.0445

> Сохраняем подготовленный датасет:

In [10]:
np.save("/Users/georgetarasov/Desktop/NIR/github-roles-abm/data/processed/X_scaled.npy", X_scaled)
github_users_100k_final_df[features].to_csv("/Users/georgetarasov/Desktop/NIR/github-roles-abm/data/processed/features_for_clustering.csv", index=False)