In [1]:
from src.utils.data import download_data, make_preprocessed_edges_file, split_data, load_data, run_cleora
from src.pipelines.model import model_pipeline
from src.pipelines.data import DataPipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from pathlib import Path
import json 
import pandas as pd

In [2]:
embeddings_pipeline = DataPipeline([
        ("Download data", download_data),
        ("Preprocess edges", make_preprocessed_edges_file),
        ("Run Cleora", run_cleora)
    ])
cleora = embeddings_pipeline.run()

[Data pipeline] Running step: Download data
Data already exists.
[Data pipeline] Running step: Preprocess edges
[Data pipeline] Running step: Run Cleora


In [3]:
# load JSON data into a DataFrame
with open("data/git_web_ml/musae_git_features.json") as json_file:
    features = json.load(json_file)  # node features

# convert the JSON data to a DataFrame
df = pd.DataFrame.from_dict(features, orient="index").reset_index()
df = df.fillna(0)
df = df.map(int)
df.columns = ['node'] + [f'feature_{i}' for i in range(len(df.columns) - 1)]
df = df.drop(columns=['node']) # same as index

print(df.head())  # preview the DataFrame
print(df.shape)

   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0       1574       3773       3571       2672       2478       2534   
1       1193        376         73        290       3129       1852   
2       1574       3773        925       1728       2815       2963   
3       3964       3773       4003        928       1852       3077   
4       1929       3773       1793       3511       1290       3129   

   feature_6  feature_7  feature_8  feature_9  ...  feature_32  feature_33  \
0       3129       3077       1171       2045  ...           0           0   
1       3077       1171       1022       2045  ...           0           0   
2       3077        364       1171        536  ...           0           0   
3        364       1022       3763       2045  ...           0           0   
4       3077        364       1171       1022  ...           0           0   

   feature_34  feature_35  feature_36  feature_37  feature_38  feature_39  \
0           0           0  

In [4]:
data_pipeline = DataPipeline([
    ("Download data", download_data),
    ("Split data", split_data),
    ("Load Data", load_data)
])
X_train, X_test, y_train, y_test = data_pipeline.run()

[Data pipeline] Running step: Download data
Data already exists.
[Data pipeline] Running step: Split data
(30160, 3)
          id       name  ml_target
4773    4773      Nsima          0
36287  36287  jaycech3n          0
24241  24241  davendw49          1
27463  27463   mccordgh          0
6815    6815    oalders          0
[Data pipeline] Running step: Load Data


In [5]:
X_train.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4773,-0.007101,-0.061292,0.044918,0.053346,0.003608,-0.034036,0.093728,0.083397,-0.016932,0.092219,...,0.127293,0.038404,-0.090021,-0.113659,0.088267,-0.023457,-0.145749,0.013836,-0.036559,0.036221
36287,-0.01347,-0.023871,-0.101616,-0.035858,-0.093713,0.036922,-0.018288,0.098294,-0.019029,-0.009273,...,0.115829,-0.101206,-0.04722,-0.079582,0.108678,0.100546,-0.151835,0.0168,-0.055667,-0.019695
24241,0.100243,-0.014397,-0.065657,0.134193,-0.11095,0.057213,0.160921,0.051001,0.041495,-0.035548,...,-0.022779,-0.114507,-0.032809,0.042566,0.034716,0.121488,0.044444,-0.044879,0.158559,-0.11056
27463,-0.050016,-0.036012,0.028089,-0.035937,-0.154578,0.171846,-0.105659,0.175504,-0.033523,0.134867,...,-0.017195,0.19738,0.01918,-0.245719,-0.022977,0.057381,0.036812,-0.023235,0.021482,-0.014516
6815,0.032808,-0.155245,0.054059,0.012858,-0.019109,0.069628,-0.006147,0.01851,-0.070596,0.036902,...,0.029314,-0.009687,0.137806,-0.068026,0.096043,-0.047612,-0.093306,-0.018924,-0.112289,-0.032182


In [None]:
# take first 10 features for each node
X_train = X_train.merge(df.iloc[:,0:10], left_index=True, right_index=True)
X_test = X_test.merge(df.iloc[:,0:10], left_index=True, right_index=True)
print(X_train.head())

              0         1         2         3         4         5         6  \
4773  -0.007101 -0.061292  0.044918  0.053346  0.003608 -0.034036  0.093728   
36287 -0.013470 -0.023871 -0.101616 -0.035858 -0.093713  0.036922 -0.018288   
24241  0.100243 -0.014397 -0.065657  0.134193 -0.110950  0.057213  0.160921   
27463 -0.050016 -0.036012  0.028089 -0.035937 -0.154578  0.171846 -0.105659   
6815   0.032808 -0.155245  0.054059  0.012858 -0.019109  0.069628 -0.006147   

              7         8         9  ...  feature_0  feature_1  feature_2  \
4773   0.083397 -0.016932  0.092219  ...       3730       1330        277   
36287  0.098294 -0.019029 -0.009273  ...       1929       1330       1793   
24241  0.051001  0.041495 -0.035548  ...       3730       1033       1793   
27463  0.175504 -0.033523  0.134867  ...       3730        509       3571   
6815   0.018510 -0.070596  0.036902  ...       2048       2325       3214   

       feature_3  feature_4  feature_5  feature_6  feature_7  

In [9]:
models = {
    "KNeighbors": KNeighborsClassifier(n_neighbors=7), 
    "DecisionTree": DecisionTreeClassifier() 
}

for model_name, model in models.items():
        print(f"\nTraining pipeline with {model_name}...")
        pipeline = model_pipeline(model)
        
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        
        # evaluate model performance
        print(f"\n{model_name} Model Evaluation:")
        print(f"Accuracy: {pipeline.score(X_test, y_test):.4f}")
        print(classification_report(y_test, y_pred))


Training pipeline with KNeighbors...
[Pipeline] . (step 1 of 2) Processing cosine_similarity, total=  26.5s
[Pipeline] ............... (step 2 of 2) Processing knn, total=   2.7s

KNeighbors Model Evaluation:
Accuracy: 0.8170
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      5620
           1       0.68      0.53      0.59      1920

    accuracy                           0.82      7540
   macro avg       0.77      0.72      0.74      7540
weighted avg       0.81      0.82      0.81      7540


Training pipeline with DecisionTree...
[Pipeline] ........ (step 1 of 1) Processing classifier, total=  13.7s

DecisionTree Model Evaluation:
Accuracy: 0.7154
              precision    recall  f1-score   support

           0       0.81      0.80      0.81      5620
           1       0.44      0.46      0.45      1920

    accuracy                           0.72      7540
   macro avg       0.63      0.63      0.63      7540
weighted avg  

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

for model_name, model in models.items():
        print(f"\nTraining pipeline with {model_name}...")
        pipeline = model_pipeline(model)
        
        pipeline.fit(X_train_scaled, y_train)
        y_pred = pipeline.predict(X_test_scaled)
        
        # evaluate model performance
        print(f"\n{model_name} Model Evaluation:")
        print(f"Accuracy: {pipeline.score(X_test_scaled, y_test):.4f}")
        print(classification_report(y_test, y_pred))


Training pipeline with KNeighbors...
[Pipeline] . (step 1 of 2) Processing cosine_similarity, total=  34.2s
[Pipeline] ............... (step 2 of 2) Processing knn, total=   2.4s

KNeighbors Model Evaluation:
Accuracy: 0.8214
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      5620
           1       0.69      0.54      0.61      1920

    accuracy                           0.82      7540
   macro avg       0.77      0.73      0.75      7540
weighted avg       0.81      0.82      0.81      7540


Training pipeline with DecisionTree...
[Pipeline] ........ (step 1 of 1) Processing classifier, total=  14.4s

DecisionTree Model Evaluation:
Accuracy: 0.7208
              precision    recall  f1-score   support

           0       0.82      0.80      0.81      5620
           1       0.45      0.48      0.46      1920

    accuracy                           0.72      7540
   macro avg       0.64      0.64      0.64      7540
weighted avg  