<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, datasets
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
path = './data/'
emb_data_train = pd.read_csv('./data/node2vec_v2_train.csv.gz',compression = 'gzip',sep='\t')
emb_data_test = pd.read_csv('./data/node2vec_v2_test.csv.gz',compression = 'gzip',sep='\t')
emb_data_train= emb_data_train.drop(columns=['Unnamed: 0'])
emb_data_test= emb_data_test.drop(columns=['Unnamed: 0'])
emb_data_train.head()

Unnamed: 0,is_dorm,is_year,year_diff,from_high_school,to_high_school,from_major,to_major,is_faculty,is_gender,d0,...,d59,d6,d60,d61,d62,d63,d7,d8,d9,label
0,0,0,1.0,17819,50093,265,294,1,1,0.586327,...,0.080709,0.663903,0.521325,-1.218457,0.572862,8.867662,0.098672,0.072862,1.136375,0
1,0,1,0.0,10070,24562,51,60,1,1,1.556291,...,2.682853,-0.02294,-0.371618,-0.987022,1.831474,-0.448014,1.16799,2.401762,0.431279,0
2,0,1,0.0,1544,6122,14,39,1,1,-0.649554,...,6.857399,-0.348564,2.753245,-0.207652,1.89221,-0.977367,11.321891,2.510515,2.910005,1
3,0,0,2.196375,1894,50410,0,40,0,0,3.757138,...,5.611234,-1.006211,-1.872412,-0.861397,0.114388,-1.09661,-4.892152,-3.294941,0.133597,0
4,0,1,0.0,3535,21037,238,271,1,1,1.290842,...,0.82428,0.429348,2.083774,0.281935,-1.777484,0.258835,0.234619,-0.287869,2.079238,0


In [3]:
cols = ["is_dorm","is_year","year_diff", "from_high_school",
            "to_high_school", "from_major", "to_major", "is_faculty",
            "is_gender",'label']
X =  emb_data_train.drop(columns=cols).values
y = emb_data_train['label'].values

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X)
X = scaler.transform(X)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [7]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [50, 75, 100],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}

In [8]:
'''
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X, y)
grid_search.best_params_
'''

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 21.4min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed: 47.4min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 300}

In [19]:
params = {'bootstrap': True,
 'max_depth': 1000,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 10,
 'n_estimators': 700}

In [20]:
model = RandomForestClassifier(**params).fit(X,y)

In [21]:
X_test = emb_data_test.drop(columns=cols).values
y_test = emb_data_test[['label']].values
yhat = model.predict(X_test)
print(confusion_matrix(y_test, yhat))
print(classification_report(y_test, yhat))
print('ROC_AUC_SCORE:', roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))

[[8951 1421]
 [2102 8149]]
              precision    recall  f1-score   support

           0       0.81      0.86      0.84     10372
           1       0.85      0.79      0.82     10251

    accuracy                           0.83     20623
   macro avg       0.83      0.83      0.83     20623
weighted avg       0.83      0.83      0.83     20623

ROC_AUC_SCORE: 0.909892643359731


In [23]:
cols = ["year_diff", "from_high_school",
            "to_high_school", "from_major", "to_major", "is_faculty",
            "is_gender",'label']
X =  emb_data_train.drop(columns=cols).values
y = emb_data_train['label'].values

scaler = MinMaxScaler(feature_range = (0,1))
scaler.fit(X)
X = scaler.transform(X)
model = RandomForestClassifier(**params).fit(X,y)
X_test = emb_data_test.drop(columns=cols).values
y_test = emb_data_test[['label']].values
yhat = model.predict(X_test)
print(confusion_matrix(y_test, yhat))
print(classification_report(y_test, yhat))
print('ROC_AUC_SCORE:', roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))

MinMaxScaler(copy=True, feature_range=(0, 1))

[[8798 1574]
 [1783 8468]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84     10372
           1       0.84      0.83      0.83     10251

    accuracy                           0.84     20623
   macro avg       0.84      0.84      0.84     20623
weighted avg       0.84      0.84      0.84     20623

ROC_AUC_SCORE: 0.914195648347195
