<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [2]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition, datasets
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os

from IPython.core.interactiveshell import InteractiveShell
# Render the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Directory that holds the input files; the loads below are built from it so the
# location only has to be changed in one place.
path = './data/'

# Train/test feature tables: gzip-compressed, tab-separated text files.
emb_data_train = pd.read_csv(
    os.path.join(path, 'node2vec_v2_train.csv.gz'), compression='gzip', sep='\t'
)
emb_data_test = pd.read_csv(
    os.path.join(path, 'node2vec_v2_test.csv.gz'), compression='gzip', sep='\t'
)

# 'Unnamed: 0' is not a feature (presumably the index written out when the files
# were saved -- confirm against the export step), so it is removed from both frames.
emb_data_train = emb_data_train.drop(columns=['Unnamed: 0'])
emb_data_test = emb_data_test.drop(columns=['Unnamed: 0'])
emb_data_train.head()

Unnamed: 0,is_dorm,is_year,year_diff,from_high_school,to_high_school,from_major,to_major,is_faculty,is_gender,d0,...,d59,d6,d60,d61,d62,d63,d7,d8,d9,label
0,0,0,1.0,17819,50093,265,294,1,1,0.586327,...,0.080709,0.663903,0.521325,-1.218457,0.572862,8.867662,0.098672,0.072862,1.136375,0
1,0,1,0.0,10070,24562,51,60,1,1,1.556291,...,2.682853,-0.02294,-0.371618,-0.987022,1.831474,-0.448014,1.16799,2.401762,0.431279,0
2,0,1,0.0,1544,6122,14,39,1,1,-0.649554,...,6.857399,-0.348564,2.753245,-0.207652,1.89221,-0.977367,11.321891,2.510515,2.910005,1
3,0,0,2.196375,1894,50410,0,40,0,0,3.757138,...,5.611234,-1.006211,-1.872412,-0.861397,0.114388,-1.09661,-4.892152,-3.294941,0.133597,0
4,0,1,0.0,3535,21037,238,271,1,1,1.290842,...,0.82428,0.429348,2.083774,0.281935,-1.777484,0.258835,0.234619,-0.287869,2.079238,0


In [4]:
# Hyperparameter search space: a list of two alternative grids. The 'classifier'
# key swaps in the estimator and the 'classifier__*' keys list the values to try
# on it (presumably for a Pipeline step named 'classifier' -- the pipeline itself
# is not defined in this cell).
param_grid = [
    # Logistic regression: L1 and L2 penalties over 20 log-spaced values of C
    # between 1e-4 and 1e4, using the liblinear solver.
    dict(
        classifier=[LogisticRegression()],
        classifier__penalty=['l1', 'l2'],
        classifier__C=np.logspace(-4, 4, num=20),
        classifier__solver=['liblinear'],
    ),
    # Random forest: 10, 20, ..., 100 trees and max_features of 6, 11, ..., 31.
    dict(
        classifier=[RandomForestClassifier()],
        classifier__n_estimators=[10 * step for step in range(1, 11)],
        classifier__max_features=[6 + 5 * step for step in range(6)],
    ),
]

In [5]:
def LogReg(data,test,cols):
    """Fit a logistic regression on ``data`` and print its scores on ``test``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame. Must contain a 'label' column plus the feature columns.
    test : pandas.DataFrame
        Evaluation frame with the same columns as ``data``.
    cols : iterable of str
        Names of columns to exclude from the feature matrix. 'label' is always
        excluded as well; the caller's object is left unmodified.

    Returns
    -------
    None
        The confusion matrix, the classification report and the ROC AUC
        (computed from ``decision_function`` scores) are printed.
    """
    # Build a fresh list instead of appending to `cols`: the caller's list must
    # not grow a new 'label' entry on every call.
    drop_cols = [name for name in cols if name != 'label'] + ['label']

    X = data.drop(columns=drop_cols).values
    y = data['label'].values
    X_test = test.drop(columns=drop_cols).values
    y_test = test['label'].values

    # Fit the scaler on the training features only, then apply the SAME
    # transform to the test features so the model is evaluated on inputs in the
    # range it was trained on.
    scaler = MinMaxScaler(feature_range = (0,1))
    scaler.fit(X)
    X = scaler.transform(X)
    X_test = scaler.transform(X_test)

    model = LogisticRegression().fit(X, y)
    yhat = model.predict(X_test)
    print(confusion_matrix(y_test, yhat))
    print(classification_report(y_test, yhat))
    print('ROC_AUC_SCORE:',roc_auc_score(y_test, model.decision_function(X_test)))

In [6]:
# Tab-separated training table, read from the data directory held in `path`
# (set in the earlier loading cell) rather than a repeated literal.
raw_data = pd.read_csv(os.path.join(path, 'topological_train.data'), sep='\t')

In [7]:
# Display a column subset of raw_data: all rows, the nine attribute columns below.
raw_data.loc[:, [
    "is_dorm",
    "is_year",
    "year_diff",
    "from_high_school",
    "to_high_school",
    "from_major",
    "to_major",
    "is_faculty",
    "is_gender",
]]

Unnamed: 0,preferential,jaccard,adamic adar,resource allocation,is_dorm,is_year,year_diff,from_high_school,to_high_school,from_major,to_major,is_faculty,is_gender,label
0,58158.0,0.132609,472.561963,14701.0,0.0,1.0,0.000000,7676.0,9264.0,53.0,106.0,0.0,1.0,1.0
1,9918.0,0.010050,16.354318,732.0,0.0,0.0,1.803625,0.0,17107.0,0.0,0.0,1.0,1.0,0.0
2,10058.0,0.041451,53.649231,1032.0,0.0,0.0,1.000000,20895.0,50095.0,246.0,269.0,1.0,0.0,1.0
3,10728.0,0.000000,0.000000,0.0,0.0,0.0,1.000000,0.0,53129.0,1.0,6.0,0.0,0.0,0.0
4,214.0,0.000000,0.000000,0.0,0.0,0.0,1.000000,0.0,16281.0,135.0,160.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65987,29463.0,0.014749,37.109100,964.0,0.0,0.0,1.000000,0.0,17773.0,2.0,26.0,1.0,0.0,0.0
65988,35237.0,0.196203,453.973906,10538.0,1.0,1.0,0.000000,3754.0,5725.0,255.0,274.0,1.0,1.0,1.0
65989,6174.0,0.000000,0.000000,0.0,0.0,0.0,3.000000,25037.0,50221.0,0.0,254.0,1.0,1.0,0.0
65990,35763.0,0.043928,138.998594,6074.0,0.0,1.0,0.000000,1800.0,8939.0,20.0,60.0,1.0,1.0,1.0
