In [0]:
import networkx as nx
import pandas as pd
import numpy as np

In [17]:
# Load the training file into the Colab runtime.
# This cell only works inside Google Colab (it imports google.colab); the value
# returned by files.upload() is kept in `uploaded` but is not read again in the
# cells shown.
# NOTE(review): the recorded output says the upload was saved as
# 'df_train (1).csv', while a later cell reads 'df_train.csv' -- confirm the
# copy being loaded is the intended one.
from google.colab import files
uploaded = files.upload()

Saving df_train.csv to df_train (1).csv


#### Evaluation metrics definitions:

In [0]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def evaluate_model(predictions, actual):
    """Score predictions against the true labels with four classification metrics.

    Args:
        predictions: Predicted labels.
        actual: True labels; passed as the first argument to each scorer.

    Returns:
        pd.DataFrame with columns 'metric' and 'value', holding one row each
        for accuracy, precision, recall and F1, in that order.
    """
    # (display name, scorer) pairs, evaluated in the order they are listed.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("F1", f1_score),
    )
    names = [name for name, _ in scorers]
    scores = [scorer(actual, predictions) for _, scorer in scorers]
    return pd.DataFrame(data={'metric': names, 'value': scores})

def feature_importance(columns, classifier):
    """Tabulate a fitted classifier's feature importances, largest first.

    Args:
        columns: Feature names, paired positionally with the importances.
        classifier: Object exposing a `feature_importances_` sequence.

    Returns:
        pd.DataFrame with columns 'feature' and 'value', ordered by
        decreasing importance; ties keep their original relative order.
    """
    # Sorting on the negated score gives a stable descending order.
    ranked = sorted(
        zip(columns, classifier.feature_importances_),
        key=lambda pair: pair[1] * -1,
    )
    names, scores = [], []
    for name, score in ranked:
        names.append(name)
        scores.append(score)
    return pd.DataFrame(data={'feature': names, 'value': scores})

In [0]:
# Names of the similarity-score features (presumably columns of df_train.csv --
# TODO confirm). The list is only referenced by the disabled split below.
columns = [
    "Jaccard",
    "Adamic-Adar",
    "Preferential Attachment",
    "Resource Allocation",
    "Common Neighbors",
    "Salton Index",
    "Sorensen Index",
]

# Disabled hold-out split, kept for reference. It refers to `df_train`, which
# no visible cell defines (the data is loaded as `train` further down).
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(df_train[columns], df_train["label"], test_size=0.25, random_state=42)


In [0]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics   #Additional scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

# Column roles used by the cells below: the label to predict and the row id.
target = 'label'
IDcol = 'ID'

# Read the CSV, keep only its first 50 rows, and rename the 'Unnamed: 0'
# column to 'ID'. Built as one chain so re-running the cell always starts from
# the file on disk.
train = (
    pd.read_csv('df_train.csv')
    .iloc[:50]
    .rename(columns={'Unnamed: 0': 'ID'})
)

In [0]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             plot_importance=True):
    """Fit an XGBoost classifier on `dtrain` and print a training-set report.

    The label column is taken from the module-level `target` name everywhere
    (the cross-validation step and the final fit previously disagreed on
    whether to use `target` or the literal 'label').

    Args:
        alg: XGBClassifier instance; it is modified in place (n_estimators may
            be overwritten, then the model is fitted).
        dtrain: DataFrame holding the predictor columns and the label column.
        predictors: List of column names used as features.
        useTrainCV: When True, run xgb.cv with AUC and early stopping first and
            set alg's n_estimators to the number of rounds it returns.
        cv_folds: Number of folds passed to xgb.cv.
        early_stopping_rounds: Early-stopping patience passed to xgb.cv.
        plot_importance: When True, draw a bar chart of the booster's feature
            scores after the report.

    Returns:
        None. The report is printed and, optionally, a chart is drawn.
    """
    features = dtrain[predictors]
    labels = dtrain[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round kept by early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # eval_metric is not passed here: with no eval_set there is nothing to
    # evaluate, and recent XGBoost releases no longer accept it in fit().
    alg.fit(features, labels)

    # Predict on the training set (class labels and positive-class probability).
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report (these are training-set scores, not held-out ones).
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    if plot_importance:
        # The fitted booster is reached through get_booster(); the older
        # alg.booster() call is what made the original plot fail.
        feat_imp = pd.Series(alg.get_booster().get_fscore(), dtype=float).sort_values(ascending=False)
        # get_fscore() only lists features used in a split; skip the chart
        # when there are none, since plotting an empty Series raises.
        if not feat_imp.empty:
            feat_imp.plot(kind='bar', title='Feature Importances')
            plt.ylabel('Feature Importance Score')

In [22]:
# Start tuning with a fixed learning rate; modelfit's CV step then chooses
# n_estimators (1000 is only the upper bound on boosting rounds).
# Every column except the label and the ID is used as a predictor.
predictors = [col for col in train.columns if col not in (target, IDcol)]

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb1, train, predictors)


Model Report
Accuracy : 0.86
AUC Score (Train): 0.978896


In [24]:
# Tuning max_depth and min_child_weight (coarse grid, 5-fold CV scored by ROC AUC).
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch1.fit(train[predictors], train[target])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

({'mean_fit_time': array([0.23044214, 0.12444515, 0.12887201, 0.21203952, 0.15985084,
         0.13978882, 0.22712865, 0.14875312, 0.13728571, 0.21374626,
         0.14005275, 0.12655845]),
  'mean_score_time': array([0.01123705, 0.0172255 , 0.01596203, 0.0134676 , 0.01211843,
         0.01498151, 0.01150451, 0.01397653, 0.01391273, 0.01304398,
         0.0122098 , 0.01360865]),
  'mean_test_score': array([0.82666667, 0.9075    , 0.5       , 0.82666667, 0.9075    ,
         0.5       , 0.82666667, 0.9075    , 0.5       , 0.82666667,
         0.9075    , 0.5       ]),
  'param_max_depth': masked_array(data=[3, 3, 3, 5, 5, 5, 7, 7, 7, 9, 9, 9],
               mask=[False, False, False, False, False, False, False, False,
                     False, False, False, False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[1, 3, 5, 1, 3, 5, 1, 3, 5, 1, 3, 5],
               mask=[False, False, False, False, False, False, False, False,
       

For this dataset, the grid search found 3 to be the best value for both max_depth and min_child_weight. Let's refine the search by trying values around 3.

In [26]:
# Finer grid for max_depth and min_child_weight, one step either side of 3.
param_test2 = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [2, 3, 4],
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch2.fit(train[predictors], train[target])
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

({'mean_fit_time': array([0.15983968, 0.14279075, 0.14694095, 0.15263138, 0.14265232,
         0.15118504, 0.15282001, 0.14439664, 0.1291657 ]),
  'mean_score_time': array([0.01612682, 0.01030245, 0.00809226, 0.01105204, 0.01332116,
         0.01433845, 0.01287227, 0.01396103, 0.00825243]),
  'mean_test_score': array([0.89333333, 0.9075    , 0.91666667, 0.89333333, 0.9075    ,
         0.91666667, 0.89333333, 0.9075    , 0.91666667]),
  'param_max_depth': masked_array(data=[2, 2, 2, 3, 3, 3, 4, 4, 4],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_min_child_weight': masked_array(data=[2, 3, 4, 2, 3, 4, 2, 3, 4],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'params': [{'max_depth': 2, 'min_child_weight': 2},
   {'max_depth': 2, 'min_child_weight': 3},


Here we get a better score with max_depth = 2 and min_child_weight = 4.

In [27]:
# Tuning gamma, with max_depth and min_child_weight fixed at the values chosen above.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],  # 0.0, 0.1, ..., 0.4
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=2, min_child_weight=4,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch3.fit(train[predictors], train[target])
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

({'mean_fit_time': array([0.13593616, 0.12583327, 0.13031216, 0.12242122, 0.12160201]),
  'mean_score_time': array([0.01503401, 0.02146525, 0.01061621, 0.01140809, 0.00956979]),
  'mean_test_score': array([0.91666667, 0.91666667, 0.91666667, 0.91666667, 0.91666667]),
  'param_gamma': masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'gamma': 0.0},
   {'gamma': 0.1},
   {'gamma': 0.2},
   {'gamma': 0.3},
   {'gamma': 0.4}],
  'rank_test_score': array([1, 1, 1, 1, 1], dtype=int32),
  'split0_test_score': array([0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667]),
  'split1_test_score': array([0.91666667, 0.91666667, 0.91666667, 0.91666667, 0.91666667]),
  'split2_test_score': array([0.875, 0.875, 0.875, 0.875, 0.875]),
  'split3_test_score': array([1., 1., 1., 1., 1.]),
  'split4_test_score': array([0.925, 0.925, 0.925, 0.925, 0.925]),
  'std_fit_time': array([

All the gamma values tested give the same score, so we keep gamma = 0.

In [29]:
# Tuning subsample and colsample_bytree (coarse grid: 0.6, 0.7, 0.8, 0.9 for each).
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=177, max_depth=2, min_child_weight=4,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch4.fit(train[predictors], train[target])
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

({'mean_fit_time': array([0.16031528, 0.16993542, 0.17389369, 0.16641212, 0.16845059,
         0.16234264, 0.16330171, 0.17241464, 0.15590363, 0.17040668,
         0.17107234, 0.16847329, 0.16519647, 0.15688567, 0.16278582,
         0.12144523]),
  'mean_score_time': array([0.01287313, 0.01149611, 0.0087009 , 0.01467533, 0.01231918,
         0.01342897, 0.01369061, 0.01284328, 0.01282845, 0.01117849,
         0.00986352, 0.01213007, 0.01225815, 0.01244712, 0.01328731,
         0.00714426]),
  'mean_test_score': array([0.5       , 0.67666667, 0.90166667, 0.905     , 0.5       ,
         0.71      , 0.91666667, 0.915     , 0.5       , 0.71      ,
         0.91      , 0.915     , 0.5       , 0.69666667, 0.93      ,
         0.87166667]),
  'param_colsample_bytree': masked_array(data=[0.6, 0.6, 0.6, 0.6, 0.7, 0.7, 0.7, 0.7, 0.8, 0.8, 0.8,
                     0.8, 0.9, 0.9, 0.9, 0.9],
               mask=[False, False, False, False, False, False, False, False,
                     False, F

We get a better score here, 0.93, with colsample_bytree = 0.9 and subsample = 0.8. Next, we check whether small changes around these values make a difference.

In [31]:
# Finer grid around the previous best: subsample 0.75-0.85 and
# colsample_bytree 0.85-0.95, both in steps of 0.05.
param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 90, 5)],
    'colsample_bytree': [i / 100.0 for i in range(85, 100, 5)],
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=177, max_depth=2, min_child_weight=4,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch5.fit(train[predictors], train[target])
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

({'mean_fit_time': array([0.16784439, 0.17867146, 0.16028147, 0.15596213, 0.16656108,
         0.16097832, 0.16586318, 0.16973195, 0.13142252]),
  'mean_score_time': array([0.00548177, 0.01743283, 0.01237841, 0.00921736, 0.0116673 ,
         0.01209936, 0.00858512, 0.00997343, 0.0094943 ]),
  'mean_test_score': array([0.8975    , 0.91      , 0.9       , 0.92166667, 0.93      ,
         0.91      , 0.92166667, 0.93      , 0.91      ]),
  'param_colsample_bytree': masked_array(data=[0.85, 0.85, 0.85, 0.9, 0.9, 0.9, 0.95, 0.95, 0.95],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'param_subsample': masked_array(data=[0.75, 0.8, 0.85, 0.75, 0.8, 0.85, 0.75, 0.8, 0.85],
               mask=[False, False, False, False, False, False, False, False,
                     False],
         fill_value='?',
              dtype=object),
  'params': [{'colsample_bytree': 0.85, 'subsample

The finer search finds other values of colsample_bytree and subsample that reach the same score. Since the score doesn't change, we can either keep the previous values or switch to these.

In [32]:
# Tuning reg_alpha over several orders of magnitude, with colsample_bytree now set to 0.9.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=177, max_depth=2, min_child_weight=4,
        gamma=0, subsample=0.8, colsample_bytree=0.9,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch6.fit(train[predictors], train[target])
gsearch6.cv_results_, gsearch6.best_params_, gsearch6.best_score_

({'mean_fit_time': array([0.18574228, 0.1556107 , 0.18238912, 0.15497713, 0.13775544]),
  'mean_score_time': array([0.01298919, 0.01092787, 0.01769209, 0.00774202, 0.00757098]),
  'mean_test_score': array([0.93      , 0.93      , 0.91333333, 0.91666667, 0.5       ]),
  'param_reg_alpha': masked_array(data=[1e-05, 0.01, 0.1, 1, 100],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'reg_alpha': 1e-05},
   {'reg_alpha': 0.01},
   {'reg_alpha': 0.1},
   {'reg_alpha': 1},
   {'reg_alpha': 100}],
  'rank_test_score': array([1, 1, 4, 3, 5], dtype=int32),
  'split0_test_score': array([0.86666667, 0.86666667, 0.9       , 0.86666667, 0.5       ]),
  'split1_test_score': array([0.98333333, 0.98333333, 0.91666667, 0.91666667, 0.5       ]),
  'split2_test_score': array([0.875, 0.875, 0.875, 0.875, 0.5  ]),
  'split3_test_score': array([1. , 1. , 1. , 1. , 0.5]),
  'split4_test_score': array([0.925, 0.925, 0.875, 0.925, 0.5

As in the previous section, the best score is unchanged, here with reg_alpha = 1e-05. We can keep this value, but let's also try other values around it in case we can get a better score.

In [33]:
# Finer reg_alpha grid concentrated on small values (0 up to 0.05).
param_test7 = {
    'reg_alpha': [0, 1e-05, 0.001, 0.01, 0.05],
}
# `iid` is deliberately not passed: scikit-learn deprecated it in 0.22 and
# removed it in 0.24, where passing it raises a TypeError. Scores are now always
# the plain mean over folds, which is what iid=False asked for.
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=177, max_depth=2, min_child_weight=4,
        gamma=0, subsample=0.8, colsample_bytree=0.9,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test7,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)
gsearch7.fit(train[predictors], train[target])
gsearch7.cv_results_, gsearch7.best_params_, gsearch7.best_score_

({'mean_fit_time': array([0.17843256, 0.158849  , 0.18235044, 0.13709173, 0.14682889]),
  'mean_score_time': array([0.0161624 , 0.01249113, 0.0159462 , 0.01294894, 0.00783935]),
  'mean_test_score': array([0.93      , 0.93      , 0.93      , 0.93      , 0.91666667]),
  'param_reg_alpha': masked_array(data=[0, 1e-05, 0.001, 0.01, 0.05],
               mask=[False, False, False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'reg_alpha': 0},
   {'reg_alpha': 1e-05},
   {'reg_alpha': 0.001},
   {'reg_alpha': 0.01},
   {'reg_alpha': 0.05}],
  'rank_test_score': array([1, 1, 1, 1, 5], dtype=int32),
  'split0_test_score': array([0.86666667, 0.86666667, 0.86666667, 0.86666667, 0.86666667]),
  'split1_test_score': array([0.98333333, 0.98333333, 0.98333333, 0.98333333, 0.91666667]),
  'split2_test_score': array([0.875, 0.875, 0.875, 0.875, 0.875]),
  'split3_test_score': array([1., 1., 1., 1., 1.]),
  'split4_test_score': array([0.925, 0.925, 0.925, 0.925, 0.

The score doesn't change, so we can keep the old value. Let's now apply the tuned parameters to the model to see the improvement.

In [34]:
# Refit with the tuned tree and sampling parameters; modelfit's CV step again
# chooses n_estimators, with 1000 as the upper bound.
# NOTE(review): reg_alpha=0.005 was not one of the values in either reg_alpha
# grid above, and the preceding text says the earlier value is kept -- confirm
# which value is intended.
xgb3 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=2,
    min_child_weight=4,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.005,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb3, train, predictors)


Model Report
Accuracy : 0.88
AUC Score (Train): 0.921266


We obtain a better accuracy here. Even though the AUC score decreases, it is still a good score. The next step is to decrease the learning rate and increase the number of estimators, to see if we get something better.

In [35]:
# Same tuned parameters as the previous model, but with a learning rate ten
# times smaller and a higher ceiling on boosting rounds (5000) for modelfit's
# CV step to choose from.
xgb4 = XGBClassifier(
    learning_rate=0.01,
    n_estimators=5000,
    max_depth=2,
    min_child_weight=4,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.9,
    reg_alpha=0.005,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
modelfit(xgb4, train, predictors)


Model Report
Accuracy : 0.88
AUC Score (Train): 0.924513


The accuracy remains unchanged, with a slight change in the AUC score, so we will keep the last model.