In [1]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt


  from numpy.core.umath_tests import inner1d


In [2]:
def get_X_y(df):
    X = df.drop(['label', 'nodes', 'Shortest_Path', 'Source_InDegree', 'Source_OutDegree', 'Shared_In', 'Sink_InDegree'], axis=1)
    y = df['label']
    return X, y

def preprocess_df(df):
    from collections import Counter
    
    processed_df = df.copy()
    zero = Counter(processed_df.label.values)[0]
    un = Counter(processed_df.label.values)[1]
    n = zero - un
    processed_df['label'] = processed_df['label'].astype('category')
    if n > 0:
        processed_df = processed_df.drop(processed_df[processed_df.label == 0].sample(n=n, random_state=1).index)
    return processed_df.sample(frac=0.5)

In [3]:
data = preprocess_df(pd.read_csv('train_features_600k.csv'))

In [4]:
X, y = get_X_y(data)

In [5]:
X.shape

(143812, 3)

In [6]:
from sklearn.model_selection import train_test_split

x_train, x_validation, y_train, y_validation = train_test_split(X, y, test_size=0.2)

In [7]:
# Rescale data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_validation = scaler.transform(x_validation)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# use a full grid over all parameters
param_grid = {"max_depth": [3, 5],
              "max_features": ['auto', 'log2'],
              "min_samples_split": [50, 100, 150],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
              "n_estimators": [50,100,150],
              "min_samples_leaf": [50, 100]}

clf = RandomForestClassifier(n_jobs=-1)

# run random search
rf_random = RandomizedSearchCV(clf, param_distributions=param_grid, cv=5, iid=False)
rf_random.fit(x_train, y_train)

rf_random.best_estimator_

In [8]:
best_rf = RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=50,
            min_samples_split=150, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)
best_rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=5, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=50,
            min_samples_split=150, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [9]:
from sklearn.metrics import roc_auc_score
y_pred = best_rf.predict(x_validation)
y_prob = best_rf.predict_proba(x_validation)[:,1]

print("F1 Score: ", f1_score(y_validation, y_pred))
print("AUCROC Score: ", roc_auc_score(y_validation, y_prob))

F1 Score:  0.6486566195770784
AUCROC Score:  0.6178129230291503


In [None]:
from sklearn.model_selection import validation_curve

best_rf = RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=5, max_features='log2',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=50,
            min_samples_split=150, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

param_range = np.linspace(3.0, 5.0, num=3)
train_scores, test_scores = validation_curve(
    best_rf, X, y, param_range=param_range, param_name='max_depth',
    cv=5, scoring="accuracy", n_jobs=-1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with SVM")
plt.xlabel("Max Depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
lw = 2
plt.plot(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

fpr, tpr, _ = roc_curve(y_validation, y_prob, pos_label=1)

plt.figure()
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.show()

In [None]:
features = X.columns.values
importances = best_rf.feature_importances_
indices = (np.argsort(importances))[-20:]
plt.figure(figsize=(10,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='r', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# specify parameters via map
param_grid = {"max_depth": [3, 5],
              "subsample": [0.8, 1],
              "min_samples_split": [50, 100, 150],
              "colsample_bytree": [0.8, 1],
              "learning_rate": [0.05, 0.1, 0.15],
              "gamma": [0, 0.0001],
              "importance_type": ["gain", "cover"],
              "n_estimators": [50,100,150]}
default_params = {'n_jobs':-1, 'random_state':1234, 'verbosity':0, "objective":'binary:logistic'}

xgb_search = XGBClassifier(**default_params)
# run random search
xgb_random = RandomizedSearchCV(xgb_search, param_distributions=param_grid, cv=5, iid=False)
xgb_random.fit(x_train, y_train)

xgb_random.best_estimator_

In [None]:
xgb = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.8, gamma=0.0001,
       learning_rate=0.15, max_delta_step=0, max_depth=5,
       min_child_weight=1, min_samples_split=150, missing=None,
       n_estimators=150, n_jobs=-1, nthread=None,
       objective='binary:logistic', random_state=1234, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=0)
xgb.fit(x_train, y_train)

y_pred = xgb.predict(x_validation)
y_prob = xgb.predict_proba(x_validation)[:,1]

print("F1 Score: ", f1_score(y_validation, y_pred))
print("AUCROC Score: ", roc_auc_score(y_validation, y_prob))

In [None]:
import matplotlib.pyplot as plt

features = X.columns.values
importances = xgb.feature_importances_
indices = (np.argsort(importances))[-20:]
plt.figure(figsize=(10,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='r', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression

logit = LogisticRegression(random_state=1234, solver='saga', n_jobs=-1)
logit.fit(x_train, y_train)

y_pred = logit.predict(x_validation)
y_prob = logit.predict_proba(x_validation)[:,1]

print("F1 Score: ", f1_score(y_validation, y_pred))
print("AUCROC Score: ", roc_auc_score(y_validation, y_prob))

In [None]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB

bnb = BernoulliNB()
bnb.fit(x_train, y_train)

y_pred = bnb.predict(x_validation)
y_prob = bnb.predict_proba(x_validation)[:,1]

print("F1 Score: ", f1_score(y_validation, y_pred))
print("AUCROC Score: ", roc_auc_score(y_validation, y_prob))

In [None]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

y_pred = gnb.predict(x_validation)
y_prob = gnb.predict_proba(x_validation)[:,1]

print("F1 Score: ", f1_score(y_validation, y_pred))
print("AUCROC Score: ", roc_auc_score(y_validation, y_prob))

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                        n_estimators=100, random_state=1234)
ada.fit(x_train, y_train)

y_pred = ada.predict(x_validation)
y_prob = ada.predict_proba(x_validation)[:,1]

print("F1 Score: ", f1_score(y_validation, y_pred))
print("AUCROC Score: ", roc_auc_score(y_validation, y_prob))

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

# Create the RFE object and compute a cross-validated score.
# The "accuracy" scoring is proportional to the number of correct
# classifications
rfecv = RFECV(estimator=best_rf, step=1, cv=StratifiedKFold(2),
              scoring='precision')
rfecv.fit(x_train, y_train)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

In [11]:
test_features = pd.read_csv('test_features_600k.csv')

In [12]:
test_features = test_features.drop(['nodes', 'Shortest_Path', 'Source_InDegree', 'Source_OutDegree', 'Shared_In', 'Sink_InDegree'], axis=1)

In [13]:
test_features.head()

Unnamed: 0,JC,RA,AA
0,0.0,0.0,0.0
1,0.002618,0.000621,0.311809
2,0.0,0.0,0.0
3,0.063636,0.033795,2.865019
4,0.013423,0.007775,1.999527


In [15]:
x_test = test_features.to_numpy()

scaler = StandardScaler()
x_test = scaler.fit_transform(x_test)

In [None]:
y_test = logit.predict(x_test)

print(sum([1 for y in y_test if y==1]))

In [None]:
y_test = xgb.predict(x_test)

print(sum([1 for y in y_test if y==1]))

In [16]:
y_test = best_rf.predict(x_test)

print(sum([1 for y in y_test if y==1]))

1545


In [None]:
y_test = ada.predict(x_test)

print(sum([1 for y in y_test if y==1]))

In [17]:
y_prob = best_rf.predict_proba(x_test)[:,1]

row_list = []
for i in range(1, len(y_prob)+1):
    row = {}
    row['Id'] = i
    row['Predictions'] = y_prob[i-1]
    row_list.append(row)

predictions = pd.DataFrame(row_list)
predictions.to_csv('300k-rf-3cols.csv', index=False)