In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [3]:
df = pd.read_csv('data/aggregated_data/combined_data.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size = 0.5, random_state = 0)

print(len(X_train.index))
print(len(X_test.index))
print(len(X_valid.index))

31004
3875
3876


In [4]:
from sklearn.model_selection import GridSearchCV

model = xgb.XGBClassifier(objective='binary:logistic')

In [5]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Precision = {}".format(metrics.precision_score(y_test, y_pred)))
print("Recall = {}".format(metrics.recall_score(y_test, y_pred)))
print("Accuracy = {}".format(metrics.accuracy_score(y_test, y_pred)))

Precision = 0.776930409914204
Recall = 0.45252637423653524
Accuracy = 0.6851612903225807


In [8]:
%%time
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

pl_random_forest = Pipeline(steps=[('random_forest', RandomForestClassifier(n_estimators=100))])
scores = cross_val_score(pl_random_forest, X, y, cv=10, scoring='accuracy')
print('Accuracy for RandomForest : ', scores.mean())

Accuracy for RandomForest :  0.6666474583042045
CPU times: user 48.9 s, sys: 1.11 s, total: 50.1 s
Wall time: 49.7 s


In [14]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pl_log_reg = Pipeline(steps=[('log_reg', LogisticRegression(solver='saga', max_iter=2000))])
scores = cross_val_score(pl_log_reg, X, y, cv=10,scoring='accuracy')
print('Accuracy for Logistic Regression: ', scores.mean())

Accuracy for Logistic Regression:  0.6402772262725124
CPU times: user 3min 13s, sys: 2.07 s, total: 3min 15s
Wall time: 3min 21s


In [19]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score

pl_svm = Pipeline(steps=[('pl_svm', LinearSVC(max_iter=4000))])
scores = cross_val_score(pl_svm, X, y, cv=10,scoring='accuracy')
print('Accuracy for Linear SVM : ', scores.mean())



KeyboardInterrupt: 

In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

df = pd.read_csv('data/aggregated_data/combined_data.csv')
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

params = {
    "max_depth": [5, 8, 10],
    "criterion": ["gini", "entropy"]
}

clf = DecisionTreeClassifier()
model = GridSearchCV(clf, params, cv = 10, scoring='accuracy')
model.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=DecisionTreeClassifier(class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort=False, random_state=None,
                                              splitter='best'),
             iid='warn', n_jobs=None,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 8, 10]},
             pre_dispatch

In [22]:
y_pred = model.predict(X_test)

print("Precision = {}".format(metrics.precision_score(y_test, y_pred)))
print("Recall = {}".format(metrics.recall_score(y_test, y_pred)))
print("Accuracy = {}".format(metrics.accuracy_score(y_test, y_pred)))

model.best_params_

Precision = 0.7472005429250085
Recall = 0.488898756660746
Accuracy = 0.6855196614717721


{'criterion': 'gini', 'max_depth': 10}

In [None]:
params = {
    "max_depth": [5, 8, 10, 15, 20],
    "gamma": [0.1, 0.2, 0.3]
}

clf = xgb.XGBClassifier(
    objective='binary:logistic', 
    learning_rate = 0.1,
    nthread = 4
)

model = GridSearchCV(clf, params, cv = 10, scoring='accuracy')
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

print("Precision = {}".format(metrics.precision_score(y_test, y_pred)))
print("Recall = {}".format(metrics.recall_score(y_test, y_pred)))
print("Accuracy = {}".format(metrics.accuracy_score(y_test, y_pred)))

model.best_params_

In [None]:
from sklearn.ensemble import RandomForestClassifier

params = {
    "max_depth": [5, 8, 10, 15, 20],
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 50, 100, 200]
}

clf = RandomForestClassifier()

model = GridSearchCV(clf, params, cv = 10, scoring='accuracy')
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

print("Precision = {}".format(metrics.precision_score(y_test, y_pred)))
print("Recall = {}".format(metrics.recall_score(y_test, y_pred)))
print("Accuracy = {}".format(metrics.accuracy_score(y_test, y_pred)))

model.best_params_