In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score,classification_report,confusion_matrix,log_loss

# Loading data

In [2]:
# Read the labelled training set; the first CSV column becomes the row index.
df = pd.read_csv(
    'data/training_set.csv',
    index_col=0,
)
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X49,X50,X51,X52,X53,X54,X55,X56,X57,Y
0,0.0,0.0,4.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.342,0.0,0.0,1.2,2,12,0
1,0.0,0.56,0.56,0.0,1.12,0.56,2.25,0.0,0.0,0.56,...,0.0,0.083,0.0,0.503,0.0,0.083,16.304,148,375,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,5,0
3,0.64,0.0,0.64,0.0,1.93,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.462,0.37,0.0,2.44,22,122,1
4,0.58,0.0,0.0,35.46,0.58,0.0,0.58,0.58,0.0,0.0,...,0.0,0.0,0.0,0.239,0.239,0.0,3.338,123,207,1


In [3]:
# Read the test set (features only); the first CSV column becomes the row index.
test_data = pd.read_csv(
    'data/test_set.csv',
    index_col=0,
)
test_data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X48,X49,X50,X51,X52,X53,X54,X55,X56,X57
0,0.7,0.0,0.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.105,0.0,0.0,2.342,47,89
1,0.0,0.0,0.84,0.0,0.84,0.0,0.84,0.0,0.0,0.0,...,0.0,0.0,0.388,0.0,0.776,0.129,0.0,10.375,168,249
2,0.46,0.3,0.46,0.0,0.05,0.12,0.05,0.28,0.43,0.74,...,0.0,0.0,0.065,0.0,0.325,0.756,0.153,5.891,193,3040
3,0.1,0.2,1.01,0.0,0.8,0.8,0.5,0.0,0.8,0.1,...,0.0,0.0,0.11,0.0,0.49,0.158,0.015,8.55,669,1351
4,0.0,0.0,0.72,0.0,0.72,0.0,0.72,0.0,0.0,0.0,...,0.0,0.0,0.364,0.0,0.729,0.121,0.0,7.781,32,249


# Splitting into X and y

In [4]:
# Separate the target column 'Y' from the predictor columns.
y = df['Y']
X = df.drop(columns=['Y'])

# Train / Validation Split

In [5]:
# Stratified 80/20 hold-out split with a fixed seed.
# NOTE: X_test holds the validation features that pair with y_val; it is not
# the separately loaded test_data frame.
X_train, X_test, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Using RandomForestClassifier for feature selection

In [6]:
# Fit a random forest on the training split only, purely to rank the features
# by importance. The seed is fixed so the ranking -- and therefore the set of
# selected features used by every later cell -- is the same on each run.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
clf.fit(X_train, y_train)
feature_importance = clf.feature_importances_

In [7]:
# Pair each column name with its importance and order the pairs from most to
# least important (stable sort, so ties keep their original column order).
fi = sorted(
    zip(X.columns, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)
# Names of the 30 highest-ranked features
top_features = [name for name, _score in fi[:30]]

In [8]:
# Keep only the selected feature columns in the train and validation splits.
X_train_dash = X_train.loc[:, top_features]
X_test_dash = X_test.loc[:, top_features]

In [9]:
# Apply the same column selection to the separately loaded test set.
test_data_dash = test_data.loc[:, top_features]

# Standardizing our data

In [10]:
# Standardize the selected features with StandardScaler.
# The scaler is fitted on the training split only, and every transform reads
# from the unscaled X_train / X_test frames, so re-running this cell gives the
# same result instead of re-fitting the scaler on already scaled data.
scaler = StandardScaler()
scaler.fit(X_train[top_features])

# Keep the original row index so the scaled frames stay aligned (by label)
# with y_train / y_val instead of being reset to 0..n-1.
X_train_dash = pd.DataFrame(
    scaler.transform(X_train[top_features]),
    columns=top_features,
    index=X_train.index,
)
X_test_dash = pd.DataFrame(
    scaler.transform(X_test[top_features]),
    columns=top_features,
    index=X_test.index,
)

In [11]:
# Scale the test set with the scaler that was fitted on the training split.
# Reading from the unscaled test_data (rather than overwriting test_data_dash
# with a transform of itself) keeps this cell idempotent: re-running it does
# not scale the data a second time. The original row index is preserved.
test_data_dash = pd.DataFrame(
    scaler.transform(test_data[top_features]),
    columns=top_features,
    index=test_data.index,
)

# Training 

# Xgboost

In [12]:
# Train an XGBoost classifier on the scaled, selected features.
xgb_params = dict(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.15,
    colsample_bytree=1,
    subsample=1,
    reg_alpha=0.3,
    gamma=10,
    n_jobs=-1,
    eval_metric='logloss',
    use_label_encoder=False,
)
classifier = XGBClassifier(**xgb_params)
classifier.fit(X_train_dash, y_train)

# Hard labels and class-1 probabilities (column 1 of predict_proba) for the
# training and validation splits.
y_train_pred = classifier.predict(X_train_dash)
y_val_pred = classifier.predict(X_test_dash)
y_train_prob = classifier.predict_proba(X_train_dash)[:, 1]
y_val_prob = classifier.predict_proba(X_test_dash)[:, 1]

# Log loss on both splits
train_logloss = log_loss(y_train, y_train_prob)
val_logloss = log_loss(y_val, y_val_prob)
print(f'Train Logloss for the model -> {train_logloss}')
print(f'Validation Logloss for the model -> {val_logloss}')

print('-' * 50)

# ROC AUC on both splits
train_auc = roc_auc_score(y_train, y_train_prob)
val_auc = roc_auc_score(y_val, y_val_prob)
print(f'Train AUC Score for the model -> {train_auc}')
print(f'Validation AUC Score for the model -> {val_auc}')

Train Logloss for the model -> 0.14307064312877005
Validation Logloss for the model -> 0.15659173915300834
--------------------------------------------------
Train AUC Score for the model -> 0.988952325096344
Validation AUC Score for the model -> 0.9873889936567805


# Predictions on Test data

In [13]:
# Predicted class labels for the scaled test set, printed as a raw array.
y_test_pred = classifier.predict(test_data_dash)
print(y_test_pred)

[0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0
 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1
 1 1 0 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 0 0 1 0
 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0
 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0 1 0
 0 1 0 0 1 1 0 1 1 1 1 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1
 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1
 1 0 0 1 0 1 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 1 0 1 0 0 1 0 0 1 0 0 1 1 0
 0 0 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1
 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1
 1 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0
 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 1 0
 0 0 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 0 1 1 1 1 1 0
 0 1 0 0 0 0 0 0 0 0 1 0 

In [14]:
# Class-1 probabilities (second column of predict_proba) for the scaled test
# set, printed as a raw array.
y_test_prob = classifier.predict_proba(test_data_dash)[:,1]
print(y_test_prob)

[0.04271409 0.9832941  0.9784445  0.9707199  0.9832941  0.25391334
 0.9813176  0.7845885  0.9663585  0.98225135 0.01083483 0.06842812
 0.21198574 0.01833995 0.05076724 0.06760822 0.02539814 0.16887471
 0.9663585  0.02184766 0.97454685 0.9735979  0.9714138  0.01123517
 0.72341365 0.06946245 0.11394848 0.85844535 0.06946245 0.01146475
 0.8965603  0.05694852 0.01833995 0.05694852 0.01179442 0.02831707
 0.46751907 0.02849386 0.61065644 0.47207215 0.9766541  0.13998061
 0.16887471 0.13001329 0.05054418 0.9363092  0.06672557 0.02184766
 0.05097714 0.97791696 0.85415566 0.11893313 0.0121338  0.94125766
 0.97723126 0.9777154  0.02660294 0.02445995 0.46172813 0.95680577
 0.9787588  0.143574   0.287279   0.18906485 0.40167016 0.9629246
 0.89245486 0.7384918  0.1506621  0.05694852 0.02674611 0.8565982
 0.05694852 0.59294015 0.89968246 0.9627089  0.10170943 0.01059248
 0.31736988 0.05694852 0.07162196 0.9541347  0.92554635 0.9702992
 0.05694852 0.01833995 0.05704243 0.05694852 0.96765643 0.0133679