### Loading Data

In [None]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,
)

Mounted at /content/drive


In [None]:
# Change the working directory to the case-study folder on the mounted Drive;
# the relative file names opened further down are resolved against it.
%cd /content/drive/MyDrive/Casestudy-1

/content/drive/MyDrive/Casestudy-1


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import pickle
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingCVClassifier
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")


from sklearn.metrics import log_loss




In [None]:
# Load the pickled dataset (`data_fe`) from the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open('data_fe', 'rb') as fh:
    dfe = pickle.load(fh)

### Data Splitting

In [None]:
# Replace the `event` column with the integer class codes produced by LabelEncoder.
le = LabelEncoder()
le.fit(dfe['event'])
dfe['event'] = le.transform(dfe['event'])

In [None]:
# Separate the target from the features: `pop` returns the `event` column
# and removes it from `dfe` in place.
y = dfe.pop('event')

In [None]:
# Hold out 10% of the rows as a validation set, stratified on the target,
# with a fixed seed so the split is repeatable.
X_tr, X_cv, y_tr, y_cv = train_test_split(
    dfe,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

### Random Model

In [None]:
#https://stackoverflow.com/questions/18659858/generating-a-list-of-random-numbers-summing-to-1#:~:text=The%20best%20way%20to%20do,are%20totally%20random%20this%20way.&text=Dividing%20each%20number%20by%20the,pair%20x%2Cy%20%3D%20random.
def random(X, y, n_classes=4):
    """Return the log loss of a baseline that predicts random class probabilities.

    Parameters
    ----------
    X : sized collection
        Only its length is used: one probability row is generated per sample.
    y : array-like
        True labels, passed unchanged to ``log_loss``.
    n_classes : int, default 4
        Number of probability columns generated per sample (previously
        hard-coded to 4).

    Returns
    -------
    float
        ``log_loss(y, yp)`` where each row of ``yp`` is a vector of uniform
        random numbers normalised to sum to 1.
    """
    # Draw the whole probability matrix in one call instead of row by row.
    yp = np.random.random((len(X), n_classes))
    # Normalise each row so it forms a valid probability distribution.
    yp /= yp.sum(axis=1, keepdims=True)
    return log_loss(y, yp)

In [None]:
# Baseline: log loss of random probability predictions over the full dataset.
random(X=dfe, y=y)

1.646011053392743

### Logistic Regression

In [None]:
# Manual grid search over penalty type and inverse regularisation strength C;
# train and CV log loss are printed for every combination.
C = [.001, .01, .1]
Penalty = ['l1', 'l2']
for penalty in tqdm(Penalty):
    print("With Penalty:", penalty)
    for c_value in C:
        clf = LogisticRegression(C=c_value, penalty=penalty, solver='liblinear')
        clf.fit(X_tr, y_tr)
        yp_tr = clf.predict_proba(X_tr)
        yp_cv = clf.predict_proba(X_cv)
        print("C:", c_value)
        print("Train logloss: ", log_loss(y_tr, yp_tr))
        print("CV logloss: ", log_loss(y_cv, yp_cv))
    print("________________________________________________________")

The best log loss achieved with Logistic Regression is 0.89.

In [None]:
# Fit logistic regression with the L1 penalty and C=0.1 on the training split.
clf = LogisticRegression(
    C=0.1,
    penalty='l1',
    solver='liblinear',
)
clf.fit(X=X_tr, y=y_tr)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Persist the fitted logistic regression; the context manager flushes and
# closes the file handle, which a bare open() inside the call never did.
with open('log_reg', 'wb') as fh:
    pickle.dump(clf, fh)

### Decision Tree

In [None]:
# Sweep the maximum tree depth and report train / CV log loss for each value.
max_depth = [5, 10, 20, 30]
for depth in tqdm(max_depth):  # the original `for` line was missing its colon (SyntaxError)
    clf = DecisionTreeClassifier(max_depth=depth)
    clf.fit(X_tr, y_tr)
    yp_tr = clf.predict_proba(X_tr)
    yp_cv = clf.predict_proba(X_cv)
    print('For max_depth of', depth)
    print("Train logloss: ", log_loss(y_tr, yp_tr))
    print("CV logloss: ", log_loss(y_cv, yp_cv))
    print('____________________________________________')

  0%|          | 0/4 [00:00<?, ?it/s]

For max_depth of 5
Train logloss:  0.6694439630795775


 25%|██▌       | 1/4 [02:04<06:13, 124.55s/it]

CV logloss:  0.6715730484825096
____________________________________________
For max_depth of 10
Train logloss:  0.3620111945296545


 50%|█████     | 2/4 [05:55<05:12, 156.49s/it]

CV logloss:  0.36630413992267685
____________________________________________
For max_depth of 20
Train logloss:  0.06572062387880964


 75%|███████▌  | 3/4 [12:12<03:42, 222.53s/it]

CV logloss:  0.1257160472400046
____________________________________________
For max_depth of 30
Train logloss:  0.009126405480457858


100%|██████████| 4/4 [19:33<00:00, 293.48s/it]

CV logloss:  0.13230464350498783
____________________________________________





The best CV log loss is about 0.13 (0.1257), obtained with a maximum tree depth of 20.

In [None]:
# Fit a decision tree with max_depth=20 on the training split.
dt = DecisionTreeClassifier(max_depth=20)
dt.fit(X=X_tr, y=y_tr)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=20, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Persist the fitted decision tree; `with` guarantees the file is closed.
with open('dt', 'wb') as fh:
    pickle.dump(dt, fh)

### AdaBoost Classifier

In [None]:
# Compare AdaBoost with different numbers of estimators on train / CV log loss.
n = [50, 100]
for n_est in n:
    adb = AdaBoostClassifier(n_estimators=n_est)
    adb.fit(X_tr, y_tr)
    # Score the AdaBoost model just fitted. The original called `xgb.predict_proba`,
    # which evaluated a different (and, at this point in the notebook, undefined)
    # model, so both iterations reported the same numbers.
    yp_tr = adb.predict_proba(X_tr)
    yp_cv = adb.predict_proba(X_cv)
    print("With", n_est, "estimators")
    print("Train logloss: ", log_loss(y_tr, yp_tr))
    print("CV logloss: ", log_loss(y_cv, yp_cv))
    print('____________________________________________')

Train logloss:  0.4714267234535248
CV logloss:  0.47140811647737424
____________________________________________
Train logloss:  0.4714267234535248
CV logloss:  0.47140811647737424
____________________________________________


In [None]:
# Fit AdaBoost with 50 estimators on the training split.
adb = AdaBoostClassifier(
    n_estimators=50,
)
adb.fit(X=X_tr, y=y_tr)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [None]:
# Persist the fitted AdaBoost model; `with` guarantees the file is closed.
with open('adb', 'wb') as fh:
    pickle.dump(adb, fh)

### Random Forest

In [None]:
# Random forest with default hyperparameters: fit, then report train / CV log loss.
clf = RandomForestClassifier()
clf.fit(X_tr, y_tr)
yp_tr, yp_cv = (clf.predict_proba(part) for part in (X_tr, X_cv))
print("Train logloss: ", log_loss(y_tr, yp_tr))
print("CV logloss: ", log_loss(y_cv, yp_cv))

Train logloss:  0.027534669474056726
CV logloss:  0.08246289949394532


Since the Random Forest model gives the best score so far, we tune its hyperparameters to find the combination with the lowest log loss.

Hyperparameter tuning on Random forest

In [None]:
# Manual grid search over the number of trees and the maximum tree depth.
n_estimators = [20, 80, 100]
max_depth = [10, 20, 30]
for n_est in tqdm(n_estimators):
    print("With", n_est, "estimators")
    for depth in max_depth:
        # The original passed the two loop variables swapped
        # (n_estimators=<depth>, max_depth=<estimators>), so the printed labels
        # did not describe the model that was actually trained.
        clf = RandomForestClassifier(n_estimators=n_est, max_depth=depth)
        clf.fit(X_tr, y_tr)
        yp_tr = clf.predict_proba(X_tr)
        yp_cv = clf.predict_proba(X_cv)
        print("With tree depth of", depth)
        print("Train logloss: ", log_loss(y_tr, yp_tr))
        print("CV logloss: ", log_loss(y_cv, yp_cv))
    print('____________________________________________')

  0%|          | 0/3 [00:00<?, ?it/s]

With 20 estimators
With tree depth of 10
Train logloss:  0.3341263239422819
CV logloss:  0.3500493474013759
With tree depth of 20
Train logloss:  0.31636409932186677
CV logloss:  0.3302590622860466
With tree depth of 30
Train logloss:  0.3215623517851082


 33%|███▎      | 1/3 [27:48<55:37, 1668.70s/it]

CV logloss:  0.3361050080253603
____________________________________________
With 80 estimators
With tree depth of 10
Train logloss:  0.027021071897901056
CV logloss:  0.08988155553903741
With tree depth of 20
Train logloss:  0.027142792841086893
CV logloss:  0.0820887237996315
With tree depth of 30
Train logloss:  0.028142926352770787


 67%|██████▋   | 2/3 [1:11:25<32:33, 1953.26s/it]

CV logloss:  0.08503707067207067
____________________________________________
With 100 estimators
With tree depth of 10
Train logloss:  0.027939019329282434
CV logloss:  0.09487499926135429
With tree depth of 20
Train logloss:  0.02382489971179354
CV logloss:  0.07296221431277189
With tree depth of 30
Train logloss:  0.027437726967649616


100%|██████████| 3/3 [1:53:58<00:00, 2279.56s/it]

CV logloss:  0.08256904374555427
____________________________________________





In [None]:
# Fit a random forest with max_depth=40 (other hyperparameters left at their defaults).
rf = RandomForestClassifier(
    max_depth=40,
)
rf.fit(X=X_tr, y=y_tr)

In [None]:
# Persist the fitted random forest; `with` guarantees the file is closed.
with open('best_clf', 'wb') as fh:
    pickle.dump(rf, fh)

## Stacking

In [None]:
# Unfitted estimators used as building blocks for the stacking ensembles below.
lr = LogisticRegression(
    C=0.1,
    penalty='l1',
    solver='liblinear',
)
dt = DecisionTreeClassifier(max_depth=20)
xgb = XGBClassifier(
    max_depth=5,
    verbose=1,
)
adb = AdaBoostClassifier(
    learning_rate=0.75,
    n_estimators=50,
)
rf = RandomForestClassifier(
    max_depth=40,
    n_jobs=-1,
    verbose=1,
)
svc = SVC(
    C=0.025,
    kernel='rbf',
    probability=True,
)

### 1.

In [None]:
#https://www.kaggle.com/thomasnelson/simple-stacking-classifier-for-beginners
# First stack: logistic regression, decision tree and AdaBoost feed their class
# probabilities to a random-forest meta-classifier.
# Bound to `sclf1` (the original used `sclf`), because the cells that fit, score
# and pickle this stack all refer to `sclf1`.
sclf1 = StackingCVClassifier(classifiers=[lr, dt, adb],
                             use_probas=True,
                             meta_classifier=rf)

In [None]:
# Train the first stack on the raw NumPy arrays behind the training frame and labels.
sclf1.fit(X=X_tr.values, y=y_tr.values)

In [None]:
# Score the first stack. It was fitted on NumPy arrays (`.values`), so predict
# on arrays as well to keep the input type consistent between fit and predict.
yp_tr = sclf1.predict_proba(X_tr.values)
yp_cv = sclf1.predict_proba(X_cv.values)
print("Train logloss: ", log_loss(y_tr, yp_tr))
print("CV logloss: ", log_loss(y_cv, yp_cv))

Train logloss:  0.07848625725269495
CV logloss:  0.09339712085322266


In [None]:
# Persist the first stacking model; `with` guarantees the file is closed.
with open('stacking', 'wb') as fh:
    pickle.dump(sclf1, fh)

### 2.

In [None]:
# Second stack: decision tree, XGBoost and SVC feed their class probabilities
# to a random-forest meta-classifier.
sclf2 = StackingCVClassifier(
    classifiers=[dt, xgb, svc],
    meta_classifier=rf,
    use_probas=True,
)

In [None]:
# Train the second stack on the raw NumPy arrays behind the training frame and labels.
sclf2.fit(X=X_tr.values, y=y_tr.values)

In [None]:
# Class probabilities from the second stack. It was fitted on NumPy arrays
# (`.values`), so predict on arrays as well to keep fit and predict consistent.
yp_tr2 = sclf2.predict_proba(X_tr.values)
yp_cv2 = sclf2.predict_proba(X_cv.values)

In [None]:
# Report train and CV log loss for the second stack, in that order.
for label, truth, proba in (
    ("Train logloss: ", y_tr, yp_tr2),
    ("CV logloss: ", y_cv, yp_cv2),
):
    print(label, log_loss(truth, proba))

Train logloss:  0.005130631969277596
CV logloss:  0.016359747904336603


In [None]:
# Persist the second stacking model; `with` guarantees the file is closed.
with open('stack2', 'wb') as fh:
    pickle.dump(sclf2, fh)

### 3.

In [None]:
# Third stack: logistic regression, random forest and AdaBoost as base models,
# with logistic regression as the meta-classifier.
sclf3 = StackingCVClassifier(
    classifiers=[lr, rf, adb],
    meta_classifier=lr,
    use_probas=True,
)

In [None]:
# Train the third stack on the raw NumPy arrays behind the training frame and labels.
sclf3.fit(X=X_tr.values, y=y_tr.values)

In [None]:
# Class probabilities from the third stack. It was fitted on NumPy arrays
# (`.values`), so predict on arrays as well to keep fit and predict consistent.
yp_tr3 = sclf3.predict_proba(X_tr.values)
yp_cv3 = sclf3.predict_proba(X_cv.values)

In [None]:
# Report train and CV log loss for the third stack, in that order.
for label, truth, proba in (
    ("Train logloss: ", y_tr, yp_tr3),
    ("CV logloss: ", y_cv, yp_cv3),
):
    print(label, log_loss(truth, proba))

Train logloss:  0.003391153744715614
CV logloss:  0.023784883684558535


In [None]:
# Persist the third stacking model; `with` guarantees the file is closed.
with open('stack3', 'wb') as fh:
    pickle.dump(sclf3, fh)