In [72]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, StackingClassifier

In [45]:
# Load the training, validation and evaluation splits (read in that order).
_split_paths = (
    'Test_Datasets/11_Bio_Cat2_Avg_training.csv',
    'Test_Datasets/11_Bio_Cat2_Avg_validation.csv',
    'Test_Datasets/11_Bio_Cat2_Avg_evaluation.csv',
)
bio_av_tr, bio_av_vl, bio_av_ev = [pd.read_csv(path) for path in _split_paths]

In [46]:
# Function to extract features and target variables

def x_y(df, target='0'):
    """Split a DataFrame into a feature table and a target column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the target column alongside the feature columns.
    target : str, default '0'
        Label of the target column. The default keeps the column label
        that the existing call sites in this notebook rely on.

    Returns
    -------
    x : pandas.DataFrame
        Copy of ``df`` without the ``target`` column (``df`` itself is
        left unmodified).
    y : pandas.Series
        The ``target`` column of ``df``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    y = df[target]
    x = df.drop(columns=[target])
    return x, y

In [47]:
# Separate features (x_*) from the target (y_*) for each of the three splits.
(x_ba_tr, y_ba_tr), (x_ba_vl, y_ba_vl), (x_ba_ev, y_ba_ev) = [
    x_y(frame) for frame in (bio_av_tr, bio_av_vl, bio_av_ev)
]
#x_ba_tr.head()

In [48]:
# Function to train and evaluate model on dataset
import time

def scorer(model, x_ts, y_ts, name = 'Model'):
    """Print ``name`` followed by the model's score on a dataset.

    ``model`` must expose ``score(x, y)``; its result is printed with four
    decimal places. Nothing is returned.
    """
    accuracy = model.score(x_ts, y_ts)
    report = ' {} \n Score: {:6.4f}'.format(name, accuracy)
    print(report)

def trainer(model, x_tr, y_tr, x_vl, y_vl, x_ev, y_ev, name='Model'):
    """Fit ``model`` on the training split and report its scores.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(x, y)`` and ``score(x, y)``.
    x_tr, y_tr : training features / target passed to ``model.fit``.
    x_vl, y_vl : validation features / target, scored after fitting.
    x_ev, y_ev : evaluation features / target, scored after fitting.
    name : str, default 'Model'
        Label used in the printed score headers.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    st = time.time()
    model.fit(x_tr, y_tr)
    print('Training time: {:6.4f} s\n'.format(time.time()-st))
    # Score the estimator that was just fitted. This function previously
    # referenced `mod_gs`, a name that only exists inside trainer_gs, which
    # raised NameError (or scored an unrelated leftover global).
    scorer(model, x_vl, y_vl, '\n{} : Cross Validation Score:'.format(name))
    scorer(model, x_ev, y_ev, '{} : Evaluation Set Score:'.format(name))
    return model



In [49]:
# Function to train with GridSearch

def trainer_gs(mod, params, x_tr, y_tr, x_vl, y_vl, x_ev, y_ev, cvl, name='Model'):
    """Grid-search ``mod`` over ``params`` and report the outcome.

    A ``GridSearchCV(mod, params, cv=cvl)`` is fitted on ``(x_tr, y_tr)``.
    The elapsed time and the best parameter combination are printed, then the
    fitted search object is scored on ``(x_vl, y_vl)`` and ``(x_ev, y_ev)``
    through ``scorer``.

    Returns the search's ``best_estimator_``.
    """
    t0 = time.time()
    search = GridSearchCV(mod, params, cv=cvl)
    search.fit(x_tr, y_tr)
    elapsed = time.time() - t0
    print('Training Time : {:6.4f} seconds'.format(elapsed))
    print('\nOptimal parameter:\n', search.best_params_)

    best_model = search.best_estimator_

    # Note: the header printed for the validation split reads
    # "Cross Validation Score" although it is scored on (x_vl, y_vl).
    reports = (
        (x_vl, y_vl, '\n{} : Cross Validation Score:'.format(name)),
        (x_ev, y_ev, '{} : Evaluation Set Score:'.format(name)),
    )
    for features, labels, header in reports:
        scorer(search, features, labels, header)
    return best_model

In [29]:
# from sklearn.linear_model import LogisticRegression

# lr = LogisticRegression(max_iter=500)

# lr = trainer(lr, x_ba_tr, y_ba_tr)

Training time: 1.8211 s



In [31]:
#lr = trainer(lr, x_ba_vl, y_ba_vl)

# NOTE(review): `lr` must already be a fitted model in the kernel; the cell
# above that would create and train it is commented out.
# Report its score on the validation split, then on the evaluation split.
for _features, _labels in ((x_ba_vl, y_ba_vl), (x_ba_ev, y_ba_ev)):
    scorer(lr, _features, _labels)

 Model 
 Score: 0.9875
 Model 
 Score: 0.8200


In [63]:
# Function to pickle a trained model to an output.pickle file
import pickle

def pkl_out(model, name='Model'):
    """Pickle ``model`` to ``Pkls/<name>.pickle`` and print 'Success'.

    Parameters
    ----------
    model : object
        Any picklable object (here: a trained estimator).
    name : str, default 'Model'
        File stem; the object is written to ``Pkls/<name>.pickle`` relative
        to the current working directory.

    Raises
    ------
    OSError
        If the file cannot be opened for writing (for example when the
        ``Pkls`` directory does not exist).
    """
    # The context manager closes the file even when pickle.dump raises,
    # so a failed dump no longer leaves an open handle behind.
    with open("Pkls/{}.pickle".format(name), "wb") as op_:
        pickle.dump(model, op_)
    print('Success')
    
# Function to extract a trained model from a pickle file

def pkl_in(name):
    """Load and return the object stored in ``Pkls/<name>.pickle``.

    Parameters
    ----------
    name : str
        File stem; the object is read from ``Pkls/<name>.pickle`` relative
        to the current working directory.

    Returns
    -------
    The unpickled object.

    Raises
    ------
    OSError
        If the file cannot be opened for reading.
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by a trusted source.
    # The context manager closes the handle, which was previously left open.
    with open("Pkls/{}.pickle".format(name), "rb") as ip_:
        mod = pickle.load(ip_)
    return mod


In [None]:
# From here on, we tune several classifiers over a range of parameters to maximise accuracy, and then combine them into ensembles.



In [54]:
# 1. Logistic Regression

# No active cell in this notebook imports LogisticRegression (the only import
# of it is commented out), so import it here to keep this cell runnable on a
# fresh kernel.
from sklearn.linear_model import LogisticRegression

# Grid-search the iteration cap with 5-fold cross-validation.
lr = LogisticRegression()
params_lr = {'max_iter':[300, 500, 750]}

lr_f = trainer_gs(lr, params_lr, x_ba_tr, y_ba_tr, x_ba_vl, y_ba_vl, x_ba_ev, y_ba_ev, 5, 'Logistic Regression')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training Time : 25.4586 seconds

Optimal parameter:
 {'max_iter': 300}
 
Logistic Regression : Cross Validation Score: 
 Score: 0.9875
 Logistic Regression : Evaluation Set Score: 
 Score: 0.8200


In [55]:
# 2. K Nearest Neighbors

# Grid-search the number of neighbours over 1..24 with 5-fold cross-validation.
knn = KNeighborsClassifier()
params_knn = {'n_neighbors': np.arange(1,25)}

knn_f = trainer_gs(
    mod=knn, params=params_knn,
    x_tr=x_ba_tr, y_tr=y_ba_tr,
    x_vl=x_ba_vl, y_vl=y_ba_vl,
    x_ev=x_ba_ev, y_ev=y_ba_ev,
    cvl=5, name='K Nearest Neighbors',
)

Training Time : 310.5012 seconds

Optimal parameter:
 {'n_neighbors': 2}
 
K Nearest Neighbors : Cross Validation Score: 
 Score: 0.9563
 K Nearest Neighbors : Evaluation Set Score: 
 Score: 0.7664


In [56]:
# 3. Random Forest

# A fixed random_state makes the forest (and therefore the selected
# n_estimators and the pickled model) reproducible across kernel restarts;
# without it every run can pick a different "optimal" value.
rf = RandomForestClassifier(random_state=42)
params_rf = {'n_estimators':[10, 50, 100, 150, 200, 250]}
rf_f = trainer_gs(rf, params_rf, x_ba_tr, y_ba_tr, x_ba_vl, y_ba_vl, x_ba_ev, y_ba_ev, 5, 'Random Forest')

Training Time : 84.9117 seconds

Optimal parameter:
 {'n_estimators': 150}
 
Random Forest : Cross Validation Score: 
 Score: 0.9750
 Random Forest : Evaluation Set Score: 
 Score: 0.7243


In [57]:
# 4. Support Vector

# Grid-search the solver iteration cap with 5-fold cross-validation.
sv = SVC()
params_sv = {'max_iter':[300, 400, 500, 600, 750]}

sv_f = trainer_gs(
    mod=sv, params=params_sv,
    x_tr=x_ba_tr, y_tr=y_ba_tr,
    x_vl=x_ba_vl, y_vl=y_ba_vl,
    x_ev=x_ba_ev, y_ev=y_ba_ev,
    cvl=5, name='Support Vector',
)



Training Time : 49.9612 seconds

Optimal parameter:
 {'max_iter': 300}
 
Support Vector : Cross Validation Score: 
 Score: 0.9812
 Support Vector : Evaluation Set Score: 
 Score: 0.7936


In [64]:
# Pickling final tuned models

# Each tuned estimator is written to Pkls/<file stem>.pickle, in this order.
for _estimator, _stem in (
    (lr_f, "LogReg"),
    (knn_f, "KNeighbors"),
    (rf_f, "RandomForest"),
    (sv_f, "SupportVector"),
):
    pkl_out(_estimator, _stem)

Success
Success
Success
Success


In [65]:
keys = ['lr', 'knn', 'rf', 'sv']

# (name, estimator) pairs consumed by the ensemble classifiers below.
models = list(zip(keys, (lr_f, knn_f, rf_f, sv_f)))

In [77]:
# Hard-voting ensemble built from the (name, estimator) pairs in `models`.
vot_h = VotingClassifier(estimators=models, voting='hard')

# Time the fit on the training split.
st = time.time()
vot_h.fit(x_ba_tr, y_ba_tr)
print('Training time: {:6.2f} s'.format(time.time() - st))

# Report the ensemble's score on the evaluation split.
scorer(vot_h, x_ba_ev, y_ba_ev, name='Ensemble: Hard voting')


# st=time.time()
# ens_s.fit(x_ba_tr, y_ba_tr)
# print('Training time: {:6.2f} s'.format(time.time()-st))

#scorer(ens_s, x_ba_ev, y_ba_ev, 'Ensemble: Soft voting')



Training time:  14.33 s
 Ensemble: Hard voting 
 Score: 0.7893


In [74]:
# Stacking ensemble built from the (name, estimator) pairs in `models`.
stk = StackingClassifier(models)

# A stray `x_ba_trvl.dropna(axis=0)` statement was removed here: `x_ba_trvl`
# is not defined anywhere in this notebook (NameError on a fresh kernel) and
# the result of the call was discarded, so it had no effect on the fit below.

# Time the fit on the training split.
st=time.time()
stk.fit(x_ba_tr, y_ba_tr)
print('Training time: {:6.2f} s'.format(time.time()-st))

# Report the ensemble's score on the evaluation split.
scorer(stk, x_ba_ev, y_ba_ev, 'Ensemble: Stacking')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Training time:  87.27 s
 Ensemble: Stacking 
 Score: 0.8100


In [78]:
# Persist both fitted ensembles under Pkls/, voting first and stacking second.
for _ensemble, _stem in ((vot_h, "Voting"), (stk, "Stacked")):
    pkl_out(_ensemble, _stem)

Success
Success
