In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
import scipy.stats as st
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score,roc_auc_score,recall_score,classification_report,mean_squared_error,accuracy_score,confusion_matrix

#for getting the columns only
test=pd.read_csv("cleaned_data_test_feature_engineered.csv")


#loading the data
data_train_smote=pd.read_csv("full_train_after_smote_adasyn.csv")
data_test_smote=pd.read_csv("cleaned_data_test_feature_engineered.csv")


#dropping the feature engineered data from test

data_test_smote=data_test_smote.drop(["YEAR","INCIDENT_ID","MONTH","DAY","DATE"],axis=1)
data_train_smote=data_train_smote.drop(["INCIDENT_ID"],axis=1)

print(data_train_smote.shape,data_test_smote.shape)


#splitting into train and validation
train,val=train_test_split(data_train_smote,test_size=.20,random_state=42)
train.shape,val.shape


#splitting into x and y
Xfull=data_train_smote.drop(["MULTIPLE_OFFENSE"],axis=1)
yfull=data_train_smote["MULTIPLE_OFFENSE"]
print(Xfull.shape,yfull.shape)


Xtrain=train.drop(["MULTIPLE_OFFENSE"],axis=1)
ytrain=train["MULTIPLE_OFFENSE"]
print(Xtrain.shape,ytrain.shape)

Xval=val.drop(["MULTIPLE_OFFENSE"],axis=1)
yval=val["MULTIPLE_OFFENSE"]
print(Xval.shape,yval.shape)


(45565, 16) (15903, 15)
(45565, 15) (45565,)
(36452, 15) (36452,)
(9113, 15) (9113,)


In [2]:
lrc = LogisticRegression(class_weight="balanced")
rfc=RandomForestClassifier(class_weight="balanced")
adbc=AdaBoostClassifier()
bgc=BaggingClassifier()
gnb = GaussianNB()
knn=KNeighborsClassifier()
dtc = DecisionTreeClassifier()
bgcl_lrc = BaggingClassifier(base_estimator=lrc, random_state=42)
ab_rfc = AdaBoostClassifier(base_estimator=rfc,random_state=42)
ab_dtc = AdaBoostClassifier(base_estimator=dtc,random_state=42)
ab_nbc=  AdaBoostClassifier(base_estimator=gnb,random_state=42)
ab_lrc=  AdaBoostClassifier(base_estimator=lrc,random_state=42)
gbc=GradientBoostingClassifier()
ab_gbc=  AdaBoostClassifier(base_estimator=gbc,random_state=42)
xgbc=XGBClassifier()
ab_xgbc=  AdaBoostClassifier(base_estimator=xgbc,random_state=42)
lgbc=LGBMClassifier(class_weight="balanced")

In [3]:


models=[lrc,rfc,adbc,bgc,gnb,knn,dtc,bgcl_lrc,ab_rfc,ab_dtc,ab_nbc,ab_lrc,gbc,ab_gbc,xgbc,ab_xgbc,lgbc]
sctr,scte,auc,ps,rs=[],[],[],[],[]
def ensemble_for_train_val(X_train,X_test, y_train, y_test):
    for model in models:
            model.fit(X_train, y_train)
            y_test_pred = model.predict(X_test)
            y_test_pred_new=model.predict_proba(X_test)
            y_test_pred_new=y_test_pred_new[:,1]
            train_score=model.score(X_train,y_train)
            test_score=model.score(X_test,y_test)
            p_score=metrics.precision_score(y_test,y_test_pred)
            r_score=metrics.recall_score(y_test,y_test_pred)
            
            ac=metrics.roc_auc_score(y_test,y_test_pred_new)
            
            sctr.append(train_score)
            scte.append(test_score)
            ps.append(p_score)
            rs.append(r_score)
            auc.append(ac)
    return sctr,scte,auc,ps,rs


#checking the performance on train and validation
ensemble_for_train_val(Xtrain,Xval, ytrain, yval)
# 'ab_rf','ab_dt','ab_nb','ab_lr','bgcl_lr'
ensemble=pd.DataFrame({'names':['Logistic Regression','Random Forest','Ada boost','Bagging',
                                'Naive-Bayes','KNN','Decistion Tree',
                                'bagged LR',"adaboost rf","adaboost dtc","adaboost naive bayes",
                                "adaboost logistic regression","gradient boosting trees"
                                ,"adaboost gbc","xgboost","adaboost xgbc","ligh gbm"],
                       'auc_score':auc,'training':sctr,'testing':scte,'precision':ps,'recall':rs})
ensemble=ensemble.sort_values(by='auc_score',ascending=False).reset_index(drop=True)
ensemble

Unnamed: 0,names,auc_score,training,testing,precision,recall
0,adaboost gbc,0.999991,1.0,0.999232,0.9989,0.99956
1,ligh gbm,0.999955,1.0,0.998683,0.998898,0.998458
2,Random Forest,0.999927,1.0,0.99605,0.999114,0.992953
3,adaboost rf,0.999911,1.0,0.99605,0.998893,0.993173
4,gradient boosting trees,0.999756,0.995062,0.994294,0.996022,0.992513
5,xgboost,0.999747,0.994843,0.994294,0.995365,0.993173
6,Ada boost,0.999615,0.992703,0.993416,0.994919,0.991852
7,Bagging,0.999193,0.999506,0.997037,0.998014,0.996036
8,Decistion Tree,0.995611,1.0,0.995611,0.995596,0.995596
9,adaboost dtc,0.9955,1.0,0.995501,0.995813,0.995155


In [4]:
ensemble=ensemble.sort_values(by='recall',ascending=False).reset_index(drop=True)
ensemble

Unnamed: 0,names,auc_score,training,testing,precision,recall
0,adaboost xgbc,0.5,0.500521,0.498299,0.498299,1.0
1,adaboost gbc,0.999991,1.0,0.999232,0.9989,0.99956
2,ligh gbm,0.999955,1.0,0.998683,0.998898,0.998458
3,Bagging,0.999193,0.999506,0.997037,0.998014,0.996036
4,Decistion Tree,0.995611,1.0,0.995611,0.995596,0.995596
5,adaboost dtc,0.9955,1.0,0.995501,0.995813,0.995155
6,adaboost rf,0.999911,1.0,0.99605,0.998893,0.993173
7,xgboost,0.999747,0.994843,0.994294,0.995365,0.993173
8,Random Forest,0.999927,1.0,0.99605,0.999114,0.992953
9,gradient boosting trees,0.999756,0.995062,0.994294,0.996022,0.992513


In [5]:

def cross_val(fullx,fully,kfold,models,model_names,i=1,p=1):
    dictionary_cv={}
    cross_val_score_auc=[]
    for model in models:
        print(model_names[i-1])
        for train_index,test_index in kfold.split(fullx,fully):
            
            print('\n{} of kfold {}'.format(p,kf.n_splits))
            xtr,xvl = Xfull.loc[train_index],Xfull.loc[test_index]
            ytr,yvl = yfull[train_index],yfull[test_index]
            model.fit(xtr, ytr)
            pred=model.predict(xvl)
            print('roc_auc_score',roc_auc_score(yvl,pred))
            cross_val_score_auc.append(roc_auc_score(yvl,pred))
            p+=1
        temp_name=model_names[i-1]
        dictionary_cv[temp_name]=cross_val_score_auc
        cross_val_score_auc=[]
        p=1
        i+=1
    result_cross_validation=pd.DataFrame(list(dictionary_cv.items()))
    return result_cross_validation

kf = StratifiedKFold(n_splits=5,random_state=42,shuffle=True)

# models_to_cross_validate=[lrc,rfc]
# # models=
# model_names=['Logistic Regression','Random Forest',]

models_to_cross_validate=[lrc,rfc,adbc,bgc,gnb,knn,dtc,bgcl_lrc,
                          ab_rfc,ab_dtc,ab_nbc,ab_lrc,gbc,ab_gbc,xgbc,ab_xgbc,lgbc]
# models=
model_names=['Logistic Regression','Random Forest','Ada boost','Bagging',
                                'Naive-Bayes','KNN','Decistion Tree',
                                'bagged LR',"adaboost rf","adaboost dtc","adaboost naive bayes",
                                "adaboost logistic regression","gradient boosting trees",
             "adaboost gbc","xgboost","adaboost xgbc","light gbm"]

result_cv=cross_val(fullx=Xfull,fully=yfull,kfold=kf,
                    models=models_to_cross_validate,model_names=model_names)
result_cv.to_csv("cross_val_results_with_adasyn.csv",index=False)

Logistic Regression

1 of kfold 5
roc_auc_score 0.8728183618175244

2 of kfold 5
roc_auc_score 0.871502284110563

3 of kfold 5
roc_auc_score 0.8746842020390245

4 of kfold 5
roc_auc_score 0.8774284147939386

5 of kfold 5
roc_auc_score 0.8787449624765614
Random Forest

1 of kfold 5
roc_auc_score 0.9967082403495823

2 of kfold 5
roc_auc_score 0.9961593688992207

3 of kfold 5
roc_auc_score 0.9971471014982786

4 of kfold 5
roc_auc_score 0.9967082403495823

5 of kfold 5
roc_auc_score 0.9949536381672205
Ada boost

1 of kfold 5
roc_auc_score 0.9937452352149334

2 of kfold 5
roc_auc_score 0.9916601932058331

3 of kfold 5
roc_auc_score 0.9930869555332965

4 of kfold 5
roc_auc_score 0.9920994396795791

5 of kfold 5
roc_auc_score 0.9903444035625232
Bagging

1 of kfold 5
roc_auc_score 0.9969275384684446

2 of kfold 5
roc_auc_score 0.9958300363958775

3 of kfold 5
roc_auc_score 0.9971470533326475

4 of kfold 5
roc_auc_score 0.9968178653261979

5 of kfold 5
roc_auc_score 0.994733376714516
Naive-Baye

In [9]:

# predictions=pd.DataFrame(data_test_smote['INCIDENT_ID'])
def pred_on_full_data(Xtrain,ytrain,Xtest,models,models_name,i=0):
    for model in models:
        print(models_name[i])
        model.fit(Xtrain, ytrain)
        y_test_pred = model.predict(Xtest)
        predictions=pd.concat([test['INCIDENT_ID'],pd.DataFrame(y_test_pred,columns=['MULTIPLE_OFFENSE'])],1)
        a="submission_feature_engineered"+models_name[i]+"with_adasync"+".csv"
        predictions.to_csv(a,index=False)
        i+=1

#models=[lrc,rfc,adbc,bgc,gnb,knn,dtc,bgcl_lrc,gbc,xgbc,lgbc]
# ab_rfc,ab_dtc,ab_nbc,ab_lrc,gbc,ab_gbc,xgbc,ab_xgbc
models_final_for_csv=[lrc,rfc,adbc,bgc,gnb,knn,dtc,bgcl_lrc,
                          ab_rfc,ab_dtc,ab_nbc,ab_lrc,gbc,ab_gbc,xgbc,ab_xgbc,lgbc]
# models=
model_names=['LogisticRegression','RandomForest','Adaboost','Bagging',
                                'Naive-Bayes','KNN','Decistion_Tree',
                                'bagged_LR',"adaboost_rf","adaboost_dtc","adaboost_naive_bayes",
                                "adaboost_logistic_regression","gradient_boosting_trees",
             "adaboost_gbc","xgboost","adaboost_xgbc","light_gbm"]

#getting predictions on full data
pred_on_full_data(Xfull,yfull,data_test_smote,models=models_final_for_csv,models_name=model_names)

LogisticRegression
RandomForest
Adaboost
Bagging
Naive-Bayes
KNN
Decistion_Tree
bagged_LR
adaboost_rf
adaboost_dtc
adaboost_naive_bayes
adaboost_logistic_regression
gradient_boosting_trees
adaboost_gbc
xgboost
adaboost_xgbc
light_gbm
