In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
import scipy.stats as st
import sklearn.metrics as metrics
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import r2_score,roc_auc_score,recall_score,classification_report,mean_squared_error,accuracy_score,confusion_matrix

In [3]:
# Read the feature-engineered training and test tables from the working directory.
# The training file is read first, then the test file.
data_train_feature_engineered, data_test_feature_engineered = (
    pd.read_csv(csv_name)
    for csv_name in (
        "cleaned_data_feature_engineered.csv",
        "cleaned_data_test_feature_engineered.csv",
    )
)


In [4]:
# Hold out 20% of the labelled rows for validation; the seed is fixed so the
# split is identical on every run.
train,val=train_test_split(data_train_feature_engineered,test_size=.20,random_state=42)

# Columns that are never used as model inputs: the row identifier and the raw
# date, plus the target itself for the labelled frames.
TARGET_COL="MULTIPLE_OFFENSE"
NON_FEATURE_COLS=["INCIDENT_ID","DATE"]

# Full labelled data, used to refit the chosen model before predicting on the
# test set. This must come from the complete training frame, not from `train`:
# building it from `train` would silently discard the validation rows and make
# Xfull/yfull mere copies of Xtrain/ytrain.
Xfull=data_train_feature_engineered.drop(NON_FEATURE_COLS+[TARGET_COL],axis=1)
yfull=data_train_feature_engineered[TARGET_COL]

# The unlabelled test frame has no target column to drop.
Xtest=data_test_feature_engineered.drop(NON_FEATURE_COLS,axis=1)

# Training portion of the split.
Xtrain=train.drop(NON_FEATURE_COLS+[TARGET_COL],axis=1)
ytrain=train[TARGET_COL]
print(Xtrain.shape,ytrain.shape)

# Validation portion of the split.
Xval=val.drop(NON_FEATURE_COLS+[TARGET_COL],axis=1)
yval=val[TARGET_COL]
print(Xval.shape,yval.shape)

(19083, 18) (19083,)
(4771, 18) (4771,)


In [5]:
# Candidate classifiers for the comparison below.
#
# Every estimator that accepts `random_state` is seeded with the same value so
# that re-running the notebook reproduces the same scores; previously only the
# wrapper ensembles were seeded, so the stand-alone tree/boosting models gave
# different numbers on each run.
#
# NOTE(review): `base_estimator` is the keyword used by the scikit-learn release
# this notebook was written against; newer releases rename it to `estimator`.
# Switch the keyword when upgrading scikit-learn.

# --- stand-alone learners ---------------------------------------------------
lrc = LogisticRegression(class_weight="balanced",random_state=42)
rfc=RandomForestClassifier(class_weight="balanced",random_state=42)
adbc=AdaBoostClassifier(random_state=42)
bgc=BaggingClassifier(random_state=42)
gnb = GaussianNB()            # no random component to seed
knn=KNeighborsClassifier()    # no random component to seed
dtc = DecisionTreeClassifier(random_state=42)

# --- ensembles wrapped around the learners above ----------------------------
bgcl_lrc = BaggingClassifier(base_estimator=lrc, random_state=42)
ab_rfc = AdaBoostClassifier(base_estimator=rfc,random_state=42)
ab_dtc = AdaBoostClassifier(base_estimator=dtc,random_state=42)
ab_nbc=  AdaBoostClassifier(base_estimator=gnb,random_state=42)
ab_lrc=  AdaBoostClassifier(base_estimator=lrc,random_state=42)
gbc=GradientBoostingClassifier(random_state=42)
ab_gbc=  AdaBoostClassifier(base_estimator=gbc,random_state=42)
xgbc=XGBClassifier(random_state=42)
ab_xgbc=  AdaBoostClassifier(base_estimator=xgbc,random_state=42)

In [6]:


# Each estimator is paired with the label shown in the results table, so the
# labels cannot drift out of step with the list of models being evaluated.
named_models=[('Logistic Regression',lrc),('Random Forest',rfc),('Ada boost',adbc),('Bagging',bgc),
              ('Naive-Bayes',gnb),('KNN',knn),('Decistion Tree',dtc),
              ('bagged LR',bgcl_lrc),("adaboost rf",ab_rfc),("adaboost dtc",ab_dtc),("adaboost naive bayes",ab_nbc),
              ("adaboost logistic regression",ab_lrc),("gradient boosting trees",gbc),("adaboost gbc",ab_gbc),
              ("xgboost",xgbc),("adaboost xgbc",ab_xgbc)]
models=[model for _,model in named_models]


def ensemble_for_train_val(X_train,X_test, y_train, y_test, estimators=None):
    """Fit each estimator on the training split and score it on the held-out split.

    Parameters
    ----------
    X_train, y_train : features and labels the estimators are fitted on.
    X_test, y_test : features and labels the estimators are evaluated on.
    estimators : iterable of classifiers, optional
        Objects exposing ``fit``, ``predict``, ``predict_proba`` and ``score``.
        When omitted, the module-level ``models`` list is used.

    Returns
    -------
    tuple of five lists ``(sctr, scte, auc, ps, rs)``, one entry per estimator
    in iteration order: training accuracy, held-out accuracy, ROC AUC,
    precision and recall on the held-out split.

    The lists are created fresh on every call, so calling the function more
    than once does not accumulate results from earlier calls.
    """
    if estimators is None:
        estimators=models

    sctr,scte,auc,ps,rs=[],[],[],[],[]
    for model in estimators:
        model.fit(X_train, y_train)
        y_test_pred = model.predict(X_test)
        # Second probability column is used as the score for ROC AUC
        # (assumes a binary target whose positive class is listed last - TODO confirm).
        y_test_proba = model.predict_proba(X_test)[:,1]

        sctr.append(model.score(X_train,y_train))
        # Accuracy computed from the predictions already made above, instead of
        # asking the model to predict the held-out split a second time.
        scte.append(metrics.accuracy_score(y_test,y_test_pred))
        ps.append(metrics.precision_score(y_test,y_test_pred))
        rs.append(metrics.recall_score(y_test,y_test_pred))
        auc.append(metrics.roc_auc_score(y_test,y_test_proba))
    return sctr,scte,auc,ps,rs


#checking the performance on train and validation
sctr,scte,auc,ps,rs=ensemble_for_train_val(Xtrain,Xval, ytrain, yval, models)
ensemble=pd.DataFrame({'names':[name for name,_ in named_models],
                       'auc_score':auc,'training':sctr,'testing':scte,'precision':ps,'recall':rs})
# NOTE(review): ranking by recall alone puts a model that labels every row as
# positive at the top (recall 1.0 with AUC 0.5) - read `recall` together with
# `auc_score` and `precision` before picking a winner.
ensemble=ensemble.sort_values(by='recall',ascending=False).reset_index(drop=True)
ensemble

Unnamed: 0,names,auc_score,training,testing,precision,recall
0,adaboost xgbc,0.5,0.9542,0.959338,0.959338,1.0
1,gradient boosting trees,0.999984,0.999371,0.999371,0.999563,0.999782
2,adaboost gbc,0.999999,1.0,0.99979,1.0,0.999782
3,xgboost,0.999955,0.998323,0.998742,0.999563,0.999126
4,Decistion Tree,0.999454,1.0,0.998952,1.0,0.998908
5,adaboost dtc,0.999454,1.0,0.998952,1.0,0.998908
6,Ada boost,0.9997,0.996175,0.996437,0.997817,0.998471
7,Bagging,0.999822,0.999843,0.998323,0.999781,0.998471
8,Random Forest,0.999184,1.0,0.992454,0.993909,0.998252
9,adaboost rf,0.999401,1.0,0.993712,0.995208,0.998252


In [16]:
# Model(s) to refit and use for the submission file.
# NOTE(review): in the validation table above, "adaboost xgbc" shows AUC 0.5 and
# a precision equal to its accuracy with recall 1.0, which is what a model that
# predicts the positive class for every row would produce - confirm this is the
# intended submission model.
models=[ab_xgbc]

predictions=pd.DataFrame(data_test_feature_engineered['INCIDENT_ID'])
def pred_on_full_data(Xtrain,ytrain,Xtest,models):
    """Refit each model and write its test-set predictions to a CSV file.

    Parameters
    ----------
    Xtrain, ytrain : features and labels each model is fitted on.
    Xtest : features to predict; expected to have one row per row of the
        module-level ``data_test_feature_engineered`` frame, in the same order.
    models : iterable of classifiers exposing ``fit`` and ``predict``.

    Side effects
    ------------
    Prints each model's class name and writes
    ``submission_feature_engineered<ClassName>.csv`` with the columns
    ``INCIDENT_ID`` and ``MULTIPLE_OFFENSE``. Two models of the same class share
    a file name, so the later one overwrites the earlier one.

    Returns
    -------
    None
    """
    # Raw id values (no index) so that pairing with the prediction array is
    # purely positional and cannot be disturbed by a non-default index.
    incident_ids=data_test_feature_engineered['INCIDENT_ID'].values
    for model in models:
        print(model.__class__.__name__)
        model.fit(Xtrain, ytrain)
        y_test_pred = model.predict(Xtest)
        # Built directly from the two columns; the earlier
        # `pd.concat([...], 1)` passed `axis` positionally, which recent pandas
        # releases no longer accept.
        submission=pd.DataFrame({'INCIDENT_ID':incident_ids,'MULTIPLE_OFFENSE':y_test_pred})
        a="submission_feature_engineered"+model.__class__.__name__+".csv"
        submission.to_csv(a,index=False)


#getting predictions on full data
pred_on_full_data(Xfull,yfull,Xtest,models)

AdaBoostClassifier
