# Ensembling

###  Importing Libraries and Packages.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.metrics import average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from collections import Counter
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
import pickle 

In [2]:
from keras.models import model_from_json

In [3]:
import os

In [4]:
from sklearn.metrics import roc_auc_score

In [5]:
import joblib

#### Data Preparation 

In [6]:
 df= pd.read_pickle('C:/Users/Vaishnavi M Shetty/Desktop/dfmod.pickle')

In [7]:
df1= df.copy()

In [8]:
df.shape[0]//5

56959

In [9]:
X = df.drop('Class', axis=1)
y = df['Class']

In [10]:
type(y)

pandas.core.series.Series

Preprocessing and finding the dataset using SMOTE

In [11]:
print('Original dataset shape %s' % Counter(y))
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
print('Resampled dataset shape %s' % Counter(y_smote))
# Slit into train and test datasets
X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X_smote, y_smote, test_size=0.25, random_state=42)

Original dataset shape Counter({0: 284306, 1: 492})
Resampled dataset shape Counter({0: 284306, 1: 284306})


In [12]:
result = pd.concat([X_smote, y_smote], axis=1)

In [13]:
type(result)

pandas.core.frame.DataFrame

In [14]:
result.shape

(568612, 35)

In [15]:
result.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'average', 'min', 'time_difference', 'cond', 'Class'],
      dtype='object')

In [16]:
result.shape[0]//5

113722

In [17]:
train_index = result.shape[0]//5
train = result.iloc[:train_index, :]
test = result.iloc[train_index:, :]

## Random Forest Classifier Model Building

In [18]:
X = train.drop('Class', axis=1)
y = train['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

In [20]:
pkl_filename = "rf_pickle_model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(rf, file)

In [22]:
rf_pred = rf.predict(test.drop('Class', axis=1))
rf_pred_proba = rf.predict_proba(test.drop('Class', axis=1))

In [23]:
rf_pred[:20]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [32]:
joblib.dump(rf,"rf.h5")

['rf.h5']

In [33]:
rf_pred_proba[:20]

array([[1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [0.99, 0.01],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ],
       [1.  , 0.  ]])

In [35]:
test['rf_pred'] = rf_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['rf_pred'] = rf_pred


In [34]:
test.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V25,V26,V27,V28,Amount,average,min,time_difference,cond,Class
113722,73181.0,0.106908,-1.894487,-0.321439,-1.278178,-1.064681,0.63837,-0.213243,-0.001613,0.687257,...,0.59023,-0.169477,-0.106514,0.345429,2.22142,-0.348875,1219.683333,0.575964,1.414228,0
113723,73181.0,-0.388105,0.619686,0.572499,0.053873,-0.106094,-0.049471,0.201806,0.407869,0.035047,...,0.011513,0.864662,0.55984,0.397911,-0.253321,-0.348802,1219.683333,0.575964,-5e-06,0
113724,73181.0,-0.467789,-0.523333,1.042506,-0.487787,-0.956146,-0.01418,0.513915,0.259888,1.078538,...,-2.260164,1.240338,0.111546,0.648391,0.685776,-0.348873,1219.683333,0.575964,1.414228,0
113725,73181.0,0.652088,-0.395515,0.400083,-0.343004,-0.688784,-0.190975,-0.487878,-0.050949,-0.496677,...,0.750602,-0.64286,0.094536,0.066986,-0.181317,-0.348783,1219.683333,0.575964,-5e-06,0
113726,73181.0,-0.186434,0.540203,0.874998,0.735254,0.15912,-0.158146,0.626683,-0.019745,-0.665101,...,-0.364965,-0.474,0.401069,0.44749,-0.182076,-0.348703,1219.683333,0.575964,-5e-06,0


## LightGBM Classifier 

In [36]:
%%time
err = [] 
pred_lgm = np.zeros((len(test), 2))
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
i = 1

for train_index, test_index in fold.split(X, y):
    x_train, x_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    m = LGBMClassifier(random_state=42, n_estimators=1000, learning_rate=0.1)
    m.fit(x_train, y_train , eval_set=[(x_val,y_val)], early_stopping_rounds=100, verbose=200)
    pred_y = m.predict(x_val)
    print(i, " err_lgm: ", metrics.accuracy_score(y_val,pred_y))
    err.append(metrics.roc_auc_score(y_val,pred_y))
    pred_lgm+= m.predict_proba(test.drop(['Class','rf_pred'], axis=1))
    i = i + 1
pred_lgm=pred_lgm/5    
sum(err)/5

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's binary_logloss: 0.0106744
1  err_lgm:  0.9992525829852715
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.00673512
2  err_lgm:  0.998988788744779
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.0107532
3  err_lgm:  0.999076679563841
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.00636944
4  err_lgm:  0.9994723883221949
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's binary_logloss: 0.0127837
5  err_lgm:  0.9987689060851214
Wall time: 16.1 s


0.9147016251536473

In [38]:
test['rf_pred'] = pred_lgm

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['rf_pred'] = pred_lgm


In [39]:
test['rf_pred_act'] = rf_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['rf_pred_act'] = rf_pred


In [40]:
test = test.rename(columns = {'rf_pred':'lgbm_preds'})

In [41]:
test.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V27,V28,Amount,average,min,time_difference,cond,Class,lgbm_preds,rf_pred_act
113722,73181.0,0.106908,-1.894487,-0.321439,-1.278178,-1.064681,0.63837,-0.213243,-0.001613,0.687257,...,-0.106514,0.345429,2.22142,-0.348875,1219.683333,0.575964,1.414228,0,0.998634,0
113723,73181.0,-0.388105,0.619686,0.572499,0.053873,-0.106094,-0.049471,0.201806,0.407869,0.035047,...,0.55984,0.397911,-0.253321,-0.348802,1219.683333,0.575964,-5e-06,0,0.998629,0
113724,73181.0,-0.467789,-0.523333,1.042506,-0.487787,-0.956146,-0.01418,0.513915,0.259888,1.078538,...,0.111546,0.648391,0.685776,-0.348873,1219.683333,0.575964,1.414228,0,0.998626,0
113725,73181.0,0.652088,-0.395515,0.400083,-0.343004,-0.688784,-0.190975,-0.487878,-0.050949,-0.496677,...,0.094536,0.066986,-0.181317,-0.348783,1219.683333,0.575964,-5e-06,0,0.998629,0
113726,73181.0,-0.186434,0.540203,0.874998,0.735254,0.15912,-0.158146,0.626683,-0.019745,-0.665101,...,0.401069,0.44749,-0.182076,-0.348703,1219.683333,0.575964,-5e-06,0,0.993827,0


## Artifical Neural Network   

In [6]:
import json
file = open(r'C:\\Users\\Vaishnavi M Shetty\\model.json')
loaded  = file.read()
file.close()

loaded_model = model_from_json(loaded)
loaded_model.load_weights('C:/Users/Vaishnavi M Shetty/model_nn.h5')

In [7]:
loaded_model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_7 (Dense)              (None, 64)                2176      
_________________________________________________________________
dense_8 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_9 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_10 (Dense)             (None, 8)                 136       
_________________________________________________________________
dense_11 (Dense)             (None, 4)                 36        
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 10        
_________________________________________________________________
dense_13 (Dense)             (None, 1)                

In [44]:
nn_preds = loaded_model.predict(test.drop(['Class', 'Time','lgbm_preds', 'rf_pred_act'], axis=1))

In [43]:
test.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'average', 'min', 'time_difference', 'cond', 'Class', 'lgbm_preds',
       'rf_pred_act'],
      dtype='object')

In [45]:
test['nn_preds'] = nn_preds

Computing the average of all outputs and comparing the results using AUC Score.

In [49]:
test['ensem_preds'] = (test['lgbm_preds'] + test['rf_pred_act'] +test['nn_preds'])/3

In [52]:
print(roc_auc_score(test['Class'], test['ensem_preds']))

0.9951368281380266


In [53]:
test['ensem_preds_2'] = (test['lgbm_preds'] + test['rf_pred_act'])/2
print(roc_auc_score(test['Class'], test['ensem_preds_2']))

0.8107744341405259


#### We can see that model performance was in the order , RandomForest > LightGBM > ANN > Ensemble of all three models > Ensemble of RandomForest & LightGBM .<br>
Therefore, we will be using RandomForest Classifier for the deployment purpose. <br> <br>
Thank You

In [23]:
X_train.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V25,V26,V27,V28,Amount,average,min,time_difference,cond,Class
42686,41202.0,-1.780891,-0.703775,-1.184517,-0.185344,-1.34104,0.708812,2.753989,-0.041338,-0.027419,...,-0.633728,1.0801,3.283546,1.076124,2.448068,-1.212863,686.7,-0.373369,1.414228,0
93214,64307.0,-1.984382,-0.002748,0.001628,-0.241325,-0.980204,-0.895087,-0.493627,0.953444,0.280985,...,-0.68086,1.215607,-1.405756,-3.767403,-0.308334,-0.598349,1071.783333,0.575964,-5e-06,0
68167,52867.0,-0.308809,0.762511,0.713326,-0.131029,0.198261,-0.362663,0.503152,0.060487,-0.482634,...,-0.138348,0.161567,0.867458,0.457954,-0.335441,-0.902855,881.116667,0.575964,-1.414238,0
68822,53159.0,-0.341985,0.545278,1.124169,0.07888,0.199977,-0.5024,0.708313,0.014043,-0.492734,...,1.220445,-0.815984,-0.074114,-0.079126,-0.288304,-0.894954,885.983333,-0.373369,-5e-06,0
68928,53197.0,0.608184,-0.139646,0.214061,0.277367,-0.346362,-0.043064,-0.352567,0.161392,0.382243,...,0.638688,0.797607,-0.101083,0.007063,-0.205505,-0.893635,886.616667,0.575964,-5e-06,0
39382,39814.0,0.751715,-0.631727,0.398393,-1.078862,-1.072375,-0.299617,-0.938324,-0.021786,-1.837223,...,0.293069,-0.84625,0.098162,0.053778,-0.278869,-1.253076,663.566667,0.575964,-5e-06,0
1277,993.0,-0.238372,0.435635,0.72063,-0.624916,0.513151,0.034749,0.48166,0.174575,-0.34822,...,-0.451024,-0.033645,0.10327,0.29156,-0.345316,-1.716561,16.55,-0.373369,-1.414238,0
108043,70737.0,0.490016,-0.110659,0.204885,0.956232,-0.188352,0.114556,0.006255,0.140412,0.158418,...,1.091017,-0.696028,0.033877,0.055228,0.014587,-0.417856,1178.95,-0.373369,1.414228,0
3586,3072.0,-0.25028,0.844607,1.47352,1.827799,0.167486,0.046675,0.68765,-0.214784,-1.093886,...,-0.912275,-0.252195,0.377221,-0.548897,-0.313412,-1.688321,51.2,-0.373369,-5e-06,0
108430,70898.0,0.437014,-0.521018,1.262791,1.500718,-0.973377,1.339203,-1.198078,0.64801,1.112026,...,0.265357,2.507746,0.027677,0.06329,0.00815,-0.413136,1181.633333,0.575964,1.414228,0


In [24]:
pd.set_option('display.max_columns', None)

In [25]:
X_train.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,average,min,time_difference,cond,Class
42686,41202.0,-1.780891,-0.703775,-1.184517,-0.185344,-1.34104,0.708812,2.753989,-0.041338,-0.027419,0.124634,1.507835,0.359801,-0.73776,0.695162,0.261914,0.175235,-0.252903,-0.923915,-1.030345,-1.550322,-0.586575,1.24662,2.839592,-0.372076,-0.633728,1.0801,3.283546,1.076124,2.448068,-1.212863,686.7,-0.373369,1.414228,0
93214,64307.0,-1.984382,-0.002748,0.001628,-0.241325,-0.980204,-0.895087,-0.493627,0.953444,0.280985,-0.306528,-0.144473,0.387138,-0.401963,0.73201,1.194659,0.8566,0.265979,-1.047247,-1.370995,-1.12276,-0.34857,-0.514958,0.424015,1.118028,-0.68086,1.215607,-1.405756,-3.767403,-0.308334,-0.598349,1071.783333,0.575964,-5e-06,0
68167,52867.0,-0.308809,0.762511,0.713326,-0.131029,0.198261,-0.362663,0.503152,0.060487,-0.482634,-0.045957,1.287634,0.830162,0.696622,-0.461395,0.184027,0.716157,-0.479002,0.448743,0.357059,0.393645,-0.350026,-0.842806,-0.091676,-0.067823,-0.138348,0.161567,0.867458,0.457954,-0.335441,-0.902855,881.116667,0.575964,-1.414238,0
68822,53159.0,-0.341985,0.545278,1.124169,0.07888,0.199977,-0.5024,0.708313,0.014043,-0.492734,-0.677595,-0.17114,-0.238445,-0.807091,0.403874,0.719446,-0.150834,-0.251263,-0.658835,-1.687483,-0.240794,0.153533,0.32,-0.417248,0.627606,1.220445,-0.815984,-0.074114,-0.079126,-0.288304,-0.894954,885.983333,-0.373369,-5e-06,0
68928,53197.0,0.608184,-0.139646,0.214061,0.277367,-0.346362,-0.043064,-0.352567,0.161392,0.382243,0.139979,0.236818,-0.521626,-1.942397,0.721197,0.826899,1.089829,-1.062974,1.088504,0.357707,-0.166943,0.00115,-0.233605,-0.19202,-0.865527,0.638688,0.797607,-0.101083,0.007063,-0.205505,-0.893635,886.616667,0.575964,-5e-06,0
39382,39814.0,0.751715,-0.631727,0.398393,-1.078862,-1.072375,-0.299617,-0.938324,-0.021786,-1.837223,1.444654,1.000009,-0.117572,0.782056,-0.268139,0.05288,0.105335,0.04544,0.580804,0.211001,-0.355028,-0.452715,-0.8805,0.188123,-0.087377,0.293069,-0.84625,0.098162,0.053778,-0.278869,-1.253076,663.566667,0.575964,-5e-06,0
1277,993.0,-0.238372,0.435635,0.72063,-0.624916,0.513151,0.034749,0.48166,0.174575,-0.34822,-0.473139,-0.309106,-0.672911,-1.539129,0.617873,-0.043877,1.297394,-1.555663,0.862901,0.171587,-0.232523,-0.291105,-1.135163,-0.30928,-1.813886,-0.451024,-0.033645,0.10327,0.29156,-0.345316,-1.716561,16.55,-0.373369,-1.414238,0
108043,70737.0,0.490016,-0.110659,0.204885,0.956232,-0.188352,0.114556,0.006255,0.140412,0.158418,0.031954,0.969005,0.685727,-1.213201,0.502332,-0.7355,-0.533687,0.084587,-0.407113,-0.11917,-0.095126,-0.015263,-0.097915,-0.218358,0.025415,1.091017,-0.696028,0.033877,0.055228,0.014587,-0.417856,1178.95,-0.373369,1.414228,0
3586,3072.0,-0.25028,0.844607,1.47352,1.827799,0.167486,0.046675,0.68765,-0.214784,-1.093886,1.612068,1.541961,0.041508,-0.249555,0.031509,0.731228,-0.022679,-0.473283,0.334325,1.28159,0.696155,-0.360821,-0.437242,-0.066754,0.797088,-0.912275,-0.252195,0.377221,-0.548897,-0.313412,-1.688321,51.2,-0.373369,-5e-06,0
108430,70898.0,0.437014,-0.521018,1.262791,1.500718,-0.973377,1.339203,-1.198078,0.64801,1.112026,0.215775,-0.136715,0.500559,-1.316163,-0.919092,-2.303953,0.26744,0.212729,0.017688,-0.264037,-0.054232,0.166245,0.786942,-0.250481,-0.357008,0.265357,2.507746,0.027677,0.06329,0.00815,-0.413136,1181.633333,0.575964,1.414228,0


In [27]:
arr1 = np.array([['arr']])

arr2 = np.array([['cmmc', 'sdvf']])

arr = np.concatenate((arr1, arr2), axis=1)

print(arr)


[['arr' 'cmmc' 'sdvf']]
