## Handling Imbalanced Dataset with Machine Learning

### Tachniques to handle Imbalanced Dataset

## 1)Increasing Class Weights

In [2]:
# Weblink for Credit Card Fraud Detection Data set:
# https://www.kaggle.com/mlg-ulb/creditcardfraud

import pandas as pd
df = pd.read_csv(r'C:\Users\LENOVO\Desktop\Feature Engineering\creditcard.csv')
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [3]:
df.shape

(284807, 31)

In [4]:
# counting number of Null values in each feature / column
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
# Fraud Detection is a Classification problems. So lets focus on "Class" feature
df['Class']

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Name: Class, Length: 284807, dtype: int64

In [6]:
# counting the frequency of each type of value in the "Class" feature
# 0 = not fraud
# 1 = fraud
df['Class'].value_counts()

# As, the percentage of 1's is very less = 492/(492+284315) = 0.17 %
# so, the data is imbalanced

0    284315
1       492
Name: Class, dtype: int64

In [7]:
# Splitting the given dataset into 2 datasets
# Independent Features (X) = All features Except "Class"
# Dependent Features (Y) = "Class"
X = df.drop("Class",axis=1)
y = df.Class

#### Cross Validation like KFold and Hyperparameter Tuning

In [8]:
# Lets import all the necessary Librarires for "Logistic Regression Model"
from sklearn.linear_model import LogisticRegression

# Just "accuracy" is not enough. Other "metrices" should also be checked
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# For Cross Validation
from sklearn.model_selection import KFold
# For KFold
import numpy as np

# to perform GridSearchCV
from sklearn.model_selection import GridSearchCV

In [9]:
 #Cross Validation through Hyper-Parameter tuning
10.0**np.arange(-2,3)
# o/p = 10^-2 , 10^-1 , 10^0 , 10^1 , 10^2 

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [10]:
# Implementing "Logistic Regression Model"
log_class=LogisticRegression() #Intializing Logarithmic classifier
grid={'C':10.0**np.arange(-2,3),'penalty':['l1','l2']} #grid search parameters for Hyper-Parameter tuning
cv = KFold(n_splits=5, random_state=None,shuffle=False) # cv = Cross Validation parameters

In [11]:
# train -test split
from sklearn.model_selection import train_test_split

# reserving 70% dataset for training and remaining 30% for Testing purpose
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7) 

In [12]:
clf=GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro') #'f1_macro' is a scoring parameter
clf.fit(X_train,y_train) # implementing FIT on Training dataset
#  Fit Training  will take more time to execute ^|^

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\LENOVO\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\LENOVO\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\LENOVO\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.84980172   

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [13]:
#o/p code
y_pred=clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

# 0.9989 ~ 99.9 % accuracy # do not beleive the higher accuracy score of Imbalanced Dataset

[[85256    46]
 [   38   103]]
0.9990168884519505
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85302
           1       0.69      0.73      0.71       141

    accuracy                           1.00     85443
   macro avg       0.85      0.86      0.85     85443
weighted avg       1.00      1.00      1.00     85443



## 2nd Technique ~ RandomForestClassifier

In [14]:
X_train.shape

(199364, 30)

In [15]:
y_train.shape

(199364,)

In [16]:
y_train.value_counts()

0    199013
1       351
Name: Class, dtype: int64

In [17]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train,y_train)
#  Fit Training  will take more time to execute ^|^

RandomForestClassifier()

In [18]:
# copy pasted above o/p code
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

# (precision, recall, f1-score) improved from
# ( 0.69, 0.63, 0.66) in Logistic Regression to (0.87, 0.75, 0.80 ) in RandomForestClassifier

# (False +ve, False -ve) improved from
# ( 38, 54) in Logistic Regressi on to (15, 34 ) in RandomForestClassifier

[[85295     7]
 [   26   115]]
0.9996137776061234
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85302
           1       0.94      0.82      0.87       141

    accuracy                           1.00     85443
   macro avg       0.97      0.91      0.94     85443
weighted avg       1.00      1.00      1.00     85443



In [19]:
# to make the dataset more BALANCED
# chainging the "weight"age of each class 0 & 1
class_weight = dict({0:1,1:100})
# old classes 0's are changed to new 1's
# old classes 1's are changed to new 100's (given 100x more importance to old classes 1's )

In [20]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(class_weight=class_weight)
classifier.fit(X_train,y_train)
# Fit Training will take more time to execute ^|^

RandomForestClassifier(class_weight={0: 1, 1: 100})

In [21]:
# copy pasted above o/p code
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

# (precision, recall, f1-score) HARDLY improved from
# (0.87, 0.75, 0.80 ) in RandomForestClassifier to (0.92, 0.75, 0.82) with extra ClassWeights in RandomForestClassifier

# (False +ve, False -ve) BARELY improved from
# (15,34) in RandomForestClassifier TO (9,34) with extra ClassWeights  in RandomForestClassifier 

[[85296     6]
 [   28   113]]
0.999602073897218
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85302
           1       0.95      0.80      0.87       141

    accuracy                           1.00     85443
   macro avg       0.97      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



## Under Sampling

In [22]:
import pandas as pd
df = pd.read_csv(r'C:\Users\LENOVO\Desktop\Feature Engineering\creditcard.csv')
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [23]:
df['Class'].value_counts()

# As, the percentage of 1's is very less = 492/(492+284315) = 0.17 %
# so, the data is imbalanced

0    284315
1       492
Name: Class, dtype: int64

In [24]:
# Splitting the given dataset into 2 datasets
# Independent Features (X) = All features Except "Class"
# Dependent Features (Y) = "Class"
X = df.drop("Class",axis=1)
y = df.Class

In [25]:
# Lets import all the necessary Librarires for "Logistic Regression Model"
from sklearn.linear_model import LogisticRegression

# Just "accuracy" is not enough. Other "metrices" should also be checked
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# For Cross Validation
from sklearn.model_selection import KFold
# For KFold
import numpy as np

# to perform GridSearchCV
from sklearn.model_selection import GridSearchCV

In [26]:
# train -test split
from sklearn.model_selection import train_test_split

# reserving 70% dataset for training and remaining 30% for Testing purpose
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7) 

In [27]:
y_train.value_counts()

0    199017
1       347
Name: Class, dtype: int64

## Under Sampling

In [28]:
from collections import Counter
Counter(y_train)

Counter({0: 199017, 1: 347})

In [30]:
from collections import Counter
from imblearn.under_sampling import NearMiss
ns = NearMiss(0.8) # reduce the total parameters to 80%
X_train_ns,y_train_ns=ns.fit_resample(X_train,y_train)
print("The number of Classes before fit {}".format(Counter(y_train)))
print("The number of Classes after fit {}".format(Counter(y_train_ns)))

# 363/453 ~ 80%



The number of Classes before fit Counter({0: 199017, 1: 347})
The number of Classes after fit Counter({0: 433, 1: 347})


In [31]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)
# Fit Training will NOT take more time to execute, Because, dataset is very small

RandomForestClassifier()

In [32]:
# copy pasted above o/p code
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

# very poor (precision, f1-score) = (0.01, 0.02) & [False +ve = 15044]
# because of small dataset

[[65031 20267]
 [   10   135]]
0.7626838945261754
              precision    recall  f1-score   support

           0       1.00      0.76      0.87     85298
           1       0.01      0.93      0.01       145

    accuracy                           0.76     85443
   macro avg       0.50      0.85      0.44     85443
weighted avg       1.00      0.76      0.86     85443



## Under Sampling
Aim: Reduce the number of points of the Maximum labels

Disadvantage: POOR PERFORMANCE due to Loss of Data

May be applicable for small DataSet

## 3)Over sampling
Aim: To increase the number of classes with Minimum number of values

In [33]:
import pandas as pd
df = pd.read_csv(r'C:\Users\LENOVO\Desktop\Feature Engineering\creditcard.csv')

In [34]:
df['Class'].value_counts()

# As, the percentage of 1's is very less = 492/(492+284315) = 0.17 %
# so, the data is imbalanced

0    284315
1       492
Name: Class, dtype: int64

In [35]:
# Splitting the given dataset into 2 datasets
# Independent Features (X) = All features Except "Class"
# Dependent Features (Y) = "Class"
X = df.drop("Class",axis=1)
y = df.Class

In [36]:
# Lets import all the necessary Librarires for "Logistic Regression Model"
from sklearn.linear_model import LogisticRegression

# Just "accuracy" is not enough. Other "metrices" should also be checked
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# For Cross Validation
from sklearn.model_selection import KFold
# For KFold
import numpy as np

# to perform GridSearchCV
from sklearn.model_selection import GridSearchCV

In [37]:
# train -test split
from sklearn.model_selection import train_test_split

# reserving 70% dataset for training and remaining 30% for Testing purpose
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7) 

In [38]:
y_train.value_counts()

0    199032
1       332
Name: Class, dtype: int64

In [39]:
from collections import Counter
Counter(y_train)

Counter({0: 199032, 1: 332})

In [40]:
from imblearn.over_sampling import RandomOverSampler  # change "under_sampling" to "over_sampling"

In [41]:
os = RandomOverSampler(0.75) # reduce the total parameters to 75%
# "ns" is replaced with "os"
X_train_os,y_train_os=os.fit_resample(X_train,y_train)
print("The number of Classes before fit {}".format(Counter(y_train)))
print("The number of Classes after fit {}".format(Counter(y_train_os)))

# 149261/199015 ~ 75%



The number of Classes before fit Counter({0: 199032, 1: 332})
The number of Classes after fit Counter({0: 199032, 1: 149274})


In [42]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_os,y_train_os)
# Fit Training will take MUCH MORE time to execute, Because, dataset is increased

RandomForestClassifier()

In [43]:
# copy pasted above o/p code
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

# False +ve has been decreased to just 2


[[85277     6]
 [   37   123]]
0.9994967405170698
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85283
           1       0.95      0.77      0.85       160

    accuracy                           1.00     85443
   macro avg       0.98      0.88      0.93     85443
weighted avg       1.00      1.00      1.00     85443



#### Over Sampling is a Great Technique to Handle imbalanced Dataset

## SMOTETomek Technique to Handle imbalanced Dataset
Prononucation: SMOTE-Tomek
    
Aim: It create new points of the lowest numbers or similar to Nearest Neighbor

In [44]:
import pandas as pd
df = pd.read_csv(r'C:\Users\LENOVO\Desktop\Feature Engineering\creditcard.csv')
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [45]:
df['Class'].value_counts()

# As, the percentage of 1's is very less = 492/(492+284315) = 0.17 %
# so, the data is imbalanced

0    284315
1       492
Name: Class, dtype: int64

In [46]:
# Splitting the given dataset into 2 datasets
# Independent Features (X) = All features Except "Class"
# Dependent Features (Y) = "Class"
X = df.drop("Class",axis=1)
y = df.Class

In [47]:
# Lets import all the necessary Librarires for "Logistic Regression Model"
from sklearn.linear_model import LogisticRegression

# Just "accuracy" is not enough. Other "metrices" should also be checked
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

# For Cross Validation
from sklearn.model_selection import KFold
# For KFold
import numpy as np

# to perform GridSearchCV
from sklearn.model_selection import GridSearchCV

In [48]:
# train -test split
from sklearn.model_selection import train_test_split

# reserving 70% dataset for training and remaining 30% for Testing purpose
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7)

In [49]:
y_train.value_counts()

0    199026
1       338
Name: Class, dtype: int64

In [50]:
from collections import Counter
Counter(y_train)

Counter({0: 199026, 1: 338})

In [51]:
from imblearn.combine import SMOTETomek  # change "over_sampling" to "SMOTETomek"

In [52]:
os = SMOTETomek(0.5) # reduce the total parameters to 75%
X_train_os,y_train_os=os.fit_resample(X_train,y_train)
print("The number of Classes before fit {}".format(Counter(y_train)))
print("The number of Classes after fit {}".format(Counter(y_train_os)))

# Takes more time than the time of similar execution for Over Sampling
# because, "SMOTETomek Technique" create new points of the lowest numbers



The number of Classes before fit Counter({0: 199026, 1: 338})
The number of Classes after fit Counter({0: 198249, 1: 98736})


In [53]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_os,y_train_os)
# Fit Training will take MUCH MORE time to execute, Because, dataset is increased

RandomForestClassifier()

In [54]:
# copy pasted above o/p code
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))

# False -ve has

[[85274    15]
 [   28   126]]
0.9994967405170698
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85289
           1       0.89      0.82      0.85       154

    accuracy                           1.00     85443
   macro avg       0.95      0.91      0.93     85443
weighted avg       1.00      1.00      1.00     85443



## 5)Ensemble Technique


In [None]:
from imblearn.ensemble import EasyEnsembleClassifier  # change "SMOTETomek" to "EasyEnsembleClassifier"

In [None]:
easy = EasyEnsembleClassifier() # reduce the total parameters to 50%
easy.fit(X_train,y_train)

In [None]:
# copy pasted above o/p code
y_pred=easy.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred)) 
print(classification_report(y_test,y_pred))