In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings (action= 'ignore')
plt.style.use(['seaborn-bright' , 'dark_background'])

In [37]:
data = pd.read_csv('creditcard.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [38]:
data.shape

(284807, 31)

In [39]:
data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

* Class seems to be a very imbalanced 

In [40]:
X = data.drop('Class' , axis = 1)
Y = data['Class']

In [41]:
from sklearn.model_selection import train_test_split as tts
x_train , x_test , y_train , y_test = tts(X , Y , train_size= 0.7)

### Cross Validation like KFold and Hyperparamter Tuning 

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [43]:
10.0 ** np.arange(-2 , 3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [44]:
classifier = LogisticRegression()
grid = {"C" : 10.0 ** np.arange(-2 , 3),
        "penalty" : ['l1' , 'l2']}
cv = KFold(n_splits= 5 , shuffle= False , random_state= None)

In [45]:
clf = GridSearchCV(classifier , grid , cv = cv , n_jobs = -1 , scoring= 'f1_macro')
clf.fit(x_train , y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [46]:
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test , y_pred))
print(accuracy_score(y_test , y_pred))
print(classification_report(y_test , y_pred))

[[85261    34]
 [   45   103]]
0.9990754069964772
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.75      0.70      0.72       148

    accuracy                           1.00     85443
   macro avg       0.88      0.85      0.86     85443
weighted avg       1.00      1.00      1.00     85443



In [47]:
class_weight = dict({0 : 1, 1 : 100})

In [48]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(class_weight= class_weight)
classifier.fit(x_train , y_train) 

RandomForestClassifier(class_weight={0: 1, 1: 100})

In [49]:
y_pred = classifier.predict(x_test)
print(confusion_matrix(y_test , y_pred))
print(accuracy_score(y_test , y_pred))
print(classification_report(y_test , y_pred))

[[85288     7]
 [   33   115]]
0.9995318516437859
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.94      0.78      0.85       148

    accuracy                           1.00     85443
   macro avg       0.97      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443



### Under Sampling

What Under_Sampling does is?
* Reduce the points of the maximum labels

Disadvantages
* Loss of data 
* Generally usable when dataset is small

In [50]:
from collections import Counter
Counter(y_train)

Counter({0: 199020, 1: 344})

In [51]:
from collections import Counter
from imblearn.under_sampling import NearMiss
ns = NearMiss(0.8)
x_train_ns , y_train_ns = ns.fit_resample(x_train , y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 199020, 1: 344})
The number of classes after fit Counter({0: 430, 1: 344})


In [52]:
0.8 * 433

346.40000000000003

In [53]:
from sklearn.ensemble import RandomForestClassifier
classifier_ns = RandomForestClassifier()
classifier_ns.fit(x_train_ns , y_train_ns) 

RandomForestClassifier()

In [54]:
y_pred_ns = classifier_ns.predict(x_test)
print(confusion_matrix(y_test , y_pred_ns))
print(accuracy_score(y_test , y_pred_ns))
print(classification_report(y_test , y_pred_ns))

[[59720 25575]
 [    7   141]]
0.7005957187832824
              precision    recall  f1-score   support

           0       1.00      0.70      0.82     85295
           1       0.01      0.95      0.01       148

    accuracy                           0.70     85443
   macro avg       0.50      0.83      0.42     85443
weighted avg       1.00      0.70      0.82     85443



### Over Sampling

In [63]:
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler(0.75)
x_train_os , y_train_os = os.fit_resample(x_train , y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_os)))

The number of classes before fit Counter({0: 199020, 1: 344})
The number of classes after fit Counter({0: 199020, 1: 149265})


In [64]:
0.75 * 199020

149265.0

In [65]:
from sklearn.ensemble import RandomForestClassifier
classifier_os = RandomForestClassifier()
classifier_os.fit(x_train_os , y_train_os) 

RandomForestClassifier()

In [66]:
y_pred_os = classifier_os.predict(x_test)
print(confusion_matrix(y_test , y_pred_os))
print(accuracy_score(y_test , y_pred_os))
print(classification_report(y_test , y_pred_os))

[[85287     8]
 [   32   116]]
0.9995318516437859
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85295
           1       0.94      0.78      0.85       148

    accuracy                           1.00     85443
   macro avg       0.97      0.89      0.93     85443
weighted avg       1.00      1.00      1.00     85443



### SMOTETomek

* SMOT : Synthetic Minority oversampling Technique

In [67]:
from imblearn.combine import SMOTETomek

In [None]:
ST = SMOTETomek(0.5)
x_train_ST , y_train_ST = ST.fit_resample(x_train , y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ST)))

In [None]:
from sklearn.ensemble import RandomForestClassifier
classifier_os = RandomForestClassifier()
classifier_os.fit(x_train_ST , y_train_ST) 

In [None]:
y_pred_ST = classifier_os.predict(x_test)
print(confusion_matrix(y_test , y_pred_ST))
print(accuracy_score(y_test , y_pred_ST))
print(classification_report(y_test , y_pred_ST))

### Ensemble Techniques

In [70]:
from imblearn.ensemble import EasyEnsembleClassifier

In [71]:
easy = EasyEnsembleClassifier()
easy.fit(x_train , y_train)

EasyEnsembleClassifier()

In [72]:
y_pred = easy.predict(x_test)
print(confusion_matrix(y_test , y_pred))
print(accuracy_score(y_test , y_pred))
print(classification_report(y_test , y_pred))

[[81421  3874]
 [    9   139]]
0.9545544983205178
              precision    recall  f1-score   support

           0       1.00      0.95      0.98     85295
           1       0.03      0.94      0.07       148

    accuracy                           0.95     85443
   macro avg       0.52      0.95      0.52     85443
weighted avg       1.00      0.95      0.98     85443

