In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import sklearn
from sklearn.model_selection import train_test_split, KFold,cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score,precision_score,accuracy_score,classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from imblearn import over_sampling

from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


# Exploratory Data Analysis

In [None]:
# loading data

df =  pd.read_csv('~/Desktop/creditcard.csv')

In [None]:
# checking shape of data and exploring data

print(df.shape)
df.head()

In [None]:
df.info()

In [None]:
classes = round(df['Class'].value_counts()*100/len(df.index),2)
classes

The classes are highly imbalanced: only 0.17% of the records belong to class 1.

In [None]:
# bar plot of classes

sns.barplot(x=classes.index, y = classes.values)
plt.show()

In [None]:
# boxplot of time for both classes

sns.boxplot(x = 'Class', y = 'Time', data = df)
plt.show()

The Time distribution is almost the same for both classes, with the class 1 median being slightly lower.

In [None]:
# boxplot of amount for both classes

sns.boxplot(x = 'Class', y = 'Amount', data = df[df['Amount'] < 200])
plt.show()

In [None]:
# Drop the 'Time' column before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once 'Time' is gone.
df = df.drop(columns='Time', errors='ignore')

### Splitting the data into train & test data

In [None]:
# Select the features by name rather than by position, so the result does not
# silently depend on 'Class' being the last column of the CSV.
X = df.drop(columns='Class')
y = df['Class']

In [None]:
# Stratified 70/30 split. The fixed seed (0, the same value used for the
# resamplers further down) makes the split, and every score derived from it,
# reproducible across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size = 0.7,stratify=y,random_state=0)

In [None]:
# Per-feature density curves for the two classes on the training split.
plt.figure(figsize=[20,150], dpi=90)

# Named `feature_names` (not `vars`) so the builtin vars() is not shadowed.
feature_names = list(X_train.columns)

# The class subsets do not change between features, so build them once.
X_train_pos = X_train[y_train==1]
X_train_neg = X_train[y_train==0]

# Subplot positions are 1-based, hence start=1.
for position, feature in enumerate(feature_names, start=1):
  plt.subplot(40,3,position)
  sns.distplot(X_train_pos[feature],hist=False,label = "Target=1")
  sns.distplot(X_train_neg[feature], hist = False, label = "Target=0")

plt.show()

In [None]:
# Density of 'Amount' (restricted to values below 200) for each class,
# drawn in the order class 0 then class 1.
small_amount = X_train['Amount'] < 200
for target in (0, 1):
  sns.distplot(X_train.loc[small_amount & (y_train == target), 'Amount'],
               hist=False,
               label=f"Target={target}")
plt.show()

In [None]:
pt = PowerTransformer()

In [None]:
# Fit the power transform on the training 'Amount' column only and overwrite
# that column with the transformed values. reshape(-1,1) turns the 1-D column
# into the single-feature 2-D array passed to the transformer.
X_train['Amount'] = pt.fit_transform(X_train['Amount'].values.reshape(-1,1))
# Alternative that was tried and left disabled: a plain log transform.
# X_train['Amount'] = np.log(X_train['Amount'])
# Apply the already-fitted transform to the test split (transform, not
# fit_transform, so nothing is re-estimated from the test data).
# NOTE(review): this cell overwrites 'Amount' in place, so running it twice
# transforms the column twice.
X_test['Amount'] = pt.transform(X_test['Amount'].values.reshape(-1,1))

In [None]:
sns.distplot(X_train['Amount'],hist=True)
plt.show()

In [None]:
scores_df = pd.DataFrame()

def update_score(model_name,X_test,y_test,estimator):
  """Score a fitted classifier on a hold-out set and return a one-row summary.

  Parameters
  ----------
  model_name : label stored in the 'model_name' column of the result.
  X_test : feature matrix passed to ``estimator.predict`` / ``predict_proba``.
  y_test : true labels the predictions are compared against.
  estimator : fitted classifier exposing ``predict`` and ``predict_proba``.

  Returns
  -------
  pandas.DataFrame
      A single row with the columns 'model_name', 'recall_score',
      'precision_score', 'auc_score' and 'accuracy', each metric rounded to
      three decimals.
  """
  y_pred = estimator.predict(X_test)
  # AUC is computed from column 1 of predict_proba, not from the hard labels.
  y_prob_positive = estimator.predict_proba(X_test)[:,1]

  # Dict insertion order fixes the column order of the returned frame.
  metrics = {
      'model_name': model_name,
      'recall_score': round(recall_score(y_test,y_pred),3),
      'precision_score': round(precision_score(y_test,y_pred),3),
      'auc_score': round(roc_auc_score(y_test,y_prob_positive),3),
      'accuracy': round(accuracy_score(y_test,y_pred),3),
  }
  return pd.DataFrame([metrics])

# Logistic regression model

In [None]:
# Tune the regularisation constant C of a logistic regression with a 3-fold
# grid search on the (imbalanced) training split. Recall and precision are
# both recorded; the final model is refit using the best recall.

folds = 3
lr_grid_params = {'C': [0.1, 0.2, 0.3, 0.5, 1, 2, 5]}
lr_classifier = LogisticRegression()

lr_grid_search = GridSearchCV(
    lr_classifier,
    lr_grid_params,
    scoring=['recall', 'precision'],
    refit='recall',
    cv=folds,
    return_train_score=True,
    n_jobs=-1,
)

lr_grid_search.fit(X_train, y_train)

In [None]:
lr_grid_search_results = pd.DataFrame(lr_grid_search.cv_results_)
lr_grid_search_results.head()

In [None]:
# Train vs. cross-validated test score as a function of C, one panel per
# metric. The train precision curve was previously mislabelled 'test_score',
# which made the two lines in the right-hand legend indistinguishable.
fig, axes = plt.subplots(1, 2, figsize=[10,5], dpi = 80)

for ax, metric in zip(axes, ['recall', 'precision']):
  ax.plot(lr_grid_search_results['param_C'], lr_grid_search_results[f'mean_test_{metric}'], label = 'test_score')
  ax.plot(lr_grid_search_results['param_C'], lr_grid_search_results[f'mean_train_{metric}'], label = 'train_score')
  ax.set_xlabel('C')
  ax.set_ylabel(f'{metric.capitalize()} score')
  ax.legend()

plt.show()

In [None]:
# Model refit on the full training split with the best-recall C.
lr_best_classifier = lr_grid_search.best_estimator_
# get_params is a method: call it so the parameter dict is printed rather
# than the bound-method repr.
print(lr_best_classifier.get_params())

In [None]:
y_pred = lr_best_classifier.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# ignore_index=True gives the score table a clean 0..n-1 index instead of
# every row carrying index 0.
scores_df = pd.concat([scores_df, update_score('logistic_regression_imbalanced',X_test,y_test,lr_best_classifier)], ignore_index=True)

In [None]:
scores_df

# Decision Tree Model

In [None]:
# 3-fold grid search over the decision-tree size/purity parameters on the
# (imbalanced) training split; the final model is refit using the best recall.
dt_grid_params = {
    'min_samples_split': range(100,500,50),
    'max_depth':range(5,15,5),
    'min_samples_leaf': range(50,150,25),
    'criterion':['entropy','gini']
}

folds = 3

# Seeded so that tie-breaking between equally good splits, and therefore the
# reported scores, are reproducible between runs.
dt_classifier = DecisionTreeClassifier(random_state=0)

dt_grid_search = GridSearchCV(estimator=dt_classifier,
                     param_grid=dt_grid_params,
                     cv = folds,
                     scoring = ['recall','precision'],
                     return_train_score = True,
                     refit = 'recall',
                     n_jobs = -1)

dt_grid_search.fit(X_train,y_train)

In [None]:
dt_grid_search_results = pd.DataFrame(dt_grid_search.cv_results_)
dt_grid_search_results.head()

In [None]:
# Tree refit on the full training split with the best-recall parameters.
dt_best_classifier = dt_grid_search.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(dt_best_classifier.get_params())

In [None]:
y_pred = dt_best_classifier.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('Decision_tree_imbalanced',X_test,y_test,dt_best_classifier)], ignore_index=True)

In [None]:
scores_df

# Random Forest Classifier

In [None]:
# Randomized 3-fold search over random-forest parameters on the (imbalanced)
# training split; the final model is refit using the best recall.
rf_grid_params = {
    'n_estimators': range(50,500,100),
    'max_depth':range(5,15,5),
    'min_samples_leaf': range(25,150,50),
    # 'min_samples_split':range(50,200,50),
    'criterion':['entropy','gini']
}

folds = 3

# Both the forest (bootstrap / feature sampling) and the randomized search
# (which parameter combinations get sampled) are stochastic, so both are
# seeded to make the selected model and its scores reproducible.
rf_classifier = RandomForestClassifier(random_state=0)

rf_grid_search = RandomizedSearchCV(estimator=rf_classifier,
                     param_distributions=rf_grid_params,
                     cv = folds,
                     scoring = ['recall','precision'],
                     return_train_score = True,
                     refit = 'recall',
                     n_jobs = -1,
                     random_state = 0,
                     verbose=1)

rf_grid_search.fit(X_train,y_train)

In [None]:
rf_grid_search_results = pd.DataFrame(rf_grid_search.cv_results_)
rf_grid_search_results.head()

In [None]:
# Forest refit on the full training split with the best-recall parameters.
rf_best_classifier = rf_grid_search.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(rf_best_classifier.get_params())

In [None]:
y_pred = rf_best_classifier.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('Random_forest_imbalanced',X_test,y_test,rf_best_classifier)], ignore_index=True)

In [None]:
scores_df

# Applying SMOTE technique to balance the data set

In [None]:
sm = over_sampling.SMOTE(random_state=0)

In [None]:
# Oversample the minority class of the training split with SMOTE.
sm = over_sampling.SMOTE(random_state=0)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)
# Artificial minority samples and corresponding minority labels from SMOTE are appended
# below X_train and y_train respectively
# So to exclusively get the artificial minority samples from SMOTE, we do
# (np.asarray: fit_resample may hand back a DataFrame when given one, and the
# positional [rows, col] indexing used for plotting only works on an ndarray)
X_train_smote_1 = np.asarray(X_train_smote)[X_train.shape[0]:]

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]

# Only as many synthetic / majority points as there are real class-1 points
# are drawn, so the three panels are visually comparable.
n_minority = X_train_1.shape[0]

# Size this figure locally instead of changing the global rcParams default.
fig = plt.figure(figsize=(20, 20))

# Panels use the first two feature columns as x and y.
plt.subplot(3, 1, 1)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.legend()

plt.subplot(3, 1, 2)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_smote_1[:n_minority, 0], X_train_smote_1[:n_minority, 1],
            label='Artificial SMOTE Class-1 Examples')
plt.legend()

plt.subplot(3, 1, 3)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:n_minority, 0], X_train_0[:n_minority, 1], label='Actual Class-0 Examples')
plt.legend()

plt.show()

# Logistic Regression model on balanced dataset

In [None]:
# Same C sweep as for the imbalanced data, but fitted on the SMOTE-balanced
# training set. Recall and precision are recorded over 3 folds and the final
# model is refit using the best recall.

folds = 3
lr_grid_params_sm = {'C': [0.1, 0.2, 0.3, 0.5, 1, 2, 5]}
lr_classifier_sm = LogisticRegression()

lr_grid_search_sm = GridSearchCV(
    lr_classifier_sm,
    lr_grid_params_sm,
    scoring=['recall', 'precision'],
    refit='recall',
    cv=folds,
    return_train_score=True,
    n_jobs=-1,
)

lr_grid_search_sm.fit(X_train_smote, y_train_smote)

In [None]:
lr_grid_search_results_sm = pd.DataFrame(lr_grid_search_sm.cv_results_)
lr_grid_search_results_sm.head()

In [None]:
# Model refit on the SMOTE-balanced training set with the best-recall C.
lr_best_classifier_sm = lr_grid_search_sm.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(lr_best_classifier_sm.get_params())

In [None]:
y_pred = lr_best_classifier_sm.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('logistic_regression_smote_balanced',X_test,y_test,lr_best_classifier_sm)], ignore_index=True)

In [None]:
scores_df

In [None]:
confusion_matrix(y_test,y_pred)

# Decision Tree model

In [None]:
# 3-fold grid search over the decision-tree parameters on the SMOTE-balanced
# training set; the final model is refit using the best recall.
dt_grid_params_sm = {
    'min_samples_split': range(100,500,50),
    'max_depth':range(5,15,5),
    'min_samples_leaf': range(50,150,25),
    'criterion':['entropy','gini']
}

folds = 3

# Seeded so the fitted tree and its scores are reproducible between runs.
dt_classifier_sm = DecisionTreeClassifier(random_state=0)

dt_grid_search_sm = GridSearchCV(estimator=dt_classifier_sm,
                     param_grid=dt_grid_params_sm,
                     cv = folds,
                     scoring = ['recall','precision'],
                     return_train_score = True,
                     refit = 'recall',
                     n_jobs = -1)

dt_grid_search_sm.fit(X_train_smote,y_train_smote)

In [None]:
dt_grid_search_results_sm = pd.DataFrame(dt_grid_search_sm.cv_results_)
dt_grid_search_results_sm.head()

In [None]:
# Tree refit on the SMOTE-balanced training set with the best-recall parameters.
dt_best_classifier_sm = dt_grid_search_sm.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(dt_best_classifier_sm.get_params())

In [None]:
# This model was trained on the SMOTE-balanced data, so it is labelled
# '..._smote_balanced' like the other SMOTE rows (it used to say 'imbalanced').
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('decision_tree_smote_balanced',X_test,y_test,dt_best_classifier_sm)], ignore_index=True)

In [None]:
scores_df

# XGBoost

In [None]:
# 3-fold grid search over the number of boosting rounds and the tree depth of
# an XGBoost classifier, fitted on the SMOTE-balanced training set. Recall and
# precision are recorded; the final model is refit using the best recall.
folds = 3

xgb_grid_params_sm = dict(
    n_estimators=range(10, 100, 20),
    max_depth=range(3, 10, 3),
)

xgb_classifier_sm = XGBClassifier()

xgb_grid_search_sm = GridSearchCV(
    xgb_classifier_sm,
    xgb_grid_params_sm,
    scoring=['recall', 'precision'],
    refit='recall',
    cv=folds,
    return_train_score=True,
    n_jobs=10,
    verbose=3,
)

xgb_grid_search_sm.fit(X_train_smote, y_train_smote)

In [None]:
xgb_grid_search_results_sm = pd.DataFrame(xgb_grid_search_sm.cv_results_)
xgb_grid_search_results_sm.head()

In [None]:
# Booster refit on the SMOTE-balanced training set with the best-recall parameters.
xgb_best_classifier_sm = xgb_grid_search_sm.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(xgb_best_classifier_sm.get_params())

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('xgboost_smote_balanced',X_test.values,y_test,xgb_best_classifier_sm)], ignore_index=True)

In [None]:
scores_df.to_csv('score.csv')

In [None]:
scores_df

# Applying ADASYN balancing

In [None]:
ada = over_sampling.ADASYN(random_state=0)
X_train_adasyn, y_train_adasyn = ada.fit_resample(X_train, y_train)
# Artificial minority samples and corresponding minority labels from ADASYN are appended
# below X_train and y_train respectively
# So to exclusively get the artificial minority samples from ADASYN, we do
X_train_adasyn_1 = X_train_adasyn[X_train.shape[0]:]

X_train_1 = X_train.to_numpy()[np.where(y_train==1.0)]
X_train_0 = X_train.to_numpy()[np.where(y_train==0.0)]



import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 20]
fig = plt.figure()

plt.subplot(3, 1, 1)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.legend()

plt.subplot(3, 1, 2)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_adasyn_1[:X_train_1.shape[0], 0], X_train_adasyn_1[:X_train_1.shape[0], 1],
            label='Artificial ADASYN Class-1 Examples')
plt.legend()

plt.subplot(3, 1, 3)
plt.scatter(X_train_1[:, 0], X_train_1[:, 1], label='Actual Class-1 Examples')
plt.scatter(X_train_0[:X_train_1.shape[0], 0], X_train_0[:X_train_1.shape[0], 1], label='Actual Class-0 Examples')
plt.legend()

# Logistic Regression

In [None]:
# Same C sweep as before, fitted on the ADASYN-balanced training set. Recall
# and precision are recorded over 3 folds and the final model is refit using
# the best recall.

folds = 3
lr_grid_params_ada = {'C': [0.1, 0.2, 0.3, 0.5, 1, 2, 5]}
lr_classifier_ada = LogisticRegression()

lr_grid_search_ada = GridSearchCV(
    lr_classifier_ada,
    lr_grid_params_ada,
    scoring=['recall', 'precision'],
    refit='recall',
    cv=folds,
    return_train_score=True,
    n_jobs=-1,
)

lr_grid_search_ada.fit(X_train_adasyn, y_train_adasyn)

In [None]:
lr_grid_search_results_ada = pd.DataFrame(lr_grid_search_ada.cv_results_)
lr_grid_search_results_ada.head()

In [None]:
# Model refit on the ADASYN-balanced training set with the best-recall C.
lr_best_classifier_ada = lr_grid_search_ada.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(lr_best_classifier_ada.get_params())

In [None]:
y_pred = lr_best_classifier_ada.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('logistic_regression_ada_balanced',X_test.values,y_test,lr_best_classifier_ada)], ignore_index=True)

In [None]:
scores_df

# Tree classifier model

In [None]:
# 3-fold grid search over the decision-tree parameters on the ADASYN-balanced
# training set; the final model is refit using the best recall.
dt_grid_params_ada = {
    'min_samples_split': range(100,500,50),
    'max_depth':range(5,15,5),
    'min_samples_leaf': range(50,150,25),
    'criterion':['entropy','gini']
}

folds = 3

# Seeded so the fitted tree and its scores are reproducible between runs.
dt_classifier_ada = DecisionTreeClassifier(random_state=0)

dt_grid_search_ada = GridSearchCV(estimator=dt_classifier_ada,
                     param_grid=dt_grid_params_ada,
                     cv = folds,
                     scoring = ['recall','precision'],
                     return_train_score = True,
                     refit = 'recall',
                     n_jobs = -1)

dt_grid_search_ada.fit(X_train_adasyn,y_train_adasyn)

In [None]:
dt_grid_search_results_ada = pd.DataFrame(dt_grid_search_ada.cv_results_)
dt_grid_search_results_ada.head()

In [None]:
# Take the best tree from the ADASYN grid search. This previously read from
# dt_grid_search_sm, so the "ADASYN" tree row in the score table was really
# the SMOTE-trained tree evaluated a second time.
dt_best_classifier_ada = dt_grid_search_ada.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(dt_best_classifier_ada.get_params())

In [None]:
y_pred = dt_best_classifier_ada.predict(X_test)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('tree_classification_ada_balanced',X_test.values,y_test,dt_best_classifier_ada)], ignore_index=True)
scores_df

# XGBoost

In [None]:
# 3-fold grid search over the number of boosting rounds and the tree depth of
# an XGBoost classifier, fitted on the ADASYN-balanced training set. Recall
# and precision are recorded; the final model is refit using the best recall.
folds = 3

xgb_grid_params_ada = dict(
    n_estimators=range(80, 200, 20),
    max_depth=range(3, 10, 3),
)

xgb_classifier_ada = XGBClassifier()

xgb_grid_search_ada = GridSearchCV(
    xgb_classifier_ada,
    xgb_grid_params_ada,
    scoring=['recall', 'precision'],
    refit='recall',
    cv=folds,
    return_train_score=True,
    n_jobs=20,
    verbose=3,
)

xgb_grid_search_ada.fit(X_train_adasyn, y_train_adasyn)

In [None]:
xgb_grid_search_results_ada = pd.DataFrame(xgb_grid_search_ada.cv_results_)
xgb_grid_search_results_ada.head()

In [None]:
# Booster refit on the ADASYN-balanced training set with the best-recall parameters.
xgb_best_classifier_ada = xgb_grid_search_ada.best_estimator_
# Call get_params() so the parameter dict is printed, not the bound method.
print(xgb_best_classifier_ada.get_params())

In [None]:
y_pred = xgb_best_classifier_ada.predict(X_test.values)

print(classification_report(y_test,y_pred))

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
scores_df = pd.concat([scores_df, update_score('xgboost_ada_balanced',X_test.values,y_test,xgb_best_classifier_ada)], ignore_index=True)
scores_df