In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from imblearn.over_sampling import SMOTE 
import collections
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from keras.callbacks import EarlyStopping
from scipy import stats
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn import svm

In [None]:
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides scikit-learn convergence warnings and pandas
# deprecation warnings, so problems in later cells will not be reported.
warnings.simplefilter("ignore")

In [None]:
# Load the feature table; the path is relative to the notebook's working directory.
features_csv = 'features.csv'
df = pd.read_csv(features_csv)

In [None]:
# NOTE(review): this preview is not the last expression of the cell, so the
# notebook does not render it.
df.head(2)

# Columns removed before modelling.
unused_columns = [
    'accountNumber', 'customerId', 'transactionDateTime', 'currentExpDate',
    'accountOpenDate', 'dateOfLastAddressChange', 'previousTransaction',
    'merchantName', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
    'transactionDate', 'transactionWeek', 'transactionMonth',
    'transactionQuarter',
]
# Boolean (True/False) columns, including the isFraud target, recast to 1/0.
boolean_columns = [
    'differentCountry', 'expirationDateKeyInMatch', 'isFraud', 'cardPresent',
    'reversalTransaction', 'multiSwipeTransaction', 'incorrectCVV',
]
# Columns expanded into one indicator column per distinct value.
categorical_columns = [
    'transactionType', 'acqCountry', 'merchantCountryCode', 'posEntryMode',
    'posConditionCode', 'merchantCategoryCode',
]

new_df = df.drop(columns=unused_columns)
new_df[boolean_columns] = new_df[boolean_columns].astype(int)
new_df = pd.get_dummies(data=new_df, columns=categorical_columns)


In [None]:
# Separate the isFraud target from the predictors, then hold out 20% of the
# rows as a test set, stratified on the target so both parts keep the same
# class proportions.
target_column = 'isFraud'
X = new_df.drop(columns=[target_column])
y = new_df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)



In [None]:
def logisticRegression(X_train, X_test, Y_train, Y_test):
    """Fit a default LogisticRegression and print its test-set metrics.

    Parameters
    ----------
    X_train, Y_train
        Features and binary labels used to fit the model.
    X_test, Y_test
        Features and binary labels used to evaluate the model.

    Returns
    -------
    tuple
        ``(fitted model, precision, recall)``; precision and recall are
        computed from the hard predictions on ``X_test``.
    """
    print('Logistic Regression')
    lr = LogisticRegression()
    lr.fit(X_train, Y_train)
    y_pred = lr.predict(X_test)

    # A ROC curve needs a continuous score. Feeding it the thresholded 0/1
    # predictions collapses the curve to a single operating point, so use the
    # predicted probability of the positive class (column 1 = classes_[1]).
    y_score = lr.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    print('AUC' + str(auc(fpr, tpr)))

    pre = precision_score(Y_test, y_pred)
    rec = recall_score(Y_test, y_pred)
    print('Precision: '+str(pre))
    print('Recall: '+str(rec))
    return lr, pre, rec

In [None]:
def decisionTree(X_train, X_test, Y_train, Y_test, random_state=42):
    """Fit a DecisionTreeClassifier and print its test-set metrics.

    Parameters
    ----------
    X_train, Y_train
        Features and binary labels used to fit the tree.
    X_test, Y_test
        Features and binary labels used to evaluate the tree.
    random_state
        Seed passed to the classifier so repeated runs give the same tree.
        Defaults to 42, the seed used by the other cells of this notebook.

    Returns
    -------
    tuple
        ``(fitted model, precision, recall)``; precision and recall are
        computed from the hard predictions on ``X_test``.
    """
    print('Decision Tree Classifier:')
    dt = DecisionTreeClassifier(random_state=random_state)
    dt.fit(X_train, Y_train)
    y_pred = dt.predict(X_test)

    # Score the ROC curve with class probabilities rather than thresholded
    # 0/1 predictions, which would reduce the curve to one operating point.
    y_score = dt.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    print('AUC' + str(auc(fpr, tpr)))

    pre = precision_score(Y_test, y_pred)
    rec = recall_score(Y_test, y_pred)
    print('Precision: '+str(pre))
    print('Recall: '+str(rec))
    return dt, pre, rec

In [None]:
def randomForestClassifier(X_train, X_test, Y_train, Y_test, random_state=42):
    """Fit a RandomForestClassifier and print its test-set metrics.

    Parameters
    ----------
    X_train, Y_train
        Features and binary labels used to fit the forest.
    X_test, Y_test
        Features and binary labels used to evaluate the forest.
    random_state
        Seed passed to the classifier so repeated runs give the same forest.
        Defaults to 42, the seed used by the other cells of this notebook.

    Returns
    -------
    tuple
        ``(fitted model, precision, recall)``; precision and recall are
        computed from the hard predictions on ``X_test``.
    """
    print('Random Forest Classifier')
    rf = RandomForestClassifier(random_state=random_state)
    rf.fit(X_train, Y_train)
    y_pred = rf.predict(X_test)

    # Score the ROC curve with class probabilities rather than thresholded
    # 0/1 predictions, which would reduce the curve to one operating point.
    y_score = rf.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    print('AUC' + str(auc(fpr, tpr)))

    pre = precision_score(Y_test, y_pred)
    rec = recall_score(Y_test, y_pred)
    print('Precision: '+str(pre))
    print('Recall: '+str(rec))
    return rf, pre, rec


In [None]:
def gradientBoostingClassifier(X_train, X_test, Y_train, Y_test, random_state=42):
    """Fit a GradientBoostingClassifier and print its test-set metrics.

    Parameters
    ----------
    X_train, Y_train
        Features and binary labels used to fit the model.
    X_test, Y_test
        Features and binary labels used to evaluate the model.
    random_state
        Seed passed to the classifier so repeated runs give the same model.
        Defaults to 42, the seed used by the other cells of this notebook.

    Returns
    -------
    tuple
        ``(fitted model, precision, recall)``; precision and recall are
        computed from the hard predictions on ``X_test``.
    """
    print('GradientBoostingClassifier')
    gr = GradientBoostingClassifier(random_state=random_state)
    gr.fit(X_train, Y_train)
    y_pred = gr.predict(X_test)

    # Evaluate against the Y_test argument (not a notebook-level y_test), and
    # score the ROC curve with class probabilities rather than thresholded
    # 0/1 predictions, which would reduce the curve to one operating point.
    y_score = gr.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    print('AUC' + str(auc(fpr, tpr)))

    pre = precision_score(Y_test, y_pred)
    rec = recall_score(Y_test, y_pred)
    print('Precision: '+str(pre))
    print('Recall: '+str(rec))
    return gr, pre, rec


In [None]:
def svmsklearn(X_train, X_test, Y_train, Y_test):
    """Fit a default ``svm.SVC`` and print its test-set metrics.

    Parameters
    ----------
    X_train, Y_train
        Features and binary labels used to fit the classifier.
    X_test, Y_test
        Features and binary labels used to evaluate the classifier.

    Returns
    -------
    tuple
        ``(fitted SVC, precision, recall)``; precision and recall are
        computed from the hard predictions on ``X_test``.
    """
    print('SVM')
    svc = svm.SVC()
    svc.fit(X_train, Y_train)
    y_pred = svc.predict(X_test)

    # Evaluate against the Y_test argument (not a notebook-level y_test).
    # SVC is built without probability estimates here, so the signed distance
    # to the separating surface is used as the continuous ROC score.
    y_score = svc.decision_function(X_test)
    fpr, tpr, _ = roc_curve(Y_test, y_score)
    print('AUC' + str(auc(fpr, tpr)))

    pre = precision_score(Y_test, y_pred)
    rec = recall_score(Y_test, y_pred)
    print('Precision: '+str(pre))
    print('Recall: '+str(rec))
    # Return the model fitted in this function, not an unrelated global.
    return svc, pre, rec


# Method 1 - Training on the train data and testing on the test data

In [None]:
# Method 1
# Fit each baseline on the untouched training split and keep every model's
# precision/recall under its own name so a later call does not overwrite an
# earlier one.
rf, rf_pre, rf_recall = randomForestClassifier(X_train, X_test, y_train, y_test)
dt, dt_pre, dt_recall = decisionTree(X_train, X_test, y_train, y_test)
gr, gr_pre, gr_recall = gradientBoostingClassifier(X_train, X_test, y_train, y_test)

# The SVM baseline is left disabled. If it is re-enabled, bind it to its own
# names so that `rf` keeps pointing at the random forest used further down:
# svc, svc_pre, svc_recall = svmsklearn(X_train, X_test, y_train, y_test)

# Backward compatibility: this cell used to leave the metrics of the last
# model it fitted (gradient boosting) in `pre` and `recall`.
pre, recall = gr_pre, gr_recall

In [None]:
def importantFeatures(rf, X_train, top_n=20):
    """Plot and return the most important features of a fitted model.

    Parameters
    ----------
    rf
        Fitted estimator exposing ``feature_importances_``.
    X_train
        DataFrame whose columns are in the same order as the features the
        estimator was fitted on; only its column names are read.
    top_n
        Number of features to plot and return, most important first.
        Defaults to 20.

    Returns
    -------
    tuple
        ``(features, indices)``: the list of the ``top_n`` column names and
        the list of their importance values, both in descending order of
        importance.
    """
    importances = rf.feature_importances_
    # Positions of the features sorted from most to least important.
    order = np.argsort(importances)[::-1][:top_n]
    features = list(X_train.columns[order])
    # Kept under the historical name `indices`; these are importance values.
    indices = list(importances[order])

    plt.figure(figsize=(20,8))
    plt.bar(features, indices)
    plt.xticks(rotation=45, ha='right')  # long column names overlap otherwise
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    plt.title('Top feature importances')
    plt.show()
    return features, indices

In [None]:
# Plot the random forest's top features. `indices` receives the importance
# values that importantFeatures returns alongside the feature names.
features, indices = importantFeatures(rf=rf, X_train=X_train)


## Method 2 - Undersampling the majority class in the train set and validating on the test set

In [None]:
# Iterate over several undersampling ratios: keep 1% .. 6% of the non-fraud
# training rows, always keep every fraud row, and refit logistic regression.
pre = []
rec = []

# Attach the label to a COPY of the training features. Assigning the column on
# an alias of X_train would write isFraud into X_train itself and leak the
# target into any model fitted on X_train afterwards.
newX = X_train.assign(isFraud=y_train)
X_train_Fraud = newX[newX['isFraud']==1]
train_notFraud = newX[newX['isFraud']==0]

for i in range(1,7):
    X_train_notFraud = train_notFraud.sample(frac = i*0.01, random_state = 42)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    dfunder = pd.concat([X_train_notFraud, X_train_Fraud])
    y_under = dfunder['isFraud']
    X_under = dfunder.drop(columns = ['isFraud'])

    # The test set is never resampled, so every ratio is scored on the same data.
    lr_under, pre1,rec1 = logisticRegression(X_under, X_test, y_under,y_test)
    pre.append(pre1)
    rec.append(rec1)

## Method 3 - SMOTE analysis on the train dataset to oversample the minority class

In [None]:
# SMOTE is imported here because its import in the notebook's import cell is
# commented out; keeping it local means a missing imbalanced-learn install
# only fails this cell.
from imblearn.over_sampling import SMOTE

# Start again from the full encoded frame so this cell does not depend on what
# earlier cells did to X_train / X_test.
y = new_df['isFraud']
X = new_df.drop(columns = ['isFraud'])

# Hold out the test set BEFORE any resampling. Resampling first and splitting
# afterwards puts synthetic rows (interpolated from training neighbours) into
# the test set, which inflates every metric computed on it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

# Undersampling the majority class data (training rows only).
# assign() works on a copy, so the label is not written into X_train.
newX = X_train.assign(isFraud=y_train)
X_train_notFraud = newX[newX['isFraud']==0].sample(frac = 0.9, random_state = 42)
X_train_Fraud = newX[newX['isFraud']==1]
# DataFrame.append was removed in pandas 2.0; concat is the replacement.
dfsmote = pd.concat([X_train_notFraud, X_train_Fraud])

# Oversampling the minority class data (SMOTE), again on training rows only.
y = dfsmote['isFraud']
X = dfsmote.drop(columns = ['isFraud'])

sm = SMOTE(sampling_strategy = 0.6,random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)
print(f'''Shape of X before SMOTE: {X.shape} Shape of X after SMOTE: {X_sm.shape}''')
print('\n positive and negative classes:')
print(collections.Counter(y_sm))

# Models are trained on the resampled rows and evaluated on the untouched,
# real-distribution X_test / y_test produced by the split above.
X_train, y_train = X_sm, y_sm

In [None]:
# Refit three classifiers on the split produced by the Method 3 cell.
# val1/val2 receive each call's precision/recall and are overwritten by the
# next call; only the fitted models are kept under distinct names.
method3_split = (X_train, X_test, y_train, y_test)
lrnew, val1, val2 = logisticRegression(*method3_split)
rfnew, val1, val2 = randomForestClassifier(*method3_split)
dtnew, val1, val2 = decisionTree(*method3_split)