### Dummy variable creation

In [7]:
# Split the dataset into the feature matrix X (every column except 'class')
# and the target vector y (the 'class' column).
X = final_data.drop(columns='class')
y = final_data['class']

#### Creating dummy variables for the categorical columns of X

In [9]:
# One-hot encode each categorical column, dropping the first level of each.
# A per-feature prefix keeps every dummy column traceable to its source
# feature and prevents duplicate (or non-string) column names once the five
# frames are concatenated back onto X.
s = pd.get_dummies(X['s'], prefix='s', drop_first=True)
b = pd.get_dummies(X['b'], prefix='b', drop_first=True)
t = pd.get_dummies(X['t'], prefix='t', drop_first=True)
c = pd.get_dummies(X['c'], prefix='c', drop_first=True)
a = pd.get_dummies(X['a'], prefix='a', drop_first=True)

In [10]:
# Remove the raw categorical columns; their dummy encodings replace them.
X = X.drop(columns=['s', 'b', 't', 'c', 'a'])

In [None]:
# Append the dummy-variable frames to X column-wise (rows aligned on the index).
X = pd.concat([X, s, b, t, c, a], axis='columns')

### Train/Test Split - StratifiedShuffleSplit: keeping the same target-class proportion in train and test

In [15]:
# Stratified split of train and test data
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=5, test_size=0.25, random_state=0)

# Five splits are generated but only the last one is kept, which is what the
# original loop ended up with; the slicing is now done once instead of on
# every iteration.
*_, (train_index, test_index) = sss.split(X, y)

# The splitter yields positional indices, so both X and y must be sliced with
# .iloc. Plain y[train_index] is label-based and would pick the wrong rows
# (or raise) whenever the index is not the default 0..n-1 range.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Class counts in each partition, to confirm the stratification.
print(y_train.value_counts())
print(y_test.value_counts())

0    102721
1     10613
Name: class, dtype: int64
0    34240
1     3538
Name: class, dtype: int64


### Functionalize Model Performance Evaluation

In [16]:
def model_performance_metrics(model, y_test, y_pred):
    """Summarise binary-classification performance for labels 0 and 1.

    Parameters
    ----------
    model : object
        Kept for backward compatibility with existing callers; it is no
        longer consulted. The confusion-matrix layout is determined by the
        explicit ``labels`` argument below, not by ``model.classes_``.
    y_test : array-like
        True labels (0 = negative, 1 = positive).
    y_pred : array-like
        Predicted labels (0 = negative, 1 = positive).

    Returns
    -------
    list
        ``[cm, recall_str, fallout_str, accuracy_str]`` where ``cm`` is the
        2x2 confusion matrix laid out as ``[[tn, fp], [fn, tp]]`` and the
        three strings are percentage-formatted metrics.
    """
    # Pin the label order so row/column 0 is always class 0 and row/column 1
    # is always class 1, and so the matrix stays 2x2 even when one class is
    # absent from y_test / y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    def _ratio(numerator, denominator):
        # A metric is undefined (NaN) when its denominator is zero, e.g.
        # recall when y_test holds no positives.
        return numerator / denominator if denominator else float('nan')

    return_values = [cm,
                     'Recall (TPR) = {0:.2%}'.format(_ratio(tp, tp + fn)),
                     'Fallout (FPR) = {0:.2%}'.format(_ratio(fp, fp + tn)),
                     'Accuracy (Acc) = {0:.2%}'.format(_ratio(tp + tn, tp + tn + fp + fn))]
    return return_values

### Logistic Model on Above Test-Train Split

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
# Baseline: logistic regression fitted on the original training split.
lrn = LogisticRegression().fit(X_train, y_train)
y_pred = lrn.predict(X_test)

model_performance_metrics(lrn, y_test, y_pred)



[array([[34063,   177],
        [ 1651,  1887]]),
 'Recall (TPR) = 53.34%',
 'Fallout (FPR) = 0.52%',
 'Accuracy (Acc) = 95.16%']

### Over Sampling Fraud class in X_train

In [18]:
# !pip install imbalanced-learn

In [19]:
import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter

In [20]:
# SMOTE
# 'auto' synthesises samples for every class except the majority until each
# matches the majority-class count. This reproduces the previous hard-coded
# {1: 102721, 0: 102721} target without baking in the row count, and uses the
# current imbalanced-learn API (`sampling_strategy` / `fit_resample` replace
# the removed `ratio` / `fit_sample`).
sampler = SMOTE(sampling_strategy='auto', random_state=0)
X_rs, y_rs = sampler.fit_resample(X_train, y_train)
print('Original dataset shape %s' % Counter(y_train))
print('Resampled dataset shape %s' % Counter(y_rs))

Original dataset shape Counter({0: 102721, 1: 10613})
Resampled dataset shape Counter({0: 102721, 1: 102721})


### Logistic Model on oversampled X_train

In [38]:
# Logistic regression fitted on the SMOTE-resampled training data and
# evaluated on the untouched test split.
lrn = LogisticRegression().fit(X_rs, y_rs)
y_pred = lrn.predict(X_test)

model_performance_metrics(lrn, y_test, y_pred)



[array([[31127,  3113],
        [ 1122,  2416]]),
 'Recall (TPR) = 68.29%',
 'Fallout (FPR) = 9.09%',
 'Accuracy (Acc) = 88.79%']

### Logistic Model on oversampled X_train with Threshold value set to 0.4

In [39]:
# Refit on the resampled training data, then label a test row as class 1
# whenever its predicted probability exceeds a custom cut-off.
lrn = LogisticRegression().fit(X_rs, y_rs)

THRESHOLD = 0.4
# Column 1 of predict_proba is read as the class-1 probability
# (assumes lrn.classes_ is [0, 1]).
y_pred = (lrn.predict_proba(X_test)[:, 1] > THRESHOLD).astype(int)

model_performance_metrics(lrn, y_test, y_pred)



[array([[28957,  5283],
        [  933,  2605]]),
 'Recall (TPR) = 73.63%',
 'Fallout (FPR) = 15.43%',
 'Accuracy (Acc) = 83.55%']

## KNN

In [21]:
from sklearn.neighbors import KNeighborsClassifier
# Baseline: 5-nearest-neighbours classifier on the original training split.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)
model_performance_metrics(knn, y_test, y_pred)

[array([[34007,   233],
        [ 1640,  1898]]),
 'Recall (TPR) = 53.65%',
 'Fallout (FPR) = 0.68%',
 'Accuracy (Acc) = 95.04%']

### KNN on oversampled X_train

In [22]:
# 5-nearest-neighbours classifier fitted on the SMOTE-resampled training data.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_rs, y_rs)
y_pred = knn.predict(X_test)
model_performance_metrics(knn, y_test, y_pred)

[array([[29341,  4899],
        [ 1259,  2279]]),
 'Recall (TPR) = 64.41%',
 'Fallout (FPR) = 14.31%',
 'Accuracy (Acc) = 83.70%']

### KNN on oversampled X_train with Threshold value set to 0.4

In [26]:
# Fit on the oversampled training data, then apply a custom probability cut-off.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_rs, y_rs)

THRESHOLD = 0.40
# NOTE(review): with n_neighbors=5 and uniform weights, predict_proba can only
# return multiples of 0.2, so `> 0.40` selects exactly the rows that receive
# at least 3 of 5 votes - the same rows as knn.predict(). This is consistent
# with the recorded metrics being identical to the un-thresholded KNN cell.
# Use `>=` or a different k if a genuinely lower cut-off was intended.
y_pred = (knn.predict_proba(X_test)[:, 1] > THRESHOLD).astype(int)

model_performance_metrics(knn, y_test, y_pred)

[array([[29341,  4899],
        [ 1259,  2279]]),
 'Recall (TPR) = 64.41%',
 'Fallout (FPR) = 14.31%',
 'Accuracy (Acc) = 83.70%']

## Decision Tree

In [40]:
from sklearn.tree import DecisionTreeClassifier
# Baseline: decision tree (fixed seed) on the original training split.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
model_performance_metrics(clf, y_test, y_pred)

[array([[33758,   482],
        [ 1533,  2005]]),
 'Recall (TPR) = 56.67%',
 'Fallout (FPR) = 1.41%',
 'Accuracy (Acc) = 94.67%']

### Decision Tree on oversampled X_train

In [41]:
# Decision tree (fixed seed) fitted on the oversampled training data.
clf = DecisionTreeClassifier(random_state=0).fit(X_rs, y_rs)
y_pred = clf.predict(X_test)
model_performance_metrics(clf, y_test, y_pred)

[array([[29935,  4305],
        [ 1317,  2221]]),
 'Recall (TPR) = 62.78%',
 'Fallout (FPR) = 12.57%',
 'Accuracy (Acc) = 85.12%']

### Decision Tree on oversampled X_train with Threshold value set to 0.4

In [47]:
# Fit on the oversampled training data, then apply a custom probability cut-off.
clf = DecisionTreeClassifier(random_state=0).fit(X_rs, y_rs)

THRESHOLD = 0.40
# Column 1 of predict_proba is read as the class-1 probability
# (assumes clf.classes_ is [0, 1]).
y_pred = (clf.predict_proba(X_test)[:, 1] > THRESHOLD).astype(int)

model_performance_metrics(clf, y_test, y_pred)

[array([[27466,  6774],
        [ 1206,  2332]]),
 'Recall (TPR) = 65.91%',
 'Fallout (FPR) = 19.78%',
 'Accuracy (Acc) = 78.88%']

## LDA

In [59]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Baseline: linear discriminant analysis on the original training split.
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

y_pred = lda.predict(X_test)
model_performance_metrics(lda, y_test, y_pred)

[array([[34041,   199],
        [ 1650,  1888]]),
 'Recall (TPR) = 53.36%',
 'Fallout (FPR) = 0.58%',
 'Accuracy (Acc) = 95.11%']

### LDA on oversampled X_train

In [60]:
# Linear discriminant analysis fitted on the oversampled training data.
lda = LinearDiscriminantAnalysis().fit(X_rs, y_rs)

y_pred = lda.predict(X_test)
model_performance_metrics(lda, y_test, y_pred)

[array([[31807,  2433],
        [  993,  2545]]),
 'Recall (TPR) = 71.93%',
 'Fallout (FPR) = 7.11%',
 'Accuracy (Acc) = 90.93%']

### LDA on oversampled X_train with Threshold value set to 0.28

In [72]:
# Fit LDA on the oversampled training data, then label a test row as class 1
# whenever its predicted probability exceeds a custom cut-off.
lda = LinearDiscriminantAnalysis()
lda.fit(X_rs, y_rs)  # fitting on oversampled X_train

THRESHOLD = 0.28
# Column 1 of predict_proba is read as the class-1 probability
# (assumes lda.classes_ is [0, 1]).
y_pred = np.where(lda.predict_proba(X_test)[:,1] > THRESHOLD, 1, 0)

# Report against the model that produced y_pred; the decision tree `clf` was
# previously passed here by mistake.
model_performance_metrics(lda, y_test, y_pred)

[array([[28955,  5285],
        [  905,  2633]]),
 'Recall (TPR) = 74.42%',
 'Fallout (FPR) = 15.44%',
 'Accuracy (Acc) = 83.61%']

## XGBoost

In [178]:
# !pip install xgboost

In [176]:
import xgboost as xgb
# Baseline: gradient-boosted trees on the original training split.
gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.01)
gbm.fit(X_train, y_train)

y_pred = gbm.predict(X_test)
model_performance_metrics(gbm, y_test, y_pred)

[array([[34200,    40],
        [ 1654,  1884]]),
 'Recall (TPR) = 53.25%',
 'Fallout (FPR) = 0.12%',
 'Accuracy (Acc) = 95.52%']

### XGBoost on oversampled X_train

In [177]:
# Gradient-boosted trees fitted on the oversampled training data.
gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.01)
gbm.fit(X_rs, y_rs)

# The test features are passed as a plain array (X_test.values) rather than
# as a DataFrame.
y_pred = gbm.predict(X_test.values)
model_performance_metrics(gbm, y_test, y_pred)

[array([[32612,  1628],
        [ 1230,  2308]]),
 'Recall (TPR) = 65.23%',
 'Fallout (FPR) = 4.75%',
 'Accuracy (Acc) = 92.43%']

### XGBoost on oversampled X_train with Threshold value set to 0.39

In [173]:
# Refit the boosted-tree model on the oversampled training data.
gbm = xgb.XGBClassifier(max_depth=10, n_estimators=300, learning_rate=0.01)
gbm.fit(X_rs, y_rs)

In [174]:
THRESHOLD = 0.39
# Label a test row as class 1 when its class-1 probability (column 1 of
# predict_proba; assumes gbm.classes_ is [0, 1]) exceeds the cut-off.
y_pred = (gbm.predict_proba(X_test.values)[:, 1] > THRESHOLD).astype(int)

In [175]:
# Confusion matrix, recall, fallout and accuracy for the thresholded XGBoost predictions.
model_performance_metrics(gbm, y_test, y_pred)

[array([[29718,  4522],
        [ 1047,  2491]]),
 'Recall (TPR) = 70.41%',
 'Fallout (FPR) = 13.21%',
 'Accuracy (Acc) = 85.26%']