In [1]:
import pandas as pd
import numpy as np
from scipy.stats import uniform, randint
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import xgboost as xgb

In [2]:
df=pd.read_csv(r"C:\Users\yjosh\Downloads\train_transaction (1).csv")

In [3]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df = pd.get_dummies(df, columns=['ProductCD', 'card4', 'card6', 'M1', 'M2', 'M3', 'M4', 'M5', 'M6'])

In [8]:
df.drop(['P_emaildomain'], axis=1, inplace=True)

In [9]:
df.isFraud.value_counts(dropna=False)

0    569877
1     20663
Name: isFraud, dtype: int64

In [11]:
train,test=train_test_split(df,test_size=0.3,shuffle=False,random_state=22)

In [12]:
train.shape

(413378, 242)

In [13]:
test.shape

(177162, 242)

In [14]:
train.isFraud.value_counts(dropna=False)

0    398840
1     14538
Name: isFraud, dtype: int64

In [15]:
test.isFraud.value_counts(dropna=False)

0    171037
1      6125
Name: isFraud, dtype: int64

In [16]:
only_zero=train[train.isFraud==0].sample(15000)

In [17]:
only_one=train[train.isFraud==1]

In [18]:
train=pd.concat([only_one,only_zero])


In [19]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,M2_T,M3_F,M3_T,M4_M0,M4_M1,M4_M2,M5_F,M5_T,M6_F,M6_T
203,2987203,1,89760,445.0,18268,583.0,150.0,226.0,251.0,87.0,...,0,0,0,1,0,0,1,0,0,1
240,2987240,1,90193,37.098,13413,103.0,185.0,137.0,,,...,0,0,0,0,0,1,0,0,0,0
243,2987243,1,90246,37.098,13413,103.0,185.0,137.0,,,...,0,0,0,0,0,1,0,0,0,0
245,2987245,1,90295,37.098,13413,103.0,185.0,137.0,,,...,0,0,0,0,0,1,0,0,0,0
288,2987288,1,90986,155.521,16578,545.0,185.0,226.0,,,...,0,0,0,0,0,1,0,0,0,0


In [20]:
x_train = train.loc[:, train.columns != 'isFraud']
y_train = train.isFraud

x_test = test.loc[:, test.columns != 'isFraud']
y_test = test.isFraud

xgb_model = xgb.XGBClassifier(objective="binary:logistic", base_score=0.7, random_state=42)
xgb_model.fit(x_train, y_train) 

XGBClassifier(base_score=0.7, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
y_pred = xgb_model.predict(x_test)  

In [22]:
y_pred=pd.Series(y_pred)

In [23]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[143555  27482]
 [  1545   4580]]


In [24]:
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

In [25]:
print((TP + TN) / float(TP + TN + FP + FN))
print(accuracy_score(y_test, y_pred))

0.8361556089906413
0.8361556089906413


In [26]:
print(FP / float(TN + FP))

0.1606786835596976


In [27]:
y_pred_prob = xgb_model.predict_proba(x_test)[:, 1]

In [28]:
type(y_pred_prob)

numpy.ndarray

In [29]:
dataset = pd.DataFrame(y_pred_prob)
type(dataset)

pandas.core.frame.DataFrame

In [38]:
dataset.to_excel(r"C:\Users\yjosh\Downloads\output.xlsx")

In [24]:
import matplotlib.pyplot as plt
plt.hist(y_pred_prob, bins=8, linewidth=1.2)
plt.xlim(0, 1)
plt.title("Histogram of predicted probabilities")
plt.xlabel("Predicted probability of fraud")
plt.ylabel("Frequency")

Text(0, 0.5, 'Frequency')

In [31]:
from sklearn.preprocessing import binarize
y_pred_class = binarize([y_pred_prob], 0.7)[0]
confusion_new = confusion_matrix(y_test, y_pred_class)
print(confusion_new)

[[160595  10442]
 [  2519   3606]]


In [42]:
accuracy=(confusion_new[0][0]+confusion_new[1][1])/(confusion_new[0][0]+confusion_new[0][1]+confusion_new[1][0]+confusion_new[1][1])*100
accuracy

97.19409354150439