In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from settings import DATASET_FILENAME

In [None]:
# Read the dataset and discard the 'zipcodeOri' and 'zipMerchant' columns.
df = pd.read_csv(DATASET_FILENAME, header=0).drop(columns=['zipcodeOri', 'zipMerchant'])

# Keep separate views of the rows labelled fraud (1) and non-fraud (0).
is_fraud = df['fraud'] == 1
is_non_fraud = df['fraud'] == 0
df_fraud = df[is_fraud]
df_non_fraud = df[is_non_fraud]

In [None]:
# Convert every object-dtype column to its integer category codes in one pass:
# each column is cast to 'category' and immediately replaced by `.cat.codes`.
# Note: pandas assigns the code -1 to missing values.
category_cols = df.select_dtypes(include=['object']).columns
df[category_cols] = df[category_cols].apply(
    lambda column: column.astype('category').cat.codes
)

# TODO: replace the manual category-code mapping with scikit-learn's `LabelEncoder`

In [None]:
# Separate the target column ('fraud') from the remaining feature columns.
y = df['fraud']
X = df.drop(columns=['fraud'])

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    shuffle=True,
    stratify=y,
)

# KNN

In [None]:
# 5-nearest-neighbours classifier; p=1 selects the Manhattan (L1) distance.
# `fit` returns the fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5, p=1).fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred_knn = knn.predict(x_test)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of an escaped one-line repr.
print(classification_report(y_test, y_pred_knn))

In [None]:
# Confusion matrix for KNN: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred_knn)

# Random Forest

In [None]:
# Random forest of 100 depth-limited trees. class_weight="balanced" reweights
# the classes inversely to their frequency in the training labels.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=8,
    random_state=123,
    verbose=1,
    class_weight="balanced",
).fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred_rf = rf.predict(x_test)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of an escaped one-line repr.
print(classification_report(y_test, y_pred_rf))

In [None]:
# Confusion matrix for the random forest: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

# XGBoost

In [None]:
# Gradient-boosted tree classifier; hyperparameters are listed one per line,
# grouped by role, with the same values as before.
xgb_clf = xgb.XGBClassifier(
    # tree shape and boosting schedule
    max_depth=6,
    learning_rate=0.05,
    n_estimators=400,
    # binary:hinge makes the model emit hard 0/1 predictions
    objective="binary:hinge",
    booster='gbtree',
    # parallelism
    n_jobs=-1,
    nthread=None,
    # split constraints
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    # row / column subsampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    # L1 / L2 regularisation
    reg_alpha=0,
    reg_lambda=1,
    # class weighting, initial score, reproducibility, logging
    scale_pos_weight=1,
    base_score=0.5,
    random_state=42,
    verbosity=1,
)
xgb_clf.fit(x_train, y_train)

# Predicted labels for the held-out rows.
y_pred = xgb_clf.predict(x_test)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of an escaped one-line repr.
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix for XGBoost: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)