In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [30]:
# Load the dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [31]:
# (rows, columns) of the loaded frame.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [32]:
# Number of missing values in each column.
df.isna().sum()

In [33]:
# Print a concise summary of the frame: column dtypes, non-null counts and memory usage.
df.info()

In [34]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [35]:
# Column labels of the frame, for reference when selecting features below.
df.columns

In [36]:
# Amount against Time for every row, with the Class == 1 rows overlaid as markers
# (presumably the fraud class in this dataset — confirm against the data description).
# Axis labels, a title and a legend are set so the figure can be read on its own.
positive_mask = df['Class'] == 1  # computed once and reused for both axes

fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(df['Time'], df['Amount'], label='all transactions')
ax.plot(
    df.loc[positive_mask, 'Time'],
    df.loc[positive_mask, 'Amount'],
    'o',
    label='Class == 1',
)
ax.set_xlabel('Time')
ax.set_ylabel('Amount')
ax.set_title('Amount vs. Time')
ax.legend()
plt.show()

In [37]:
# One histogram per numeric column, 50 bins each, on a 15x10 inch figure.
df.hist(bins=50, figsize=(15, 10))

In [38]:
# Pairwise correlations between all columns, rendered as a colour-graded table.
# Caveat: the Class column is highly skewed, so its correlations should be read with care.
corr_matrix = df.corr()
corr_matrix.style.background_gradient(cmap='coolwarm')

# Model Building

Specifically, I want to study how oversampling the minority class with the classic SMOTE method affects the results.

In [52]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, average_precision_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [40]:
# Separate the label column from the feature columns.
target_col = 'Class'
Y_data = df[target_col]
X_data = df.drop(target_col, axis=1)

In [41]:
# Create a holdout set (25% of the rows) for the final evaluation.
# - stratify=Y_data keeps the class ratio the same in both splits.
# - random_state pins the shuffle so that Restart & Run All reproduces the same holdout;
#   without it the two models below are scored on a different split on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    stratify=Y_data,
    test_size=0.25,
    random_state=42,
)

In [48]:
# Shapes of the two feature splits.
for features in (X_train, X_test):
    print(features.shape)

# Percentage of positive labels in each split; the two numbers should be close
# if stratification worked.
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(labels.sum() / len(features) * 100)

In [49]:
# Baseline: random forest trained on the original (not resampled) training split.
# fit() returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestClassifier(n_jobs=-1, random_state=7).fit(X_train, Y_train)
Y_pred = rf.predict(X_test)

In [50]:
# Per-class precision / recall / F1 of the baseline predictions on the holdout.
baseline_report = classification_report(Y_test, Y_pred)
print(baseline_report)

In [53]:
# Average precision and ROC AUC are ranking metrics: they need a continuous score per
# sample, not thresholded 0/1 predictions. Feeding hard labels collapses each curve to a
# single operating point and distorts both numbers.
# predict_proba columns follow rf.classes_; column 1 is the second class
# (Class == 1 when the labels are 0/1).
Y_score = rf.predict_proba(X_test)[:, 1]
print(average_precision_score(Y_test, Y_score))
print(roc_auc_score(Y_test, Y_score))

It's honestly not bad. Let's see whether SMOTE can make it better.

In [54]:
# Oversample the minority class with SMOTE on the training split only; the holdout is
# left untouched so evaluation reflects the real class distribution.
# The `n_jobs` argument is not passed: imbalanced-learn deprecated it in 0.10 and later
# removed it, so supplying it fails on current releases.
smote = SMOTE(random_state=8)
X_resamp, Y_resamp = smote.fit_resample(X_train, Y_train)

In [55]:
# Retrain the same random forest on the SMOTE-resampled training data and score it on the
# original holdout. The resampled data is deliberately NOT re-split: evaluating on
# synthetic samples would leak information and inflate the metrics.
rf = RandomForestClassifier(random_state=7, n_jobs=-1)
rf.fit(X_resamp, Y_resamp)

# Hard labels feed the precision / recall / F1 report.
Y_pred = rf.predict(X_test)
# Continuous scores feed the ranking metrics. predict_proba columns follow rf.classes_;
# column 1 is the second class (Class == 1 when the labels are 0/1).
Y_score = rf.predict_proba(X_test)[:, 1]

print(classification_report(Y_test, Y_pred))
print(average_precision_score(Y_test, Y_score))
print(roc_auc_score(Y_test, Y_score))