In [1]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory top-down and print the full path of every
# file found (nothing is printed when the directory is missing or empty).
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Read the credit-card transactions CSV into `df` and preview the first rows.
csv_path = '../input/creditcardfraud/creditcard.csv'
df = pd.read_csv(csv_path)
df.head()

In [3]:
# Count missing values per column, then show the positions of the columns
# whose count is non-zero (an empty index array means no column has nulls).
missing_per_column = df.isna().sum()
np.where(missing_per_column.ne(0))

In [4]:
# Imbalanced data class: bar chart of the number of rows per Class value.
# value_counts() orders by frequency, while the tick labels below are assigned
# by position; sort_index() pins the bar order to the sorted class values
# (0 before 1) so the 'Non-Fraud'/'Fraud' labels cannot end up swapped.
class_counts = df['Class'].value_counts().sort_index()
class_counts.plot.bar()
plt.xticks([0,1],labels=['Non-Fraud','Fraud'],rotation=0)
plt.title('Imbalanced Class in Credit Card Fraud')
plt.xlabel('Class')
plt.ylabel('Number of Class')
plt.show()

In [5]:
# Time and amount analysis by class.
# Select the Time column per class with a single .loc call (row mask + column
# label) instead of chained indexing.
nf_time = df.loc[df['Class'] == 0, 'Time']
f_time = df.loc[df['Class'] == 1, 'Time']

figure, axes = plt.subplots(ncols=2, figsize=(10, 5))

# Left panel: density of Time for each class.
# sns.distplot is deprecated; kdeplot draws the same KDE-only curve that
# distplot(hist=False) produced. Labels are attached to the curves themselves
# so the legend does not depend on the order in which they were drawn.
sns.kdeplot(nf_time, ax=axes[0], color='b', label='non-fraud')
sns.kdeplot(f_time, ax=axes[0], color='r', label='fraud')
axes[0].legend()
axes[0].set_xlabel('Time')
axes[0].set_title('Distribution of Time by Class')

# Right panel: Amount per class, with outliers hidden (showfliers=False).
sns.boxplot(x='Class', y='Amount', data=df, showfliers=False, ax=axes[1])
axes[1].set_title('Amount of credit card spent by Class')
# Fix the tick positions before renaming them so labels map to known ticks.
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['non-fraud', 'fraud'])

plt.tight_layout()
plt.show()

In [6]:
from sklearn.preprocessing import StandardScaler

# Scale the Time and Amount columns and replace them with scaled versions.
# The guard keeps the cell re-runnable: once 'Amount' and 'Time' have been
# dropped, executing the cell again would otherwise raise a KeyError.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook, so test-set statistics
# influence the scaling — consider fitting on the training rows only.
if {'Amount', 'Time'}.issubset(df.columns):
    sm = StandardScaler()
    # fit_transform needs 2-D input, hence reshape(-1, 1); ravel() flattens
    # the (n, 1) result back into a single column of values.
    df['scaled_amount'] = sm.fit_transform(df['Amount'].values.reshape(-1, 1)).ravel()
    # `sm` is refitted here, so after this cell it holds the Time statistics only.
    df['scaled_time'] = sm.fit_transform(df['Time'].values.reshape(-1, 1)).ravel()
    # Rebind instead of inplace=True so the drop is explicit and chain-friendly.
    df = df.drop(columns=['Amount', 'Time'])
df.head()

In [7]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
# set the random state
rnd=0
# split feature and response variable
X = df.drop('Class',axis=1)
y = df['Class']
# Split first, then over-sample only the training part, so the test set stays
# made of real, untouched transactions. stratify=y keeps the class ratio the
# same in train and test, which matters because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=rnd, stratify=y
)
method = SMOTE(random_state=rnd)
X_resample, y_resample = method.fit_resample(X_train, y_train)

print('the shape of train data before over-sampling:{}'.format(X_train.shape))
print('the shape of train data after over-sampling:{}'.format(X_resample.shape))
# The two counts are labelled separately so before/after can be told apart.
print('the number of class level "1" before over-sampling:{}'.format((y_train == 1).sum()))
print('the number of class level "1" after over-sampling:{}'.format((y_resample == 1).sum()))

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_curve,roc_auc_score,average_precision_score,roc_curve

# simple logistic regression, trained on the over-sampled training data
lr = LogisticRegression(random_state=rnd)
lr.fit(X_resample, y_resample)
# Hard 0/1 labels on the held-out test set (kept under its original name).
predicted = lr.predict(X_test)
# Average precision and the precision-recall curve are ranking metrics: they
# need a continuous score per sample. Feeding them hard labels collapses the
# curve to a single operating point, so use the probability of class 1.
fraud_scores = lr.predict_proba(X_test)[:, 1]
average_precision = average_precision_score(y_test, fraud_scores)
print(average_precision)
precision, recall, pr_thresholds = precision_recall_curve(y_test, fraud_scores)

In [12]:
# Precision-recall curve from the `recall` / `precision` arrays computed for
# the logistic regression model; titled and labelled so the figure stands on
# its own, and closed with plt.show() to avoid echoing the Line2D repr.
plt.plot(recall, precision)
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

In [20]:
# Obtain model probabilities (column 1 is the probability of class 1)
probs = lr.predict_proba(X_test)
# Print ROC_AUC score using probabilities
roc_auc = roc_auc_score(y_test, probs[:, 1])
print(roc_auc)
fpr, tpr, thresholds = roc_curve(y_test, probs[:, 1])
# Plot ROC
plt.title('Receiver Operating Characteristic')
# Both lines carry a label so the legend has entries to show; previously
# legend() was called with no labelled artists and rendered nothing.
plt.plot(fpr, tpr, 'b', label='AUC = {:.3f}'.format(roc_auc))
plt.plot([0,1],[0,1],'r--', label='chance')
# Build the legend only after every line has been drawn.
plt.legend(loc='lower right')
plt.xlim([-0.1,1.0])
plt.ylim([-0.1,1.01])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()