<a href="https://colab.research.google.com/github/sungchan1/goingSaboho/blob/gwangseok/credit_fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

# Load the credit card dataset CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Credit_fraud/data/creditcard.csv')
std_scaler, rob_scaler = StandardScaler(), RobustScaler()

# Standardise 'Time' and then 'Amount'; the shared scaler is re-fitted on each
# column in turn, so it ends up fitted on 'Amount'.
for column in ('Time', 'Amount'):
    df[column] = std_scaler.fit_transform(df[column].values.reshape(-1, 1))

In [None]:
# Visualise the class distribution of the full dataset.

import seaborn as sns
import matplotlib.pyplot as plt


row_cnt = len(df)
# Count the classes once and reuse the result for both lookups.
class_counts = df['Class'].value_counts()
zero_cnt = class_counts[0]
one_cnt = class_counts[1]
# Share of each class as a percentage of all rows, rounded to 2 decimals.
ratio_no_fraud = round(zero_cnt/row_cnt * 100,2)
ratio_fraud = round(one_cnt/row_cnt * 100,2)


colors = ["#0101DF", "#DF0101"]
# The column must be passed as the `x` keyword: recent seaborn releases no
# longer accept it as a positional argument alongside `data=`.
sns.countplot(x='Class', data=df, palette=colors)
plt.title(f'Class Distributions \n (0: No Fraud ({zero_cnt}, {ratio_no_fraud} %) '
          f'\n (1: Fraud ({one_cnt}, {ratio_fraud} %))', fontsize=14)

plt.show()

In [None]:
# Correlation matrix of the full dataset.

# DataFrame.corr() is called with its defaults (Pearson correlation coefficient).
corr = df.corr()
fig, ax = plt.subplots(figsize=(24, 10))
sns.heatmap(corr, cmap='coolwarm_r', annot_kws={'size': 20}, ax=ax)
ax.set_title("Correlation Matrix", fontsize=14)

plt.show()

In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold

# Split the original dataframe into train and test sets BEFORE any resampling,
# so that models built on under-/over-sampled data can later be validated
# against the original data.

X = df.drop(columns='Class')
y = df['Class']

# n_splits=5 gives train : test = 4 : 1 in every fold.
# shuffle=False keeps the rows in their existing order, so no random_state is needed.
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# Only the last of the five folds is used as the train/test split.
folds = list(sss.split(X, y))
train_index, test_index = folds[-1]
original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# The four splits are kept as pandas objects; call `.values` on them if plain
# arrays are needed.

In [None]:
# Random undersampling: keep every fraud row and an equally sized random
# subset of the non-fraud rows. (This is plain random undersampling, not the
# NearMiss algorithm.)

# Draw only from the training portion of the split made earlier. Sampling from
# the whole frame would put rows of `original_Xtest` into the undersampled
# data and contaminate the later evaluation on that hold-out set.
train_df = df.loc[original_Xtrain.index]

# Shuffle into a new name with a fixed seed so this cell gives the same result
# every time it is run and does not overwrite `df`.
shuffled_df = train_df.sample(frac=1, random_state=42)

# Take as many non-fraud rows as there are fraud rows, instead of a
# hard-coded count.
fraud_df = shuffled_df.loc[shuffled_df['Class'] == 1]
non_fraud_df = shuffled_df.loc[shuffled_df['Class'] == 0][:len(fraud_df)]

# Concatenate both classes and shuffle again so they are interleaved.
undersampling_df = pd.concat([fraud_df, non_fraud_df]).sample(frac=1, random_state=42)

In [None]:
# Visualise the class distribution of the undersampled data.

# Count the classes once and reuse the result for both lookups.
u_class_counts = undersampling_df['Class'].value_counts()
u_zero_cnt = u_class_counts[0]
u_one_cnt = u_class_counts[1]
# The column must be passed as the `x` keyword: recent seaborn releases no
# longer accept it as a positional argument alongside `data=`.
sns.countplot(x='Class', data=undersampling_df, palette=colors)
plt.title(f'Equally Distributed Classes \n (0: No Fraud ({u_zero_cnt})'
            f'\n (1: Fraud ({u_one_cnt})', fontsize=14)
plt.show()

In [None]:
# Correlation matrix of the undersampled data.
# DataFrame.corr() is called with its defaults (Pearson correlation coefficient).
plt.figure(figsize=(24,10))

sub_sample_corr = undersampling_df.corr()
# Keep the Axes returned by heatmap() and title that one; no other axis
# variable exists yet when the notebook is run top to bottom.
ax = sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20})
ax.set_title('SubSample Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()
# Read the 'Class' row/column of the heatmap (author's observations):
# V3, V10, V12 and V14 are negatively correlated with Class, i.e. the lower
# these values, the more likely the transaction is fraud.
# V2, V4 and V11 are positively correlated with Class, i.e. the higher these
# values, the more likely the transaction is fraud.

In [None]:
# Inspect outliers of V3, V10, V12 and V14.
f, axes = plt.subplots(ncols=4, figsize=(20,4))

# Negative correlations with Class (the lower the feature value, the more
# likely the transaction is fraud): one box plot per feature, left to right.
for axis, feature in zip(axes, ("V3", "V10", "V12", "V14")):
    sns.boxplot(x="Class", y=feature, data=undersampling_df, palette=colors, ax=axis)
    axis.set_title(f'{feature} vs Class Negative Correlation')

plt.show()

In [None]:
# Inspect outliers of V2, V4 and V11.
f, axes = plt.subplots(ncols=3, figsize=(20,4))

# Positive correlations with Class (the higher the feature value, the more
# likely the transaction is fraud): one box plot per feature, left to right.
for axis, feature in zip(axes, ("V2", "V4", "V11")):
    sns.boxplot(x="Class", y=feature, data=undersampling_df, palette=colors, ax=axis)
    axis.set_title(f'{feature} vs Class Positive Correlation')

plt.show()

In [None]:
# Distribution of V3, V10, V12 and V14 over the fraud rows, each with a fitted
# normal curve.
# Author's observation: only V14 looks roughly normally distributed.
from scipy.stats import norm

f, (ax1, ax2, ax3, ax4) = plt.subplots(1,4, figsize=(20, 6))

# (axis, feature, colour) per panel. Each feature is drawn on the same axis
# that receives its title, so the titles describe the data actually shown.
panels = (
    (ax1, 'V3', '#C5B3F9'),
    (ax2, 'V10', '#C5B3F9'),
    (ax3, 'V12', '#56F9BB'),
    (ax4, 'V14', '#FB8861'),
)

for axis, feature, color in panels:
    # Values of this feature for fraud rows only (Class == 1).
    fraud_values = undersampling_df[feature].loc[undersampling_df['Class'] == 1].values
    sns.distplot(fraud_values, ax=axis, fit=norm, color=color)
    axis.set_title(f'{feature} Distribution \n (Fraud Transactions)', fontsize=14)


plt.show()

In [None]:
# Distribution of V2, V4 and V11 over the fraud rows, each with a fitted
# normal curve.
f, (ax1, ax2, ax3) = plt.subplots(1,3, figsize=(20, 6))

# (axis, feature, colour) in drawing order: V2 on the first panel, V4 on the
# third and V11 on the second.
panels = (
    (ax1, 'V2', '#C5B3F9'),
    (ax3, 'V4', '#FB8861'),
    (ax2, 'V11', '#56F9BB'),
)

is_fraud = undersampling_df['Class'] == 1
for axis, feature, color in panels:
    # Values of this feature for fraud rows only.
    fraud_values = undersampling_df.loc[is_fraud, feature].values
    sns.distplot(fraud_values, ax=axis, fit=norm, color=color)
    axis.set_title(f'{feature} Distribution \n (Fraud Transactions)', fontsize=14)

plt.show()

In [None]:
# Anomaly detection: remove "extreme outliers".
# Interquartile-range rule: a row is dropped when its feature value lies more
# than 1.5 * IQR below the 25th percentile or above the 75th percentile.


def _drop_iqr_outliers(frame, feature, factor=1.5):
    """Return `frame` without the rows that are IQR outliers for `feature`.

    The quartiles are computed from the fraud rows only (Class == 1); the
    resulting bounds are then applied to every row of `frame`.

    Parameters:
        frame: DataFrame containing `feature` and a 'Class' column.
        feature: name of the column to filter on.
        factor: multiple of the IQR that defines the cut-off distance.

    Returns:
        A DataFrame with the out-of-bounds rows removed. `frame` is returned
        unchanged when it contains no fraud rows.
    """
    fraud_values = frame.loc[frame['Class'] == 1, feature].values
    if fraud_values.size == 0:
        # No fraud rows means no quartiles to derive bounds from.
        return frame

    q25, q75 = np.percentile(fraud_values, 25), np.percentile(fraud_values, 75)
    cut_off = (q75 - q25) * factor
    lower, upper = q25 - cut_off, q75 + cut_off

    is_outlier = (frame[feature] > upper) | (frame[feature] < lower)
    return frame.loc[~is_outlier]


# Features are filtered one after another, so each step sees the rows left by
# the previous one. The order matches the original cell.
for feature in ('V2', 'V11', 'V3', 'V10', 'V12', 'V14'):
    undersampling_df = _drop_iqr_outliers(undersampling_df, feature)

In [None]:
# Box plots of V3, V10, V12 and V14 per class, drawn from the current
# undersampled data.
f,(ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(20,8))

# One panel per feature, left to right.
for axis, feature in zip((ax1, ax2, ax3, ax4), ("V3", "V10", "V12", "V14")):
    sns.boxplot(x="Class", y=feature, data=undersampling_df, ax=axis, palette=colors)
    axis.set_title(f"{feature} Feature \n Reduction of outliers", fontsize=14)

plt.show()

In [None]:
# Box plots of V2, V11 and V19 per class, drawn from the current undersampled
# data.
f,(ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20,8))

# NOTE(review): the outlier-removal cell filters V2 and V11 but not V19, so the
# "Reduction of outliers" title on the V19 panel may be misleading - confirm
# whether V19 (or V4) was intended here.
for axis, feature in zip((ax1, ax2, ax3), ("V2", "V11", "V19")):
    sns.boxplot(x="Class", y=feature, data=undersampling_df,ax=axis, palette=colors)
    axis.set_title(f"{feature} Feature \n Reduction of outliers", fontsize=14)

# Render the figure explicitly, as the other plotting cells do; without it the
# cell's last expression echoes a Text(...) repr as output.
plt.show()

In [None]:
# 차원 줄이기: t-sne, pca, truncatedSVD
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD

# Separate the features from the label in the undersampled data.
x, y = undersampling_df.drop(columns='Class'), undersampling_df['Class']

# Only PCA is currently enabled; the t-SNE and truncated-SVD variants are kept
# below for reference.
# x_reduced_tsne = TSNE(n_components=2, random_state=42).fit_transform(x.values)
pca = PCA(n_components=10, random_state=42)
x_reduced_pca = pca.fit_transform(x.values)
# x_reduced_svd = TruncatedSVD(n_components=2, algorithm='randomized', random_state=42).fit_transform(x.values)

In [None]:
# Visualise the dimensionality-reduced data.
import matplotlib.patches as mpatches

f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24,6))
# labels = ['No Fraud', 'Fraud']
f.suptitle('Clusters using Dimensionality Reduction', fontsize=14)
# Legend handles; the colours stand for the two ends of the 'coolwarm' map.
blue_patch = mpatches.Patch(color='#0A0AFF', label='No Fraud')
red_patch = mpatches.Patch(color='#AF0000', label='Fraud')

# t-SNE (disabled; x_reduced_tsne is not computed)
# ax1.scatter(x_reduced_tsne[:,0], x_reduced_tsne[:,1], c=(y == 1), cmap='coolwarm', linewidths=2)
# ax1.set_title('t-SNE', fontsize=14)
# ax1.grid(True)
# ax1.legend(handles=[blue_patch, red_patch])

# PCA: first two components, coloured by class. A single scatter is enough:
# the previous extra scatter with c=(y == 0) used inverted colours and was
# completely painted over by this one.
ax2.scatter(x_reduced_pca[:,0], x_reduced_pca[:,1], c=(y == 1), cmap='coolwarm', linewidths=2)
ax2.set_title('PCA', fontsize=14)
ax2.grid(True)
ax2.legend(handles=[blue_patch, red_patch])

# Truncated SVD (disabled; x_reduced_svd is not computed)
# ax3.scatter(x_reduced_svd[:,0], x_reduced_svd[:,1], c=(y == 1), cmap='coolwarm', linewidths=2)
# ax3.set_title('Truncated SVD', fontsize=14)
# ax3.grid(True)
# ax3.legend(handles=[blue_patch, red_patch])

plt.show()

In [None]:
# Undersampling KNN, not dimensional reduction

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score

# Running bests for the n_neighbors search below: the k with the highest
# recall, and the k with the lowest false-negative rate.
best_parameter, best_recall_score = 0, 0
lowest_FN_parameter, lowest_FN = 0, 1

def cal_FN(arr1, arr2):
    """Return the share of samples that are false negatives.

    A false negative is a position where the true label is 1 and the
    predicted label is 0. The count is divided by the total number of
    samples (not by the number of positives).

    Parameters:
        arr1: true labels; any array-like (Series, ndarray, list). Values are
            compared by position, so a Series index is ignored.
        arr2: predicted labels; array-like of the same length as `arr1`.

    Returns:
        float in [0, 1]; 0.0 when the inputs are empty.

    Raises:
        ValueError: if `arr1` and `arr2` differ in length.
    """
    y_true = np.asarray(arr1)
    y_pred = np.asarray(arr2)
    if y_true.shape != y_pred.shape:
        raise ValueError('arr1 and arr2 must have the same length')

    size = len(y_true)
    if size == 0:
        # Nothing to count; avoid dividing by zero.
        return 0.0

    cnt = int(np.sum((y_true == 1) & (y_pred == 0)))
    return cnt / size


# The features, labels and train/test split do not depend on k, so they are
# built once instead of on every iteration.
x = undersampling_df.drop('Class', axis=1)
y = undersampling_df['Class']

# test_size: share of the data held out for testing (default 0.25).
# random_state: seed for the shuffle done before splitting, so the split is
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fitted model with the best recall seen so far; None until one is found.
best_clf = None

for i in range(1,20):
    candidate = KNeighborsClassifier(n_neighbors=i)
    candidate.fit(x_train, y_train)
    y_pred = candidate.predict(x_test)
    tmp_FN = cal_FN(y_test, y_pred)

    if lowest_FN > tmp_FN:
        lowest_FN = tmp_FN
        lowest_FN_parameter = i

    tmp_recall_score = recall_score(y_test, y_pred)
    if best_recall_score < tmp_recall_score:
        best_parameter = i
        best_recall_score = tmp_recall_score
        best_clf = candidate

# Expose the best model as `clf` for the cells that follow. Previously `clf`
# was whatever the last iteration left behind (k=19), regardless of the search
# result. Falls back to the last candidate if no k reached a recall above 0.
clf = best_clf if best_clf is not None else candidate

print(lowest_FN_parameter, lowest_FN)
print(best_parameter, best_recall_score)

In [None]:
# Metric definitions:
#   Recall    = TP / (TP + FN)
#   Precision = TP / (TP + FP)
#   Accuracy  = (TP + TN) / (TP + FP + FN + TN)
#   F1 score
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score

# Predict on the test fold taken from the original data.
y_pred = clf.predict(original_Xtest)

# Print each metric to four decimals, in a fixed order.
reported_metrics = (
    ('Recall', recall_score),
    ('Precision', precision_score),
    ('F1', f1_score),
    ('Accuracy', accuracy_score),
)
for metric_name, metric_fn in reported_metrics:
    print(f'{metric_name} Score: {metric_fn(original_ytest, y_pred):.4f}')

In [None]:
# Check the scores with 5-fold cross validation: recall, precision and F1.

# cross_val_score is imported here because no earlier cell imports it.
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
# Keep the first column of the binarised labels as a flat array.
y_train = lb.fit_transform(y_train)[:, 0]

# `classifier` and `X_train` were never defined in this notebook; the KNN
# model `clf` and its training features `x_train` are used instead.
# NOTE(review): assumes the KNN model is the estimator meant to be
# cross-validated here - confirm.
for label, scoring in (('Recall', 'recall'), ('Precision', 'precision'), ('F1', 'f1')):
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring=scoring)
    print(label, np.mean(scores), scores)