<a href="https://colab.research.google.com/github/sungchan1/goingSaboho/blob/gwangseok/credit_fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler

# Load the credit-card transactions and standardise the 'Time' and 'Amount' columns.
df = pd.read_csv('/content/drive/MyDrive/Credit_fraud/data/creditcard.csv')
std_scaler, rob_scaler = StandardScaler(), RobustScaler()

# Each column is fitted and transformed on its own with the same scaler object,
# so once the loop ends std_scaler holds the fit of 'Amount' (the last column).
for scaled_column in ('Time', 'Amount'):
    column_2d = df[scaled_column].values.reshape(-1, 1)  # scaler expects a 2-D input
    df[scaled_column] = std_scaler.fit_transform(column_2d)

In [None]:
# Visualise the class distribution of the full dataset.

import seaborn as sns
import matplotlib.pyplot as plt


# Row counts per class and their share of the whole dataset (in percent).
row_cnt = len(df)
class_counts = df['Class'].value_counts()  # computed once, indexed by class label
zero_cnt = class_counts[0]
one_cnt = class_counts[1]
ratio_no_fraud = round(zero_cnt/row_cnt * 100,2)
ratio_fraud = round(one_cnt/row_cnt * 100,2)


# Palette shared with the later plots: first colour for class 0, second for class 1.
colors = ["#0101DF", "#DF0101"]
# The column must be passed as x=...: since seaborn 0.12 the only positional
# argument accepted by countplot is `data`, so countplot('Class', data=df) fails.
sns.countplot(x='Class', data=df, palette=colors)
plt.title(f'Class Distributions \n (0: No Fraud ({zero_cnt}, {ratio_no_fraud} %) '
          f'\n (1: Fraud ({one_cnt}, {ratio_fraud} %))', fontsize=14)

plt.show()

In [None]:
# Correlation matrix of the full dataset.

# Open the (wide) figure first; the heatmap below draws onto it.
plt.figure(figsize=(24, 10))

# DataFrame.corr() computes pairwise Pearson correlation coefficients.
corr = df.corr()
ax = sns.heatmap(data=corr, cmap='coolwarm_r', annot_kws={'size': 20})
ax.set_title("Correlation Matrix", fontsize=14)

plt.show()

In [None]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold

# Split the original dataframe into train and test sets BEFORE any under-sampling,
# so that models built on under- or over-sampled data can be validated on original data.

X = df.drop(columns='Class')
y = df['Class']

# random_state : fixing it makes repeated runs give the same result (only relevant when shuffling)
# shuffle      : whether to shuffle the rows before splitting
# n_splits=5   : each fold holds out 1/5 of the rows, i.e. train : test = 4 : 1
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# All folds are generated, but only the indices of the LAST fold are kept
# (the same outcome as looping over the folds and overwriting on every pass).
fold_indices = list(sss.split(X, y))
train_index, test_index = fold_indices[-1]

# Select the rows of that fold and convert them straight to NumPy arrays.
original_Xtrain = X.iloc[train_index].values
original_Xtest = X.iloc[test_index].values
original_ytrain = y.iloc[train_index].values
original_ytest = y.iloc[test_index].values

In [None]:
# Under-sampling: build a balanced subset with as many non-fraud rows as fraud rows.
# NOTE: this is plain random under-sampling (shuffle, then take the first rows),
# not the NearMiss algorithm.

# Fixed seed so that Restart & Run All reproduces the same subset.
UNDERSAMPLE_SEED = 42

# sample(frac=1) returns every row in random order, i.e. it shuffles the frame.
df = df.sample(frac=1, random_state=UNDERSAMPLE_SEED)

# Keep every fraud row and the same number of (shuffled) non-fraud rows.
# The count is taken from the data instead of being hard-coded.
fraud_df = df.loc[df['Class'] == 1]
non_fraud_df = df.loc[df['Class'] == 0][:len(fraud_df)]

# pd.concat stacks the two frames; shuffle once more so the classes are interleaved.
undersampling_df = pd.concat([fraud_df, non_fraud_df]).sample(frac=1, random_state=UNDERSAMPLE_SEED)

In [None]:
# Visualise the class distribution of the under-sampled data.

u_class_counts = undersampling_df['Class'].value_counts()  # computed once, indexed by class label
u_zero_cnt = u_class_counts[0]
u_one_cnt = u_class_counts[1]
# The column must be passed as x=...: since seaborn 0.12 the only positional
# argument accepted by countplot is `data`.
sns.countplot(x='Class', data=undersampling_df, palette=colors)
plt.title(f'Equally Distributed Classes \n (0: No Fraud ({u_zero_cnt})'
            f'\n (1: Fraud ({u_one_cnt})', fontsize=14)
plt.show()

In [None]:
# Correlation matrix of the under-sampled data.
# DataFrame.corr() computes pairwise Pearson correlation coefficients.
plt.figure(figsize=(24,10))

sub_sample_corr = undersampling_df.corr()
# Title the Axes returned by heatmap itself. The previous code called
# ax2.set_title, but no `ax2` exists at this point of a top-to-bottom run.
sub_ax = sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20})
sub_ax.set_title('SubSample Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()
# Author's reading of the 'Class' row of the heatmap:
# V10, V12, V14 and V17 are negatively correlated with Class: the lower the value, the more likely fraud.
# V2, V4, V11 and V19 are positively correlated with Class: the higher the value, the more likely fraud.

In [None]:
# Inspect V10, V12, V14 and V17 for outliers, one box plot per feature.
f, axes = plt.subplots(ncols=4, figsize=(20,4))

# Negative correlations with Class (the lower the feature value, the more
# likely the transaction is a fraud).
for neg_panel, neg_feature in zip(axes, ("V10", "V12", "V14", "V17")):
    sns.boxplot(x="Class", y=neg_feature, data=undersampling_df, palette=colors, ax=neg_panel)
    neg_panel.set_title(f'{neg_feature} vs Class Negative Correlation')

plt.show()

In [None]:
# Inspect V2, V4, V11 and V19 for outliers, one box plot per feature.
f, axes = plt.subplots(ncols=4, figsize=(20,4))

# Positive correlations with Class (the higher the feature value, the more
# likely the transaction is a fraud).
for pos_panel, pos_feature in zip(axes, ("V2", "V4", "V11", "V19")):
    sns.boxplot(x="Class", y=pos_feature, data=undersampling_df, palette=colors, ax=pos_panel)
    pos_panel.set_title(f'{pos_feature} vs Class Positive Correlation')

plt.show()

In [None]:
# Distribution of V10, V12, V14 and V17 among fraud rows, each overlaid with a fitted normal curve.
# Author's observation: only V14 looks roughly normal.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; kept as-is here.
from scipy.stats import norm

f, (ax1, ax2, ax3, ax4) = plt.subplots(1,4, figsize=(20, 6))

# Values of each feature restricted to the fraud class (Class == 1).
neg_fraud_mask = undersampling_df['Class'] == 1
v10_fraud_dist = undersampling_df.loc[neg_fraud_mask, 'V10'].values
v14_fraud_dist = undersampling_df.loc[neg_fraud_mask, 'V14'].values
v12_fraud_dist = undersampling_df.loc[neg_fraud_mask, 'V12'].values
v17_fraud_dist = undersampling_df.loc[neg_fraud_mask, 'V17'].values

# (values, target axes, colour, feature label) for each panel.
neg_panels = (
    (v10_fraud_dist, ax1, '#C5B3F9', 'V10'),
    (v14_fraud_dist, ax3, '#FB8861', 'V14'),
    (v12_fraud_dist, ax2, '#56F9BB', 'V12'),
    (v17_fraud_dist, ax4, '#C5B3F9', 'V17'),
)
for neg_values, neg_ax, neg_color, neg_label in neg_panels:
    sns.distplot(neg_values, ax=neg_ax, fit=norm, color=neg_color)
    neg_ax.set_title(f'{neg_label} Distribution \n (Fraud Transactions)', fontsize=14)

plt.show()

In [None]:
# Distribution of V2, V4, V11 and V19 among fraud rows, each overlaid with a fitted normal curve.
f, (ax1, ax2, ax3, ax4) = plt.subplots(1,4, figsize=(20, 6))

# Values of each feature restricted to the fraud class (Class == 1).
pos_fraud_mask = undersampling_df['Class'] == 1
v2_fraud_dist = undersampling_df.loc[pos_fraud_mask, 'V2'].values
v4_fraud_dist = undersampling_df.loc[pos_fraud_mask, 'V4'].values
v11_fraud_dist = undersampling_df.loc[pos_fraud_mask, 'V11'].values
v19_fraud_dist = undersampling_df.loc[pos_fraud_mask, 'V19'].values

# (values, target axes, colour, feature label) for each panel.
pos_panels = (
    (v2_fraud_dist, ax1, '#C5B3F9', 'V2'),
    (v4_fraud_dist, ax3, '#FB8861', 'V4'),
    (v11_fraud_dist, ax2, '#56F9BB', 'V11'),
    (v19_fraud_dist, ax4, '#C5B3F9', 'V19'),
)
for pos_values, pos_ax, pos_color, pos_label in pos_panels:
    sns.distplot(pos_values, ax=pos_ax, fit=norm, color=pos_color)
    pos_ax.set_title(f'{pos_label} Distribution \n (Fraud Transactions)', fontsize=14)

plt.show()

In [None]:
# Anomaly detection: remove "extreme outliers" with the interquartile-range (IQR) rule.
# For each feature, rows whose value lies more than 1.5 * IQR below the 25th
# percentile or above the 75th percentile are dropped from undersampling_df.
# The features are processed in sequence, so each step sees the rows that
# survived the previous ones.
#
# NOTE(review): the fences for V2, V11, V19 and V10 are computed from the rows of
# both classes, whereas those for V12 and V14 use fraud rows (Class == 1) only.
# The *_fraud variable names suggest fraud-only was intended everywhere; confirm
# before changing, since it alters which rows are removed.


def _drop_outside(frame, column, lower, upper):
    """Return `frame` without the rows whose `column` value is < lower or > upper."""
    outside = (frame[column] > upper) | (frame[column] < lower)
    return frame.drop(frame[outside].index)


# Remove V2 outliers
v2_fraud = undersampling_df['V2']
q25, q75 = np.percentile(v2_fraud, 25), np.percentile(v2_fraud, 75)
v2_iqr = q75 - q25
# Fixed: this used v10_iqr, which is only defined further down (NameError on a
# fresh kernel, a stale V10 value on re-run).
v2_cut_off = v2_iqr * 1.5
v2_lower, v2_upper = q25 - v2_cut_off, q75 + v2_cut_off
outliers = [x for x in v2_fraud if x < v2_lower or x > v2_upper]
undersampling_df = _drop_outside(undersampling_df, 'V2', v2_lower, v2_upper)

# Remove V11 outliers
v11_fraud = undersampling_df['V11']
q25, q75 = np.percentile(v11_fraud, 25), np.percentile(v11_fraud, 75)
v11_iqr = q75 - q25
v11_cut_off = v11_iqr * 1.5
v11_lower, v11_upper = q25 - v11_cut_off, q75 + v11_cut_off
outliers = [x for x in v11_fraud if x < v11_lower or x > v11_upper]
undersampling_df = _drop_outside(undersampling_df, 'V11', v11_lower, v11_upper)

# Remove V19 outliers
v19_fraud = undersampling_df['V19']
q25, q75 = np.percentile(v19_fraud, 25), np.percentile(v19_fraud, 75)
v19_iqr = q75 - q25
v19_cut_off = v19_iqr * 1.5
v19_lower, v19_upper = q25 - v19_cut_off, q75 + v19_cut_off
outliers = [x for x in v19_fraud if x < v19_lower or x > v19_upper]
undersampling_df = _drop_outside(undersampling_df, 'V19', v19_lower, v19_upper)


# Remove V10 outliers
v10_fraud = undersampling_df['V10']
q25, q75 = np.percentile(v10_fraud, 25), np.percentile(v10_fraud, 75)
v10_iqr = q75 - q25
v10_cut_off = v10_iqr * 1.5
v10_lower, v10_upper = q25 - v10_cut_off, q75 + v10_cut_off
outliers = [x for x in v10_fraud if x < v10_lower or x > v10_upper]
undersampling_df = _drop_outside(undersampling_df, 'V10', v10_lower, v10_upper)

# Remove V12 outliers (fences computed from fraud rows only)
v12_fraud = undersampling_df['V12'].loc[undersampling_df['Class'] == 1].values
q25, q75 = np.percentile(v12_fraud, 25), np.percentile(v12_fraud, 75)
v12_iqr = q75 - q25
v12_cut_off = v12_iqr * 1.5
v12_lower, v12_upper = q25 - v12_cut_off, q75 + v12_cut_off
outliers = [x for x in v12_fraud if x < v12_lower or x > v12_upper]
undersampling_df = _drop_outside(undersampling_df, 'V12', v12_lower, v12_upper)

# Remove V14 outliers (fences computed from fraud rows only)
v14_fraud = undersampling_df['V14'].loc[undersampling_df['Class'] == 1].values
q25, q75 = np.percentile(v14_fraud, 25), np.percentile(v14_fraud, 75)
v14_iqr = q75 - q25
v14_cut_off = v14_iqr * 1.5
v14_lower, v14_upper = q25 - v14_cut_off, q75 + v14_cut_off
outliers = [x for x in v14_fraud if x < v14_lower or x > v14_upper]
undersampling_df = _drop_outside(undersampling_df, 'V14', v14_lower, v14_upper)

In [None]:
# Box plots of the six filtered features after outlier removal, on a 2 x 3 grid.
f,([ax1, ax2, ax3], [ax4, ax5, ax6]) = plt.subplots(2, 3, figsize=(20,16))

# Panels are filled row by row: V10, V12, V14 on top; V2, V11, V19 below.
reduced_panels = (ax1, ax2, ax3, ax4, ax5, ax6)
reduced_features = ("V10", "V12", "V14", "V2", "V11", "V19")
for reduced_ax, reduced_feature in zip(reduced_panels, reduced_features):
    sns.boxplot(x="Class", y=reduced_feature, data=undersampling_df, ax=reduced_ax, palette=colors)
    reduced_ax.set_title(f"{reduced_feature} Feature \n Reduction of outliers", fontsize=14)

plt.show()

In [None]:
# 차원 줄이기
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA