<a href="https://colab.research.google.com/github/sungchan1/goingSaboho/blob/gwangseok/Creditcard_Fraud_oversampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset path used later in this notebook can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold
from sklearn.model_selection import GridSearchCV, ShuffleSplit, learning_curve, cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier



# Load the credit-card transaction table from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Credit_fraud/data/creditcard.csv')

# One scaler object is re-fitted for each column in turn ('Time' first, then
# 'Amount'), so after this cell `std_scaler` holds the statistics of 'Amount'.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so statistics of the future test rows influence the transform.
std_scaler = StandardScaler()
for column_name in ('Time', 'Amount'):
    column_2d = df[column_name].values.reshape(-1, 1)
    df[column_name] = std_scaler.fit_transform(column_2d)

In [None]:
# Separate the feature columns from the target column 'Class'.
x = df.drop('Class', axis=1)
y = df['Class']

# `stratify=y` keeps the class ratio the same in the train and test sets.
# train : test = 4 : 1 (test_size=0.2).
# Author's note: this gives 394 fraud rows in train and 98 in test.
# `random_state` is fixed so the split (and every metric computed from it) is
# reproducible between runs, matching the seeded SMOTE step below.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=0
)

In [None]:
# Oversample the training split with SMOTE.
# Only the training split is resampled; x_test / y_test are left untouched.
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias was deprecated and later removed, so calling it raises AttributeError
# on recent versions.
smote = SMOTE(random_state=0)
x_train_over, y_train_over = smote.fit_resample(x_train, y_train)

In [None]:
# Recall    = TP / (TP + FN)
# Precision = TP / (TP + FP)
# Accuracy  = (TP + TN) / (TP + FP + FN + TN)
# F1 score  = 2 * (Precision * Recall) / (Precision + Recall)
def print_metric(y_test, y_pred, y_score=None):
    """Print recall, precision, F1, accuracy and ROC AUC for a binary classifier.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted hard class labels; used for recall, precision, F1
            and accuracy.
        y_score: Optional continuous scores for the positive class (for
            example ``predict_proba(...)[:, 1]`` or ``decision_function(...)``).
            When given, ROC AUC is computed from these scores. When omitted,
            ROC AUC falls back to ``y_pred`` exactly as before; with hard
            labels the ROC curve has a single operating point, so the value
            is less informative than a score-based AUC.

    Returns:
        None. The metrics are written to standard output.
    """
    # Use real scores for AUC when the caller supplies them; otherwise keep
    # the original behaviour of scoring the thresholded predictions.
    auc_input = y_pred if y_score is None else y_score
    print(f"Recall Score: {recall_score(y_test, y_pred)}")
    print(f"Precision Score: {precision_score(y_test, y_pred)}")
    print(f"F1 Score: {f1_score(y_test, y_pred)}")
    print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
    print(f"AUC: {roc_auc_score(y_test, auc_input, average='macro')}")

In [None]:
def plot_learning_curve(estimator, x, y, ylim=None, cv=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5), s=None):
    """Plot training and cross-validation scores against training-set size.

    Draws on the current matplotlib figure and calls ``plt.show()``.

    Args:
        estimator: Estimator passed to ``sklearn.model_selection.learning_curve``.
        x: Training features.
        y: Training labels.
        ylim: Optional ``(low, high)`` tuple limiting the y-axis range.
        cv: Cross-validation strategy forwarded to ``learning_curve``
            (``None`` lets scikit-learn use its default).
        n_jobs: Number of parallel jobs forwarded to ``learning_curve``.
        train_sizes: Relative or absolute training-set sizes to evaluate;
            the default is five fractions evenly spaced from 0.1 to 1.0.
        s: Scoring name or callable forwarded as ``scoring``.

    Returns:
        None.
    """
    if ylim is not None:
        plt.ylim(*ylim)

    # `learning_curve` returns the absolute sizes actually used; keep them in
    # a separate name instead of overwriting the `train_sizes` argument.
    sizes_used, train_scores, test_scores = learning_curve(
        estimator, x, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=s
    )
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    # Shade the band of mean +/- one standard deviation for each curve.
    plt.fill_between(sizes_used, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="#ff9124")
    plt.fill_between(sizes_used, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="#2492ff")
    plt.plot(sizes_used, train_scores_mean, 'o-', color="#ff9124", label="Training score")
    plt.plot(sizes_used, test_scores_mean, 'o-', color="#2492ff", label="Cross-validation score")
    plt.xlabel('Training size')

    # Label the y-axis after the metric actually plotted; the label used to
    # be hard-coded to 'F1 Score' even when another scorer was requested.
    if s == 'f1':
        y_label = 'F1 Score'
    elif isinstance(s, str):
        y_label = f'Score ({s})'
    else:
        y_label = 'Score'
    plt.ylabel(y_label)

    # Show grid lines.
    plt.grid(True)
    # Let matplotlib pick the legend position automatically.
    plt.legend(loc="best")
    plt.show()

# Shared splitter for every learning curve below: 5 random 80/20 splits.
# Seeded so the curves are reproducible and comparable across models.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)

In [None]:
# Visualise the class distribution of the full dataset.
row_cnt = len(df)
# Count each class once and reuse the result for both labels.
class_counts = df['Class'].value_counts()
zero_cnt = class_counts[0]
one_cnt = class_counts[1]
ratio_no_fraud = round(zero_cnt/row_cnt * 100,2)
ratio_fraud = round(one_cnt/row_cnt * 100,2)


colors = ["#0101DF", "#DF0101"]
# The column must be passed as `x=`: recent seaborn releases take `data` as
# the only positional argument, so a positional 'Class' together with
# `data=df` raises TypeError.
sns.countplot(x='Class', data=df, palette=colors)
plt.title(f'Class Distributions \n (0: No Fraud ({zero_cnt}, {ratio_no_fraud} %) '
          f'\n (1: Fraud ({one_cnt}, {ratio_fraud} %))', fontsize=14)

plt.show()

In [None]:
# Correlation matrix of the full dataset.
# DataFrame.corr() uses the Pearson correlation coefficient by default.
plt.figure(figsize=(24, 10))
corr = df.corr()
ax = sns.heatmap(
    data=corr,
    cmap='coolwarm_r',
    annot_kws={'size': 20},
)
ax.set_title("Correlation Matrix", fontsize=14)

plt.show()

In [None]:
# Rebuild one table from the SMOTE-resampled features and labels; the label
# ends up as the last column.
new_df = pd.concat([pd.DataFrame(x_train_over), pd.DataFrame(y_train_over)], axis=1)
label = new_df.iloc[:,-1]
# Count each class once and reuse the result for both labels.
label_counts = label.value_counts()
u_zero_cnt = label_counts[0]
u_one_cnt = label_counts[1]
colors = ["#0101DF", "#DF0101"]
# Pass the labels as `x=`: recent seaborn releases treat the first positional
# argument as `data`, so a positional Series is no longer read as the x
# variable.
sns.countplot(x=label, palette=colors)
plt.title(f'Equally Distributed Classes \n (0: No Fraud ({u_zero_cnt})'
            f'\n (1: Fraud ({u_one_cnt})', fontsize=14)
plt.show()

In [None]:
# Pearson correlation matrix (pandas `corr` default) of the SMOTE-oversampled
# training table.
plt.figure(figsize=(24,10))

sub_sample_corr = new_df.corr()
ax=sns.heatmap(sub_sample_corr, cmap='coolwarm_r', annot_kws={'size':20})
# The table plotted here is the oversampled set, so the title says so; it
# used to read "UnderSampling".
ax.set_title('OverSampling Correlation Matrix \n (use for reference)', fontsize=14)
plt.show()
# Reading the 'Class' row/column of the heatmap (author's observations):
# V3, V10, V12 and V14 correlate negatively with Class, i.e. lower values
# go with fraud.
# V2, V4 and V11 correlate positively with Class, i.e. higher values go
# with fraud.

In [None]:
# Tune the KNN hyper-parameters with an exhaustive grid search plus
# cross-validation.
knc_params = {
    "n_neighbors": list(range(1, 11)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# GridSearchCV arguments:
#   cv=5       -> every parameter combination is scored with 5-fold CV.
#   refit=True -> after the search, the best combination is re-trained on
#                 all the data passed to fit(), so the fitted search object
#                 can be used directly as the tuned model.
#   scoring    -> candidates are ranked by F1.
# NOTE(review): the folds are drawn from the SMOTE-resampled data, so the
# validation folds contain synthetic samples; the CV score may be optimistic.
grid_knc = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knc_params,
    scoring='f1',
    cv=5,
    refit=True,
)

# Fits one model per parameter combination and fold.
grid_knc.fit(x_train_over, y_train_over)

# The winning combination and its mean CV score are stored on the search
# object.
print(f"최적의 파라미터 : {grid_knc.best_params_}")
print(f"최적의 파라미터 모델의 F1 : {grid_knc.best_score_}")

In [None]:
# Train a KNN classifier with a fixed n_neighbors=3 rather than taking
# grid_knc.best_estimator_.
# Author's observation: as n_neighbors grows, precision rises while recall
# falls.
knc = KNeighborsClassifier(n_neighbors=3).fit(x_train_over, y_train_over)

# Evaluate on the untouched (not oversampled) test split.
y_pred = knc.predict(x_test)
print_metric(y_test, y_pred)

In [None]:
# Learning curve (F1) for KNN with n_neighbors=3 on the oversampled training
# data.
plt.figure(figsize=(10, 7))
plt.title("KNN Learning Curve", fontsize=14)
plot_learning_curve(
    estimator=KNeighborsClassifier(n_neighbors=3),
    x=x_train_over,
    y=y_train_over,
    ylim=(0.80, 1.01),
    cv=cv,
    n_jobs=-1,
    s='f1',
)

In [None]:
# Logistic regression with very strong regularisation (C is the inverse
# regularisation strength) to guard against overfitting; the author settled
# on roughly 1e-6.
lr = LogisticRegression(C=1e-6).fit(x_train_over, y_train_over)

# Evaluate on the untouched test split.
y_pred = lr.predict(x_test)
print_metric(y_test, y_pred)

In [None]:
# Learning curve (F1) for the logistic regression model defined above.
plt.figure(figsize=(10, 7))
plt.title("Logistic Regression Learning Curve", fontsize=14)
plot_learning_curve(
    estimator=lr,
    x=x_train_over,
    y=y_train_over,
    ylim=(0.80, 1.01),
    cv=cv,
    s='f1',
)

In [None]:
# Linear-kernel SVM with a small C (strong regularisation), trained on the
# oversampled data.
# NOTE(review): kernel SVC fitting time grows quickly with the sample count;
# expect this cell to be slow on the oversampled training set.
svc = SVC(C=1e-4, kernel='linear').fit(x_train_over, y_train_over)

# Evaluate on the untouched test split.
y_pred = svc.predict(x_test)
print_metric(y_test, y_pred)

In [None]:
# Learning curve (F1) for the linear SVM defined above.
plt.figure(figsize=(10, 7))
plt.title("Support Vector Machine Learning Curve", fontsize=14)
plot_learning_curve(
    estimator=svc,
    x=x_train_over,
    y=y_train_over,
    ylim=(0.80, 1.01),
    cv=cv,
    s='f1',
)

In [None]:
# Gradient-boosted tree classifier: 1000 boosting rounds, up to 64 leaves per
# tree, using all available cores (n_jobs=-1).
# NOTE(review): `application` looks like LightGBM's alias for `objective` --
# confirm against the installed version.
lgb = LGBMClassifier(n_estimators=1000,num_leaves=64,n_jobs=-1,boost_from_average=False, application='binary')
# Train on the SMOTE-oversampled training data; as the last expression of the
# cell, the fitted estimator is also displayed by the notebook.
lgb.fit(x_train_over, y_train_over)

In [None]:
# Predict hard class labels for the untouched test split and print the
# metrics.
y_pred = lgb.predict(x_test)
print_metric(y_test, y_pred)

In [None]:
# Learning curve (F1) for the LightGBM model defined above.
plt.figure(figsize=(10, 7))
plt.title("LightGBM Learning Curve", fontsize=14)
plot_learning_curve(
    estimator=lgb,
    x=x_train_over,
    y=y_train_over,
    ylim=(0.80, 1.01),
    cv=cv,
    s='f1',
)

In [None]:
# Bare expression: as the last line of a notebook cell it displays the
# oversampled training features and labels for inspection.
x_train_over, y_train_over

In [None]:
# Keras feed-forward binary classifier for the oversampled training data.
# The package name was misspelled as `tensorlow` in every `from` import, and
# BatchNormalization was imported from a non-existent module; it lives in
# tensorflow.keras.layers.
import tensorflow as tf
import numpy as np
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Flatten, Dropout, InputLayer, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import FalseNegatives, FalsePositives, TrueNegatives
from tensorflow.keras.metrics import TruePositives, Precision, Recall, Accuracy, BinaryAccuracy

# Metrics to track during training: the four confusion-matrix counts plus
# precision, recall and accuracy.
metrics = [
    FalseNegatives(name="fn"),
    FalsePositives(name="fp"),
    TrueNegatives(name="tn"),
    TruePositives(name="tp"),
    Precision(name="precision"),
    Recall(name="recall"),
    # BinaryAccuracy thresholds the sigmoid output before comparing with the
    # label; plain `Accuracy` tests exact equality between the raw
    # probability and the 0/1 label and would report ~0.
    BinaryAccuracy(name='accuracy')
]

# Three hidden layers of 256 ReLU units with dropout after the second and
# third, followed by one sigmoid unit that outputs the fraud probability.
# The input width is the number of feature columns in the oversampled
# training set (the undefined name `train_features` was used here before).
model = Sequential(
    [
        Dense(256, activation="relu", input_shape=(x_train_over.shape[-1],)),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(256, activation="relu"),
        Dropout(0.3),
        Dense(1, activation="sigmoid"),
    ]
)
model.summary()