In [8]:
import shutil

import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.combine import SMOTEENN,SMOTETomek

Using TensorFlow backend.


In [9]:
# Load the raw transactions; every column is parsed as float32.
data = pd.read_csv(
    'creditcard.csv',
    encoding='utf-8',
    dtype='float32',
)

In [10]:
## Data setup
# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable.
df_train, df_test = train_test_split(
    data,
    test_size=0.2,
    random_state=27,
)
print(f'df_train : {df_train.shape}')
print(f'df_test : {df_test.shape}')

df_train : (227845, 31)
df_test : (56962, 31)


In [11]:
# Carve 10% of the remaining training rows off as a validation set.
# NOTE: this rebinds df_train, so re-running the cell shrinks it again.
df_train, df_valid = train_test_split(
    df_train,
    test_size=0.1,
    random_state=27,
)
print(f'df_train : {df_train.shape}')
print(f'df_valid : {df_valid.shape}')
print(f'df_test : {df_test.shape}')
# Class balance of the training rows.
print('df_train Class: \n', df_train['Class'].value_counts())

df_train : (205060, 31)
df_valid : (22785, 31)
df_test : (56962, 31)
df_train Class: 
 0.0    204712
1.0       348
Name: Class, dtype: int64


In [12]:
### Preprocess

In [13]:
##### 1. OUTLIER deletion in train data => visual analysis shows the fraud users fall inside the normal users' threshold range
###### i.e., choosing some threshold and dropping the rows outside it does not seem to matter much
def outlier_treatment(df,col,beta):
    """Keep only the rows whose ``col`` value lies strictly inside an IQR fence.

    The fence is ``(q1 - beta * IQR, q3 + beta * IQR)``, where ``q1`` and
    ``q3`` are the 25th and 75th percentiles of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    col : str
        Name of the numeric column the fence is computed on.
    beta : float
        Multiplier applied to the IQR on both sides of the quartiles.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with ``lower < df[col] < upper``.

    Notes
    -----
    Both comparisons are strict, so a column whose IQR is zero yields an
    empty frame (the lower and upper bounds coincide).
    """
    # Only two quartiles of one column are needed, so compute exactly those
    # instead of describing every column of the frame. The float64 upcast
    # keeps the fence arithmetic in double precision.
    quartiles = df[col].quantile([0.25, 0.75]).astype('float64')
    q1, q3 = quartiles.iloc[0], quartiles.iloc[1]
    iqr = q3 - q1
    lower_range = q1 - (beta * iqr)
    upper_range = q3 + (beta * iqr)
    inside_fence = (df[col] > lower_range) & (df[col] < upper_range)
    return df[inside_fence]

def cleaning_df(df,cols,beta=10):
    """Apply ``outlier_treatment`` to ``df`` once per column in ``cols``.

    The filters are applied one after another, so each column's fence is
    computed on the rows that survived the previous columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.
    cols : iterable of str
        Columns to filter on, in order.
    beta : float, default 10
        IQR multiplier forwarded to ``outlier_treatment``.

    Returns
    -------
    pandas.DataFrame
        The frame remaining after all column filters.
    """
    cleaned = df
    for column in cols:
        cleaned = outlier_treatment(cleaned, column, beta=beta)
    print('outlier delete')
    return cleaned

In [14]:
##### 2. feature engineering
def add_engineered_features(features):
    """Return a float32 copy of ``features`` with ten derived columns appended.

    Added columns, in order: five pairwise sums, the squared difference
    ``(V3 - V2) ** 2``, three squares (``V8``, ``V2``, ``V17``) and the
    difference ``V17 - V11``. The input frame is left untouched.
    """
    out = features.astype('float32')

    # (new column, left operand, right operand) -> left + right
    pair_sums = (
        ('V14pV12', 'V14', 'V12'),
        ('V2pV11', 'V2', 'V11'),
        ('V10pV3', 'V10', 'V3'),
        ('V17pV14', 'V17', 'V14'),
        ('V4pV2', 'V4', 'V2'),
    )
    for new_col, left, right in pair_sums:
        out[new_col] = out[left] + out[right]

    out['sqV3-V2'] = (out['V3'] - out['V2']) ** 2
    for source in ('V8', 'V2', 'V17'):
        out['sq' + source] = out[source] ** 2

    out['V17-V11'] = out['V17'] - out['V11']
    print('generate feature engineered')
    return out

##### deprecated - 3. normalization --> only Amount
def zscore(col):
    """Standardise ``col`` with the mean and std of ``df_train['Amount']``.

    Reads the module-level ``df_train`` at call time, so the result depends
    on whatever ``df_train`` currently holds.
    """
    amount = df_train['Amount']
    centre = amount.mean()
    spread = amount.std()
    return (col - centre) / spread

In [15]:
##### 4.DATA Augmentation --> oversampling SMOTE
def resample_features(sampler, features, message, label='Class'):
    """Rebalance ``features`` with ``sampler`` and return a float32 frame.

    The label column is split off before resampling so that it is used only
    as the target ``y`` and never as a feature in the sampler's neighbour
    computations. It is re-attached afterwards in its original position.

    Parameters
    ----------
    sampler : object
        Any object exposing ``fit_resample(X, y) -> (X_res, y_res)``.
    features : pandas.DataFrame
        Frame that contains the label column ``label``.
    message : str
        Line printed after the class counts.
    label : str, default 'Class'
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Resampled rows with the same columns, in the same order, as
        ``features``, cast to float32.
    """
    column_order = list(features.columns)
    X = features.drop(columns=[label])
    y = features[label]

    X_res, y_res = sampler.fit_resample(X, y)

    # Wrapping in DataFrame keeps this working whether the sampler returns a
    # frame or a bare array; np.asarray avoids index alignment on y.
    resampled = pd.DataFrame(X_res, columns=X.columns)
    resampled[label] = np.asarray(y_res)
    resampled = resampled[column_order]

    print(resampled[label].value_counts())
    resampled = resampled.astype('float32')
    print(message)
    return resampled


def add_smote_features(features):
    """Oversample the minority class of ``features`` with SMOTE."""
    sm = SMOTE(random_state=27, k_neighbors=5)
    return resample_features(sm, features, 'augmentation using SMOTE')


def add_smoteenn_features(features):
    """Resample ``features`` with SMOTEENN."""
    sm = SMOTEENN(random_state=27)
    return resample_features(sm, features, 'augmentation using SMOTEENN')


def add_smotetomek_features(features):
    """Resample ``features`` with SMOTETomek."""
    sm = SMOTETomek(random_state=27)
    return resample_features(sm, features, 'augmentation using SMOTETomek')


def add_adasyn_features(features):
    """Oversample the minority class of ``features`` with ADASYN."""
    ada = ADASYN(random_state=27)
    return resample_features(ada, features, 'augmentation using ADASYN')

In [16]:
# Columns on which the IQR fence is applied to the training rows.
outlier_cols = [
    'V2', 'V6', 'V7', 'V13', 'V16', 'V23',
    'V24', 'V25', 'V26', 'V28', 'Amount',
]

# Training rows: outlier removal -> engineered features -> oversampling.
# Swap the last step for another sampler (adasyn, smoteenn, smotetomek, ...) as desired.
df_train = (
    df_train
    .pipe(cleaning_df, cols=outlier_cols, beta=10)
    .pipe(add_engineered_features)
    .pipe(add_smote_features)
)

# Validation and test rows only receive the engineered features.
df_valid = df_valid.pipe(add_engineered_features)
df_test = df_test.pipe(add_engineered_features)

outlier delete
generate feature engineered
1.0    199840
0.0    199840
Name: Class, dtype: int64
augmentation using SMOTE
generate feature engineered
generate feature engineered


In [18]:
# Write the three prepared splits to CSV without the index column.
df_train.to_csv(path_or_buf='train.csv', index=False)
df_valid.to_csv(path_or_buf='valid.csv', index=False)
df_test.to_csv(path_or_buf='test.csv', index=False)

In [20]:
# Ad-hoc ratio of 82 to 112.
# NOTE(review): the notebook does not record what these two counts are - confirm their source or remove this cell.
82/112

0.7321428571428571