In [203]:
import tensorflow as tf
import shutil
import pandas as pd
import numpy as np

from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.combine import SMOTEENN,SMOTETomek

In [4]:
data = pd.read_csv('creditcard.csv',dtype='float32',encoding='utf-8')

In [5]:
## data setting
# setting up testing and training sets
df_train, df_test = train_test_split(data, test_size=0.2, random_state=27)
print('df_train :', df_train.shape)
print('df_test :', df_test.shape)

df_train : (227845, 31)
df_test : (56962, 31)


In [6]:
df_train, df_valid = train_test_split(df_train, test_size=0.1, random_state=27)
print('df_train :', df_train.shape)
print('df_valid :', df_valid.shape)
print('df_test :', df_test.shape)
print('df_train Class: \n', df_train.Class.value_counts())

df_train : (205060, 31)
df_valid : (22785, 31)
df_test : (56962, 31)
df_train Class: 
 0.0    204712
1.0       348
Name: Class, dtype: int64


In [None]:
### Preprocess

In [None]:
##### 1. OUTLIER Delete in train data => 시각화 분석 결과 정상유저의 threshold 범위 안에 사기유저 있음
###### 즉, threshold 를 얼마정도 잡고 데이터를 없애도 크게 상관은 없어보임

In [77]:
def outlier_treatment(df,col,beta):
    q1,q3 = df.describe().loc['25%',col], df.describe().loc['75%',col]
    IQR = q3 - q1
    lower_range = q1 - (beta * IQR)
    upper_range = q3 + (beta * IQR)
#     print(col,'lower_range :',lower_range)
#     print(col,'upper_range :',upper_range)
    df = df[(df[col] > lower_range) & (df[col] < upper_range)]
    return df

def cleaning_df(df,cols,beta=10):
    for i in cols:
        df = outlier_treatment(df,i,beta=beta)
    print('outlier delete')
    return df

In [None]:
##### 2. normalization

In [243]:
def zscore(col):
    mean = df_train['Amount'].mean()
    std = df_train['Amount'].std()
    return (col - mean) / std

In [244]:
##### 3. feature engineering

In [245]:
df_tr = df_train.copy()
df_tr['V14pV12'] = df_tr['V14'] + df_tr['V12']
df_tr['V2pV11'] = df_tr['V2'] + df_tr['V11']
df_tr['V10pV3'] = df_tr['V10'] + df_tr['V3']
df_tr['V17pV14'] = df_tr['V17'] + df_tr['V14']
df_tr['V4pV2'] = df_tr['V4'] + df_tr['V2']

df_tr['sqV3-V2'] = (df_tr['V3'] - df_tr['V2']) **2
df_tr['sqV8'] = (df_tr['V8']) **2
df_tr['sqV2'] = (df_tr['V2']) **2
df_tr['sqV17'] = (df_tr['V17']) **2


df_tr['V17-V11'] = df_tr['V17'] - df_tr['V11'] 
df_tr = df_tr[AA]

In [246]:
def add_engineered_features(features):
    features = features.astype('float32')
    features['V14pV12'] = features['V14'] + features['V12']
    features['V2pV11'] = features['V2'] + features['V11']
    features['V10pV3'] = features['V10'] + features['V3']
    features['V17pV14'] = features['V17'] + features['V14']
    features['V4pV2'] = features['V4'] + features['V2']

    features['sqV3-V2'] = (features['V3'] - features['V2']) **2
    features['sqV8'] = (features['V8']) **2
    features['sqV2'] = (features['V2']) **2
    features['sqV17'] = (features['V17']) **2

    features['V17-V11'] = features['V17'] - features['V11'] 
    print('generate feature engineered')
    return features

def add_engineered_features(features):
    features = features.astype('float32')
    features["V17-V11"] = features["V17"] - features["V11"]
    features["V17-V2"] = features["V17"] - features["V2"]
    features["V17-V15"] = features["V17"] - features["V15"]
    features["V14-V11"] = features["V14"] - features["V11"]
    features["V14-V12"] = features["V14"] - features["V12"]
    features["V14-V4"] = features["V14"] - features["V4"]
    features["V12-V11"] = features["V12"] - features["V11"]
    features["V12-V4"] = features["V12"] - features["V4"]
    features["V10-V8"] = features["V10"] - features["V8"]
    features["V5-V4"] = features["V5"] - features["V4"]
    features["V3-V2"] = features["V3"] - features["V2"]
    features["V3-V4"] = features["V3"] - features["V4"]
    features["V7-V6"] = features["V7"] - features["V6"]
    
    features["V11V4"] = features["V11"] * features["V4"]
    features["V19V8"] = features["V19"] * features["V8"]
    features["V16V10"] = features["V16"] * features["V10"]
    features["V3V8"] = features["V3"] * features["V8"]
    features["V7V8"] = features["V7"] * features["V8"]
    
    features["V17V17"] = features["V17"] ** 2
    features["V16V16"] = features["V16"] ** 2
    features["V12V12"] = features["V12"] ** 2
    features["V4V4"] = features["V4"] ** 2
    features["sqV17-V11"] = features["V17-V11"] **2
    features["sqV17-V15"] = features["V17-V15"] **2
    features["sqV17-V2"] = features["V17-V2"] **2
    features["sqV14-V11"] = features["V14-V11"] **2
    features["sqV14-V4"] = features["V14-V4"] **2
    features["sqV12-V11"] = features["V12-V11"] **2
    features["sqV10-V8"] = features["V10-V8"] **2
    features["sqV3-V4"] =features["V3-V4"] **2
    
    features["V12pV14"] = ((features["V12"]+features["V14"]) ** 2)**(1/2)
#     features = features.astype('float32')
    print('generate feature engineered')
    return features

In [247]:
##### 4.DATA Augmentation --> oversampling SMOTE

In [248]:
def add_smote_features(features):
    sm = SMOTE(random_state=27, k_neighbors=5)
    features, _ = sm.fit_sample(features, features.Class)
    features = pd.DataFrame(features, columns=list(features))
    print(features.Class.value_counts())
    features = features.astype('float32')
    print('augmentation using SMOTE')
    return features

def add_smoteenn_features(features):
    sm = SMOTEENN(random_state=27)
    features, _ = sm.fit_resample(features, features.Class)
    features = pd.DataFrame(features, columns=list(features))
    print(features.Class.value_counts())
    features = features.astype('float32')
    print('augmentation using SMOTEENN')
    return features

def add_smotetomek_features(features):
    sm = SMOTETomek(random_state=27)
    features, _ = sm.fit_resample(features, features.Class)
    features = pd.DataFrame(features, columns=list(features))
    print(features.Class.value_counts())
    features = features.astype('float32')
    print('augmentation using SMOTETomek')
    return features

def add_adasyn_features(features):
    ada = ADASYN(random_state=27)
    features, _ = ada.fit_resample(features, features.Class)
    features = pd.DataFrame(features, columns=list(features))
    print(features.Class.value_counts())
    features = features.astype('float32')
    print('augmentation using ADASYN')
    return features

In [249]:
# feature_cols
outlier_cols = ['V2','V6','V7','V13','V16','V23','V24','V25','V26','V28','Amount']
# FEATURE_NAMES = ['V'+str(i) for i in range(1,29)] + \
#                 ["V14-V4","V17-V15","V12-V11","V10-V8","V5-V4","V3-V2","V3-V4","V17-V11","V12-V4","V14-V11","V17-V2","V7-V6","V14-V12"] + \
#                 ["V11V4","V16V10","V19V8","V17V17","V16V16","V12V12","V4V4","sqV17-V11","sqV17-V15","sqV17-V2","V3V8","V7V8","sqV14-V11","sqV14-V4","sqV12-V11","sqV10-V8","sqV3-V4"] +\
#                 ["V12pV14"]

FEATURE_NAMES = ['V'+str(i) for i in range(1,29)]
# FEATURE_NAMES.append('Amount')
# FEATURE_NAMES.append('Time')

#### 직접 눈으로보고 만든 피처  +  앙상블모델에 의한 피처 임포턴스로 추가한 피처(V26,V17,V14,V10,V7)
# FEATURE_NAMES = ['V14pV12','V2pV11','V17-V11','sqV3-V2','V10pV3','V17pV14','V4pV2','sqV8','sqV2','sqV17',
#                 'V17','V14','V10','V26','V7']
feature_cols = [tf.feature_column.numeric_column(k) for k in FEATURE_NAMES]
feature_cols.append(tf.feature_column.numeric_column('Amount',normalizer_fn=zscore))
FEATURE_NAMES.append('Amount')
LABEL_NAME = 'Class'
# feature_cols = [tf.feature_column.numeric_column(k) for k in ['V'+str(i) for i in range(1,29)]]
# feature_cols.append(tf.feature_column.numeric_column("V17-V15"))
# feature_cols.append(tf.feature_column.numeric_column("V12-V11"))
# feature_cols.append(tf.feature_column.numeric_column("V10-V8"))
# feature_cols.append(tf.feature_column.numeric_column("V5-V4"))
# feature_cols.append(tf.feature_column.numeric_column("V3-V2"))


In [260]:
# pandas_input_fn
def train_input_fn(df,batch_size=128,num_epochs=5,shuffle=True):
    # 1.delete outlier
    df = cleaning_df(df,outlier_cols,beta=10)
    # 2.feature engineering
    df = add_engineered_features(df)
    # 3.add_smote
    df = add_smotetomek_features(df)
    return tf.estimator.inputs.pandas_input_fn(
      x=df[FEATURE_NAMES],
      y=df[LABEL_NAME],
      batch_size = batch_size,
      num_epochs = num_epochs,
      shuffle = shuffle,
      queue_capacity = 1000,
      num_threads = 1
  )

def eval_input_fn(df,batch_size=128):
    # 2.feature engineering
    df = add_engineered_features(df)
    return tf.estimator.inputs.pandas_input_fn(
      x=df[FEATURE_NAMES],
      y=df[LABEL_NAME],
      batch_size = batch_size,
      num_epochs = 1,
      shuffle = False,
      queue_capacity = 1000,
      num_threads = 1
  )

def predict_input_fn(df,batch_size=128):
    # 2.feature engineering
    df = add_engineered_features(df)
    return tf.estimator.inputs.pandas_input_fn(
      x=df[FEATURE_NAMES],
      y=df[LABEL_NAME],
      batch_size = batch_size,
      num_epochs = 1,
      shuffle = False,
      queue_capacity = 1000,
      num_threads = 1
  )

------

In [261]:
OUTDIR = "credit-prac"

config = tf.estimator.RunConfig(
    model_dir = OUTDIR,
    tf_random_seed = 1,
    save_checkpoints_steps = 1000
)

model = tf.estimator.DNNClassifier(
        hidden_units = [30,30,30], 
        feature_columns = feature_cols,
        config = config,
        optimizer='Adam',
        batch_norm=True
)

INFO:tensorflow:Using config: {'_tf_random_seed': 1, '_is_chief': True, '_eval_distribute': None, '_num_ps_replicas': 0, '_experimental_distribute': None, '_model_dir': 'credit-prac', '_task_id': 0, '_global_id_in_cluster': 0, '_keep_checkpoint_every_n_hours': 10000, '_experimental_max_worker_delay_secs': None, '_num_worker_replicas': 1, '_service': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_protocol': None, '_save_checkpoints_steps': 1000, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f65177a5d30>, '_session_creation_timeout_secs': 7200, '_evaluation_master': '', '_keep_checkpoint_max': 5, '_device_fn': None, '_save_checkpoints_secs': None, '_save_summary_steps': 100, '_task_type': 'worker', '_master': '', '_log_step_count_steps': 100, '_train_distribute': None}


In [257]:
%%time
tf.logging.set_verbosity(tf.logging.INFO) 
shutil.rmtree(path = OUTDIR, ignore_errors = True)

model.train(input_fn = train_input_fn(df=df_train, batch_size=128, num_epochs=None), steps = 10000)

outlier delete
generate feature engineered
1.0    197792
0.0    193212
Name: Class, dtype: int64
augmentation using SMOTEENN
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into credit-prac/model.ckpt.
INFO:tensorflow:loss = 114.62146, step = 1
INFO:tensorflow:global_step/sec: 16.9371
INFO:tensorflow:loss = 0.00054929225, step = 101 (5.906 sec)
INFO:tensorflow:global_step/sec: 33.1682
INFO:tensorflow:loss = 0.0005036519, step = 201 (3.014 sec)
INFO:tensorflow:global_step/sec: 34.6029
INFO:tensorflow:loss = 0.00043755161, step = 301 (2.894 sec)
INFO:tensorflow:global_step/sec: 35.6635
INFO:tensorflow:loss = 0.0004216545, step = 401 (2.800 sec)
INFO:tensorflow:global_step/sec: 36.9986
INFO:tensorflow:loss = 0.00036741694, step = 501 (2.703 sec)
INFO:tensorflow:g

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier at 0x7f6518246908>

In [258]:
metrics = model.evaluate(input_fn=eval_input_fn(df=df_valid, batch_size=128))

generate feature engineered
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2019-12-12T04:43:22Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from credit-prac/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2019-12-12-04:43:28
INFO:tensorflow:Saving dict for global step 10000: accuracy = 0.3735791, accuracy_baseline = 0.99824446, auc = 0.6294845, auc_precision_recall = 0.43885592, average_loss = 99.48548, global_step = 10000, label/mean = 0.001755541, loss = 12663.558, precision = 0.0024470391, prediction/mean = 0.6274654, recall = 0.875
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10000: credit-prac/model.ckpt-10000


In [259]:
def get_predictions(model, input_fn):
    return [x["class_ids"][0] for x in model.predict(input_fn=input_fn)]

# Create a confusion matrix
with tf.Graph().as_default():
    y_pred = get_predictions(model, predict_input_fn(df=df_test, batch_size=128))
    cm = tf.confusion_matrix(df_test["Class"],y_pred)
    with tf.Session() as session:
        cm_out = session.run(cm)
        print(cm_out)
print()
print('Class1')
print('precision :',cm_out[1][1]/(cm_out[0][1]+cm_out[1][1]))
print('recall :',cm_out[1][1]/(cm_out[1][0]+cm_out[1][1]))
print('fbeta_score :', fbeta_score(df_test["Class"], y_pred, beta=5))

generate feature engineered
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from credit-prac/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
[[21415 35443]
 [   22    82]]

Class1
precision : 0.0023082336382828994
recall : 0.7884615384615384
fbeta_score : 0.055921311475409845


---

In [228]:
pd.DataFrame(cm_out)

Unnamed: 0,0,1
0,56826,32
1,21,83


In [None]:
oversamling 안한것 결과
[[56796    62]
 [   20    84]]

Class1
precision : 0.5753424657534246
recall : 0.8076923076923077
fbeta_score : 0.7953386744355425

#### LOGIT..

In [None]:
df_logit_train = df_train.copy()
# 0.Amount normalization
df_logit_train['Amount'] = zscore(df_logit_train['Amount'])
# 1.delete outlier
df_logit_train = cleaning_df(df_logit_train,outlier_cols,beta=10)
# 2.feature engineering
df_logit_train = add_engineered_features(df_logit_train)
# 3.add_smote
df_logit_train = add_smoteenn_features(df_logit_train)

df_logit_train = df_logit_train.drop('Time',axis=1)

mid_df_logit_train = pd.concat([df_logit_train[df_logit_train.Class == 0].sample(frac=1).iloc[:len(df_logit_train)//2, :],df_logit_train[df_logit_train.Class ==1]], axis=0) 

In [None]:
df_logit_test = df_test.copy()
# 0.Amount normalization
df_logit_test['Amount'] = zscore(df_logit_test['Amount'])
# 2.feature engineering
df_logit_test = add_engineered_features(df_logit_test)
df_logit_test = df_logit_test.drop('Time',axis=1)

In [None]:
# df_logit_train_y = mid_df_logit_train.pop('Class')
df_logit_train_y = df_logit_train.pop('Class')
df_logit_test_y = df_logit_test.pop('Class')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# clf = LogisticRegression(solver='saga',class_weight='balanced').fit(mid_df_logit_train, df_logit_train_y)
clf = LogisticRegression(solver='saga',class_weight='balanced').fit(df_logit_train, df_logit_train_y)
pred_y = clf.predict(df_logit_test)
print(confusion_matrix(df_logit_test_y,pred_y))
print(classification_report(df_logit_test_y,pred_y))

-- scale

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()
scaler.fit(df_logit_train)
df_logit_train_scaler = scaler.transform(df_logit_train)
df_logit_test_scaler = scaler.transform(df_logit_test)
clf = LogisticRegression(solver='saga').fit(df_logit_train_scaler, df_logit_train_y)
pred_y = clf.predict(df_logit_test_scaler)
print(confusion_matrix(df_logit_test_y,pred_y))
print(classification_report(df_logit_test_y,pred_y))

### 직접 피처만들어서 샘플링 안돌리고 그냥 한 결과

In [79]:
AA = ['V14pV12', 'V2pV11', 'V17-V11','sqV3-V2','V10pV3','V17pV14','V4pV2','sqV8','sqV2','sqV17','Class']

In [80]:
df_tr = df_train.copy()
df_tr['V14pV12'] = df_tr['V14'] + df_tr['V12']
df_tr['V2pV11'] = df_tr['V2'] + df_tr['V11']
df_tr['V10pV3'] = df_tr['V10'] + df_tr['V3']
df_tr['V17pV14'] = df_tr['V17'] + df_tr['V14']
df_tr['V4pV2'] = df_tr['V4'] + df_tr['V2']

df_tr['sqV3-V2'] = (df_tr['V3'] - df_tr['V2']) **2
df_tr['sqV8'] = (df_tr['V8']) **2
df_tr['sqV2'] = (df_tr['V2']) **2
df_tr['sqV17'] = (df_tr['V17']) **2


df_tr['V17-V11'] = df_tr['V17'] - df_tr['V11'] 
df_tr = df_tr[AA]

#-----------------------
df_te = df_test.copy()
df_te['V14pV12'] = df_te['V14'] + df_te['V12']
df_te['V2pV11'] = df_te['V2'] + df_te['V11']
df_te['V10pV3'] = df_te['V10'] + df_te['V3']
df_te['V17pV14'] = df_te['V17'] + df_te['V14']
df_te['V4pV2'] = df_te['V4'] + df_te['V2']

df_te['sqV3-V2'] = (df_te['V3'] - df_te['V2']) **2
df_te['sqV8'] = (df_te['V8']) **2
df_te['sqV2'] = (df_te['V2']) **2
df_te['sqV17'] = (df_te['V17']) **2

df_te['V17-V11'] = df_te['V17'] - df_te['V11']
df_te = df_te[AA]

In [81]:
# df_logit_train_y = mid_df_logit_train.pop('Class')
df_tr_y = df_tr.pop('Class')
df_te_y = df_te.pop('Class')

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [123]:
# clf = LogisticRegression(solver='saga',class_weight='balanced').fit(mid_df_logit_train, df_logit_train_y)
clf = LogisticRegression(solver='saga',class_weight='balanced').fit(df_tr, df_tr_y)
pred_y = clf.predict(df_te)
print(confusion_matrix(df_te_y,pred_y))
print(classification_report(df_te_y,pred_y))

print('fbeta_score :', fbeta_score(df_te_y, pred_y, beta=5))



[[48348  8510]
 [    9    95]]
              precision    recall  f1-score   support

         0.0       1.00      0.85      0.92     56858
         1.0       0.01      0.91      0.02       104

    accuracy                           0.85     56962
   macro avg       0.51      0.88      0.47     56962
weighted avg       1.00      0.85      0.92     56962

fbeta_score : 0.22043730477465417


#### 4. Gradient Boosting Tree

In [174]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [167]:
gb_model = GradientBoostingClassifier(loss='exponential',
                                      learning_rate=0.1,
                                      n_estimators=100,
                                      min_samples_split=3,
                                      min_samples_leaf=2,
                                      max_depth=5,).fit(df_tr, df_tr_y)

In [168]:
gb_pred = gb_model.predict(df_te)

In [170]:
# 결과
print(pd.DataFrame(confusion_matrix(df_te_y, gb_pred)))

       0   1
0  56851   7
1     22  82


In [171]:
from sklearn.metrics import fbeta_score

In [172]:
print(classification_report(df_te_y, gb_pred))
print('fbeta_score :',fbeta_score(df_te_y, gb_pred,beta=5))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56858
         1.0       0.92      0.79      0.85       104

    accuracy                           1.00     56962
   macro avg       0.96      0.89      0.92     56962
weighted avg       1.00      1.00      1.00     56962

fbeta_score : 0.792859799181852


#### 5. Ramdom Forest

In [176]:
rf_model = RandomForestClassifier(n_estimators=100,
    criterion='gini',
    max_depth=15).fit(df_tr, df_tr_y)
rf_pred = rf_model.predict(df_te)
# 결과
print(pd.DataFrame(confusion_matrix(df_te_y, rf_pred)))
print(classification_report(df_te_y, rf_pred))
print('fbeta_score :',fbeta_score(df_te_y, rf_pred,beta=5))

       0   1
0  56853   5
1     21  83
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     56858
         1.0       0.94      0.80      0.86       104

    accuracy                           1.00     56962
   macro avg       0.97      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962

fbeta_score : 0.802827380952381
