## 온라인 거래 데이터를 활용한 이상거래 탐지

> 지도학습 - 랜덤포레스트와 DNN을 통해 이상거래 탐지 모형 구축

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb
import gc
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ModuleNotFoundError: No module named 'xgboost'

#### 데이터는 identity와 transaction 두 파일로 나뉘어 있음
    모든 거래 데이터가 identity 정보를 가지고 있는 것은 아님 => TransactionID를 기준으로 join
    
> train_transaction, train_identity, test_transaction, test_identity

* Transaction의 변수 
    * ProductCD
    * R_emaildomain
    * card1-card6
    * addr1, addr2
    * P_emaildomain
    * M1 - M9


* Identity의 변수
    * DeviceType
    * DeviceInfo
    * id_12 - id_38

* 각 변수들은 범주형

* 두 파일을 합쳐 train, test 데이터로 사용하고자 함

In [None]:
# Switch the working directory to the folder that holds the raw CSV files.
# NOTE(review): hard-coded, machine-specific path — adjust before running elsewhere.
os.chdir(r"C:\Users\USER\20190722_아시아 경제 update\notebook\CopyProject")
# Load both training tables, using TransactionID as the row index.
train_transaction = pd.read_csv('train_transaction.csv', 
                                  index_col = 'TransactionID' )
train_identity = pd.read_csv('train_identity.csv', index_col = 'TransactionID')

In [None]:
# Load both test tables, using TransactionID as the row index.
test_transaction = pd.read_csv('test_transaction.csv', index_col = "TransactionID")
test_identity = pd.read_csv('test_identity.csv', index_col = "TransactionID")

In [None]:
# Left-join the identity table onto the transaction table via the shared
# TransactionID index, so every transaction row is kept even when it has no
# matching identity row.
train = train_transaction.merge(train_identity, how = 'left', left_index =True,
                                    right_index = True)
test = test_transaction.merge(test_identity, how = 'left', left_index =True,
                                 right_index=True)

In [None]:
# Show (rows, columns) of the merged train and test frames.
print(train.shape)
print(test.shape)

In [None]:
# Separate the label column from the features; the test frame has no label
# to remove here, so it is simply copied.
y_train = train['isFraud']
X_train = train.drop(columns=['isFraud'])
X_test = test.copy()

In [None]:
# Optional clean-up of the intermediate frames (left disabled).
# NOTE(review): later cells still reference `train`, so deleting it here would break them.
# del train_transaction, train_identity, test_transaction, test_identity, train, test
# Run only once.

### 데이터 탐색

In [None]:
# Preview the first three training rows.
X_train.head(3)

* TransactionDT 변수는 주어진 참조 날짜 시간이며, 실제 time-stamp가 아님
    따라서 위의 변수를 제거하고자 함

In [None]:
# Preview the first three training rows.
# NOTE(review): the text above announces dropping TransactionDT, but no cell shown here drops it.
X_train.head(3)

In [None]:
# Preview the first three test rows.
X_test.head(3)

#### ◎ 결측치 및 쓸모 없는 변수 제거 

In [None]:
# X_train.isnull().sum(axis=0) prints a truncated listing, so many columns
# cannot be inspected that way.

In [None]:
# Fraction of missing values per column, computed once per frame.
train_na_ratio = X_train.isnull().sum(axis=0) / X_train.shape[0]
test_na_ratio = X_test.isnull().sum(axis=0) / X_test.shape[0]

# Columns flagged for removal: more than 97% missing in train,
# more than 7% missing in test (arbitrary cut-offs).
# NOTE(review): the two thresholds differ (0.97 vs 0.07). The 0.07 looks like
# a typo, but it also determines how many features survive, and later cells
# hard-code that feature count — confirm before changing it.
na_columns = list(train_na_ratio[train_na_ratio > 0.97].index)
na_columns_test = list(test_na_ratio[test_na_ratio > 0.07].index)

In [None]:
# Union of the columns flagged in either split (duplicates removed), and how many there are.
columns_drop = list(set(na_columns + na_columns_test))
len(columns_drop)

In [None]:
# Remove the flagged columns from both splits so they keep identical columns.
X_train = X_train.drop(columns_drop, axis = 1)
X_test = X_test.drop(columns_drop, axis = 1)

In [None]:
# Drop every training column whose name starts with "V".
drop_cols = [x for x in X_train.columns if x[0] == "V"]
X_train = X_train.drop(drop_cols, axis = 1)

In [None]:
# Hand-picked C* columns to discard from the training features.
c_drop = ['C1', 'C2','C5','C6','C9','C11', 'C14']
X_train = X_train.drop(c_drop, axis =1)

In [None]:
# Summary statistics of the remaining numeric training columns.
X_train.describe()

In [None]:
# Apply the same "V*" column removal to the test split.
X_test = X_test.drop(drop_cols, axis = 1)

In [None]:
# Apply the same C* column removal to the test split.
X_test = X_test.drop(c_drop, axis =1)

In [None]:
# Check that both splits now have the same number of columns.
print(X_train.shape)
print(X_test.shape)

* 변수의 고유한 값 중 하나의 값에만 치우쳐져 있는 경우 분석의 결과를 왜곡시킬 수 있음
* 쓸모없는 변수라 판단하고 제거함

#### 인코딩 : Label encoding

In [None]:
# Label encoding: every column that is text-typed in either split is replaced
# by integer codes. The encoder is fitted on the train and test values
# together so that both splits share one mapping.
for f in X_train.columns:
    is_text_column = X_train[f].dtype == 'object' or X_test[f].dtype == 'object'
    if not is_text_column:
        continue
    train_values = list(X_train[f].values)
    test_values = list(X_test[f].values)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(train_values + test_values)
    X_train[f] = lbl.transform(train_values)
    X_test[f] = lbl.transform(test_values)

# Remaining missing values are replaced by the sentinel -999.
X_train = X_train.fillna(-999)
X_test = X_test.fillna(-999)

In [None]:
X_train.describe()  # candidates for a log transform: TransactionAmt, card1, card2

In [None]:
# Log-transform the transaction amount on BOTH splits. Transforming only the
# training frame would leave train and test on different scales for the same
# feature.
X_train['TransactionAmt'] = np.log(X_train['TransactionAmt'])
X_test['TransactionAmt'] = np.log(X_test['TransactionAmt'])

In [None]:
# Log-transform card2 on the training split.
# NOTE(review): missing values were replaced by -999 before this cell, and the
# log of a negative number is NaN — a later cell fills those NaNs with a
# constant. X_test['card2'] is not transformed here; confirm that is intended.
X_train['card2'] = np.log(X_train['card2'])

In [None]:
# Summary statistics after the log transforms.
X_train.describe()

In [None]:
from sklearn.preprocessing import Imputer, MinMaxScaler, LabelEncoder

In [None]:
# Rescale the listed numeric features to the [0, 1] range.
# The scaler is fitted on the training split only and the same fitted mapping
# is then applied to the test split, so both splits share one scale. Fitting
# and transforming X_test alone would leave X_train unscaled and the two
# splits inconsistent. Test values outside the training range can fall
# slightly outside [0, 1].
# NOTE(review): missing values were filled with -999 earlier, so that sentinel
# takes part in the min/max computation — confirm this is acceptable.
scale_cols = ['card3', 'card5', 'C4', 'C7', 'C8', 'C10', 'C12', 'C13',
              'D1', 'D10', 'D15']
MMscaler = preprocessing.MinMaxScaler()
X_train[scale_cols] = MMscaler.fit_transform(X_train[scale_cols])
X_test[scale_cols] = MMscaler.transform(X_test[scale_cols])
X_test.head(3)

In [None]:
# Summary statistics of the test split after scaling.
X_test.describe()

In [None]:
# (rows, columns) of the training features.
X_train.shape

* 데이터 용량이 커서 돌아가는데 시간이 많이 걸림
* 메모리를 줄일 필요를 느낌 - kaggle의 reduce-mem-usage함수를 찾아봄

In [None]:
%%time
# From kernel https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
# WARNING! THIS CAN DAMAGE THE DATA 
def _smallest_fitting_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in *candidates* whose range holds [c_min, c_max], else None."""
    for candidate in candidates:
        limits = limits_of(candidate)
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of *df* in place to smaller dtypes and return it.

    Text (object/string) columns become ``category``. Signed and unsigned
    integer columns are narrowed to the smallest integer type of the same
    signedness that holds their min and max (bounds inclusive). Float columns
    are narrowed to the smallest allowed float type whose range holds their
    min and max. Boolean, datetime, categorical and pandas extension columns,
    as well as empty or all-NaN columns, are left untouched. A column is
    never widened.

    Args:
        df: DataFrame to shrink; it is modified in place.
        use_float16: When True (default, the historical behaviour) floats may
            be narrowed to float16. The range check does not protect
            precision, so float16/float32 narrowing can round values; pass
            False to stop at float32.

    Returns:
        The same DataFrame object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    floats = (np.float32, np.float64)
    if use_float16:
        floats = (np.float16,) + floats

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast; min()/max()
        # based narrowing is meaningless or unsafe for everything else.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: there is no range to test against.
            continue

        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_ints, np.iinfo)
        elif col_type.kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_ints, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, floats, np.finfo)

        # Convert only when it actually saves space (never upcast).
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame to avoid dividing by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df
# Shrink both feature frames; the function prints memory use before and after.
X_train = reduce_mem_usage(X_train)
X_test = reduce_mem_usage(X_test)

#### 데이터 불러오기 시간을 줄이기 위해 메모리 줄인 데이터 저장
* 다음 부터는 이 데이터를 불러와서 실행

In [None]:
# Persist the preprocessed feature tables (with header and TransactionID index)
# so later sessions can skip the steps above.
# NOTE(review): CSV stores values as text, so the reduced dtypes presumably do
# not survive a reload — confirm.
X_test.to_csv("X_test.csv", header = True, index = True)
X_train.to_csv("X_train.csv", header = True, index = True)
# NOTE(review): a later cell reads 'y_train.csv'; this export is disabled, so
# that file must already exist from an earlier run.
#y_train.to_csv("y_train.csv", header = True, index = True)

### 데이터 탐색 시각화

In [None]:
# Switch to the folder that holds the saved CSV files.
# NOTE(review): hard-coded path, and a different folder from the one the
# earlier cells wrote to — verify the files are present here.
os.chdir(r"C:\Users\Woodayoung\Desktop\Datastudy\asiae\data1")

In [None]:
# Reload the saved features and labels, indexed by TransactionID.
# y_train is read as a DataFrame here; a later cell reduces it to a Series.
X_train = pd.read_csv('X_train.csv', index_col = "TransactionID")
y_train = pd.read_csv('y_train.csv', index_col = "TransactionID")

In [None]:
# Features plus label in one frame (train1), leaving X_train itself unchanged.
train1 = X_train.copy()
train1['isFraud'] = y_train

In [None]:
# Remember the feature names; they are used later to rebuild a DataFrame.
header = X_train.columns

In [None]:
# NOTE(review): `train` is the merged frame built from the raw CSVs, not the
# `train1` frame created just above — confirm which one is intended.
train.head(4)

전체

In [None]:
# Frequency of fraud vs. non-fraud transactions.
# The column is passed as `x=`: recent seaborn releases no longer accept the
# plotted variable as a positional argument.
sns.countplot(x='isFraud', data = train)
plt.show()

In [None]:
# Reduce the label frame to its single 'isFraud' column (a Series).
y_train = y_train['isFraud']

In [None]:
# Absolute number of rows per class.
y_train.value_counts()

In [None]:
y_train.value_counts(normalize=True) # event rate: about 3.5%

* 데이터의 불균형을 줄이면서, 메모리 부족 문제를 해결하기 위해 Random Under-Sampling을 하기로 결정
    * 각자 다른 사양의 노트북을 사용하여 메모리 부족 문제가 심각
    
    
* 랜덤으로 삭제하기 때문에 표본에 따라 부정확한 결과가 나올 수도 있음을 인지

In [None]:
# Fill missing card2 values with a fixed constant.
# The write goes through a single .loc call: the chained form
# `X_train.card2[i] = ...` may assign into a temporary copy and leave X_train
# unchanged.
# NOTE(review): 5.768366 is presumably a typical log(card2) value — confirm its origin.
i = X_train.card2.isnull()
X_train.loc[i, 'card2'] = 5.768366

In [None]:
# Number of missing values per column after the fill above.
X_train.isnull().sum(axis=0)

In [None]:
from imblearn.under_sampling import *

In [None]:
# Randomly under-sample the data (fixed seed for reproducibility).
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias has been removed from recent releases.
X_train_a, y_train_a = RandomUnderSampler(random_state=0).fit_resample(X_train, y_train)

In [None]:
# Class counts after under-sampling (fraud and non-fraud brought to 1:1).
# The labels are passed as `x=`: recent seaborn releases no longer accept the
# plotted variable as a positional argument.
sns.countplot(x=y_train_a)
plt.show()

In [None]:
# Number of labels left after under-sampling.
y_train_a.shape

TransactionAmt 변수

In [None]:
# Wrap the resampler output in pandas objects again, restoring the feature
# names saved in `header`.
X_train_a = pd.DataFrame(X_train_a, columns = header)
y_train_a = pd.Series(y_train_a)

In [None]:
# Resampled features plus label in one frame, used for the plots below.
# train_a is another name for X_train_aa (same object, not a second copy).
X_train_aa = X_train_a.copy()
X_train_aa['isFraud'] = y_train_a
train_a = X_train_aa

In [None]:
# Notched box plot of TransactionAmt per class.
# x/y are passed by keyword: recent seaborn releases no longer accept the
# plotted variables as positional arguments.
sns.boxplot(x='isFraud', y='TransactionAmt', data = train_a , notch = True)
plt.show()

In [None]:
# Cut TransactionAmt into 7 quantile-based bins and plot isFraud per bin.
df_temp = train_a[['isFraud']].copy()
df_temp['TransactionAmtc'] = pd.qcut(train_a.TransactionAmt,7)
sns.barplot(x='TransactionAmtc',y='isFraud', data=df_temp)
plt.show()

ProductCD 변수

In [None]:
# Distinct (label-encoded) ProductCD values in the resampled data.
X_train_a.ProductCD.unique()

In [None]:
# Class counts split by ProductCD.
# The column is passed as `x=`: recent seaborn releases no longer accept the
# plotted variable as a positional argument.
sns.countplot(x='isFraud', data = train_a, hue = 'ProductCD')
plt.legend(loc='upper left')
plt.show()

In [None]:
# isFraud per ProductCD value in the resampled data.
sns.barplot(x='ProductCD', y='isFraud', data= train_a)
plt.show()

In [None]:
# Preview the resampled frame.
train_a.head(3)

In [None]:
# Frequency of each C1 value, ordered by value.
# NOTE(review): 'C1' is in the c_drop list removed from X_train earlier, so
# this column may no longer exist at this point — verify.
a = X_train_a.C1.value_counts().sort_index()

In [None]:
# Line plot of the value counts computed above.
plt.plot(a)
plt.show()

In [None]:
# Split the rows by class, then draw per-value counts of card1 and card2 in a
# 2x2 grid (left: fraud, right: non-fraud).
# NOTE(review): `train` must still be the merged DataFrame here; a later cell
# rebinds the name `train` to the optimizer step.
train_1 = train.loc[train['isFraud'] == 1]
train_0 = train.loc[train['isFraud'] == 0]
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 8))
train_1.groupby('card1')['card1'].count().plot(kind='barh', ax=ax1, title='Count of card1 fraud')
train_0.groupby('card1')['card1'].count().plot(kind='barh', ax=ax2, title='Count of card1 non-fraud')
# The limit is set on the Axes object itself; `plt` has no `ax2` attribute.
ax2.set_ylim([0, 800])
train_1.groupby('card2')['card2'].count().plot(kind='barh', ax=ax3, title='Count of card2 fraud')
train_0.groupby('card2')['card2'].count().plot(kind='barh', ax=ax4, title='Count of card2 non-fraud')
plt.show()

In [None]:
# Per-value counts of card3 and card4 in a 2x2 grid (left: fraud, right: non-fraud).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 8))
panels = [
    (train_1, 'card3', ax1, 'Count of card3 fraud'),
    (train_0, 'card3', ax2, 'Count of card3 non-fraud'),
    (train_1, 'card4', ax3, 'Count of card4 fraud'),
    (train_0, 'card4', ax4, 'Count of card4 non-fraud'),
]
for frame, column, axis, panel_title in panels:
    counts = frame.groupby(column)[column].count()
    counts.plot(kind='barh', ax=axis, title=panel_title)
plt.show()

In [None]:
# Per-value counts of card5 and card6 in a 2x2 grid (left: fraud, right: non-fraud).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 8))
panels = [
    (train_1, 'card5', ax1, 'Count of card5 fraud'),
    (train_0, 'card5', ax2, 'Count of card5 non-fraud'),
    (train_1, 'card6', ax3, 'Count of card6 fraud'),
    (train_0, 'card6', ax4, 'Count of card6 non-fraud'),
]
for frame, column, axis, panel_title in panels:
    counts = frame.groupby(column)[column].count()
    counts.plot(kind='barh', ax=axis, title=panel_title)
plt.show()

### 랜덤 포레스트

In [None]:
from sklearn import metrics, preprocessing
from sklearn.preprocessing import Imputer, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import tensorflow as tf
%matplotlib inline

In [None]:
# Hold out 15% of the rows for validation (fixed seed).
# NOTE(review): this splits X_train / y_train, not the under-sampled
# X_train_a / y_train_a built earlier — confirm which data is intended.
X_train1, X_val, Y_train, Y_val = train_test_split(X_train, y_train, 
                                                    test_size=0.15, 
                                                    random_state=42)

In [None]:
# Shapes of the training part of the split. X_train1 is the feature subset
# that matches Y_train; X_train is the unsplit frame.
print(X_train1.shape)
print(Y_train.shape)

In [None]:
# Train a 100-tree random forest on the training part of the split and report
# accuracy on the held-out rows.
# The model is fitted on X_train1, the feature subset produced together with
# Y_train; the unsplit X_train has more rows than Y_train and would not match.
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train1, Y_train)
y_pred = rf.predict(X_val)
acc = np.mean(Y_val == y_pred)
print("SKLEARN Random Forest Accuracy = {:3.3f}".format(acc))

### 딥러닝

In [None]:
# Build NumPy inputs for the network from train1: X holds every column except
# the label, y is the label expanded into one indicator column per class.
X=np.array(train1.drop(columns=['isFraud']))
y=np.array(pd.get_dummies(train1.isFraud))
headerX = train1.drop(columns=['isFraud']).columns

In [None]:
# Shapes of the feature matrix and the one-hot label matrix.
print(X.shape)
print(y.shape)

In [None]:
# 70/30 train/validation split (fixed seed). This rebinds X_train1 and X_val
# from the random-forest section.
X_train1, X_val, y_train1, y_val = train_test_split(X, y, test_size=0.3, random_state=123)
# Number of training rows, used to draw random mini-batch indices.
n_train_size = y_train1.shape[0]

In [None]:
# Shape of the validation features.
X_val.shape

In [None]:
# Training hyper-parameters.
batch_size = 40      # rows per mini-batch
n_batches  = 10000   # number of training steps
learn_rate = 0.01    # optimizer learning rate

In [None]:
# Multi-layer network: one hidden layer followed by a 2-class output layer.
# The input width is taken from the feature matrix instead of a hard-coded
# literal, so the layer stays valid if the feature set changes.
n_features = X.shape[1]
n_hidden = 9
W1 = tf.Variable(tf.random_normal([n_features, n_hidden], 0, 1))   # inputs = n_features, outputs = 9.
b1 = tf.Variable(tf.random_normal([n_hidden], 0, 1))
W2 = tf.Variable(tf.random_normal([n_hidden, 2], 0, 1))   # inputs = 9, outputs = 2.
b2 = tf.Variable(tf.random_normal([2], 0, 1))

In [None]:
# Input placeholders; the number of rows is left open (None).
# The feature width follows the feature matrix instead of a hard-coded literal.
X_ph = tf.placeholder(tf.float32, [None, X.shape[1]])
y_ph = tf.placeholder(tf.float32, [None, 2])

#### 학습 모형 정의

In [None]:
# Multi-layer model: sigmoid hidden layer, then a linear output layer that
# yields the logits.
hidden = tf.nn.sigmoid(tf.matmul(X_ph,W1) + b1)
y_model = tf.matmul(hidden, W2) + b2

#### 손실함수 정의, 최적화 방법 정의 

In [None]:
# Mean softmax cross-entropy between the one-hot labels and the logits.
loss  = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_ph, logits=y_model))   # loss = Cross Entropy. 

In [None]:
# Optimizer choice: Adam is active; the two alternatives are kept disabled.
#optimizer = tf.train.GradientDescentOptimizer(learning_rate = learn_rate) 
#optimizer = tf.train.MomentumOptimizer(learning_rate = learn_rate, momentum=0.8) 
optimizer = tf.train.AdamOptimizer(learning_rate = learn_rate) 

In [None]:
# Training step that minimizes the loss.
# NOTE(review): this rebinds the name `train`, which earlier cells use for the
# merged DataFrame — re-running those cells afterwards will fail.
train = optimizer.minimize(loss)

#### Variable 전역 초기화

In [None]:
# Op that initializes every tf.Variable defined above.
init = tf.global_variables_initializer()

#### Session을 시작해서 학습

In [None]:
with tf.Session() as sess:
    sess.run(init)

    # Mini-batch training: every step draws `batch_size` random row indices
    # (with replacement) from the training split.
    for step in range(1, n_batches + 1):
        rows = np.random.randint(0, n_train_size, batch_size)
        feed = {X_ph: X_train1[rows, :], y_ph: y_train1[rows, :]}
        sess.run(train, feed_dict=feed)
        if step % 2000 == 0:
            print(step)   # progress marker

    # Model evaluation: a prediction is correct when the arg-max of the logits
    # matches the arg-max of the one-hot label (axis 1 = across classes).
    correct_predictions = tf.equal(tf.argmax(y_ph, 1), tf.argmax(y_model, 1))
    # Cast the booleans to float32 first so their mean is the accuracy.
    acc = tf.reduce_mean(tf.cast(correct_predictions, tf.float32))
    # Evaluate on the whole validation split.
    acc_value = sess.run(acc, feed_dict={X_ph: X_val, y_ph: y_val})

#### 학습결과 출력

In [None]:
# Print the validation accuracy computed in the session above
# (the label in the message means "accuracy").
print("정확도 = {:5.3f}".format(acc_value))