## Google Drive Connect

In [None]:
# Connect Google Drive to the Colab runtime and declare where the data files live.
from google.colab import drive

drive.mount('/content/gdrive/')

# Base folder on the mounted drive; file names below are appended to it.
path = '/content/gdrive/MyDrive/Dacon/'

# CSV file names for the three data splits.
file_name_train = 'train.csv'
file_name_val = 'val.csv'
file_name_test = 'test.csv'

Mounted at /content/gdrive/


In [None]:
# Ignore warnings: install a global 'ignore' filter so no warning is shown from here on.
import warnings
warnings.filterwarnings('ignore')

## Package Load

In [None]:
import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances, distance
from sklearn.metrics import f1_score, accuracy_score, classification_report

## Data Load

In [None]:
# Read the train / validation / test CSVs from the Drive folder into DataFrames.
train_df, val_df, test_df = (
    pd.read_csv(path + csv_name)
    for csv_name in (file_name_train, file_name_val, file_name_test)
)

## Select feature : from EDA

In [None]:
# Feature columns retained after EDA (V8, V13, V15 and V19-V29 are left out).
select_feature = [
    f'V{n}' for n in (1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 14, 16, 17, 18, 30)
]
# The same columns followed by the 'Class' label column.
select_feature_val = select_feature + ['Class']

In [None]:
# Keep only the selected feature columns. This rebinds train_df, so the dropped
# columns are unavailable until the data-load cell is run again.
train_df = train_df[select_feature]

In [None]:
# NumPy view of the training features; fit() receives it as both input and target.
numpy_train = train_df.to_numpy()
# Disabled experiment: hold-out split of the training array.
# t_x, tt_x = train_test_split(numpy_train, test_size=0.2, random_state=42)

## Create AutoEncoder

In [None]:
# Width of the autoencoder's input and output layers: one unit per selected feature.
input_dim = len(select_feature)

In [None]:
class Autoencoder(Model):
    """Dense bottleneck autoencoder that reconstructs its own input.

    Layer widths run input_dim -> 8 -> 4 -> 2 -> 4 -> 8 -> input_dim. Every
    hidden layer uses ReLU and the first four are each followed by batch
    normalisation; the output layer is linear.

    Args:
        input_dim: Number of features per input row (also the output width).
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim  # number of input dimensions

        stack = [
            Dense(8, input_shape=(self.input_dim, ), activation='relu'),
            BatchNormalization(),
            Dense(4, activation='relu'),
            BatchNormalization(),
            Dense(2, activation='relu'),  # narrowest layer
            BatchNormalization(),
            Dense(4, activation='relu'),
            BatchNormalization(),
            Dense(8, activation='relu'),
            Dense(self.input_dim, activation='linear'),
        ]
        self.auto_encoder = Sequential(stack)

    def call(self, x):
        """Pass `x` through the layer stack and return the reconstruction."""
        return self.auto_encoder(x)

In [None]:
# Save weights to Drive whenever the validation cosine similarity reaches a new maximum.
checkpointer = ModelCheckpoint(
    filepath=path + 'keras_best.h5',
    monitor='val_cosine_similarity',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=2,
)
# Stop training once that metric has gone 100 epochs without improving.
earlystopping = EarlyStopping(
    monitor='val_cosine_similarity',
    mode='max',
    patience=100,
    verbose=2,
)

In [None]:
# Build the model and configure training: Adam optimiser, MSE reconstruction loss,
# and cosine similarity tracked as a metric (the callbacks monitor its validation value).
autoEncoder = Autoencoder(input_dim)
autoEncoder.compile(
    optimizer='adam',
    loss=MeanSquaredError(),
    metrics=["cosine_similarity"],
)

## Train AutoEncoder

In [None]:
# Hold out 20% of the training rows for validation. The checkpoint and early-stopping
# callbacks monitor val_cosine_similarity; measuring it on the very rows the model is
# fitted on (as before) cannot reveal overfitting. The split is seeded for reproducibility.
fit_x, holdout_x = train_test_split(numpy_train, test_size=0.2, random_state=42)

# NOTE(review): nothing in the visible notebook reloads keras_best.h5, so later
# predict() calls use the last-epoch weights rather than the checkpointed best —
# confirm whether that is intended.

# Autoencoder training: each array is passed as both the input and the target.
autoEncoder.fit(
    fit_x, fit_x,
    epochs=1000,
    shuffle=True,
    validation_data=(holdout_x, holdout_x),
    batch_size=16384,
    callbacks=[checkpointer, earlystopping],
)

Epoch 1/1000
Epoch 1: val_cosine_similarity improved from -inf to -0.06310, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 2/1000
Epoch 2: val_cosine_similarity improved from -0.06310 to -0.04496, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 3/1000
Epoch 3: val_cosine_similarity improved from -0.04496 to -0.02772, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 4/1000
Epoch 4: val_cosine_similarity improved from -0.02772 to -0.01242, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 5/1000
Epoch 5: val_cosine_similarity improved from -0.01242 to 0.00484, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 6/1000
Epoch 6: val_cosine_similarity improved from 0.00484 to 0.02084, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 7/1000
Epoch 7: val_cosine_similarity improved from 0.02084 to 0.03594, saving model to /content/gdrive/MyDrive/Dacon/keras_best.h5
Epoch 8/1000
Epoch 8: val_cos

<keras.callbacks.History at 0x7fc011782ad0>

## Validation AutoEncoder by Euclidean

### 1) val 데이터 정의

In [None]:
# Validation features, restricted to the same columns the autoencoder was trained on.
val_X = val_df[select_feature]
# Labels kept as a one-column DataFrame; later cells treat Class == 0 as normal
# and Class == 1 as anomalous.
val_Y = val_df[['Class']]

In [None]:
# Validation features as a NumPy array (no scaling is applied).
input_val_X = val_X.to_numpy()

# Reconstruct the validation rows. The prediction is round-tripped through a Python
# list before being rebuilt as an array — presumably to obtain a plain float64
# NumPy array; confirm before simplifying.
pred_val_X = np.array(autoEncoder.predict(input_val_X).tolist())

### 2) 임계값 정하기

- 유클리디안 거리 사용

In [None]:
# Inspect normal rows: Euclidean reconstruction error of every validation row with Class == 0.
# The DataFrame index labels are used as positions into the NumPy arrays, which
# assumes val_df still has its default 0..n-1 index.
normal_rows = val_Y[val_Y.Class == 0].index
u_normal = np.array([
    abs(np.linalg.norm(input_val_X[row] - pred_val_X[row]))
    for row in normal_rows
])

In [None]:
# Smallest, largest and median Euclidean reconstruction error among the normal rows.
np.min(u_normal), np.max(u_normal), np.median(u_normal)

(1.1244463159584879, 33.68507661224932, 3.0096114875449933)

In [None]:
# Inspect anomalies: Euclidean reconstruction error of every validation row with Class == 1.
# The DataFrame index labels are used as positions into the NumPy arrays, which
# assumes val_df still has its default 0..n-1 index.
anomaly_rows = val_Y[val_Y.Class == 1].index
u_anormaly = np.array([
    abs(np.linalg.norm(input_val_X[row] - pred_val_X[row]))
    for row in anomaly_rows
])

In [None]:
# Smallest, largest and median Euclidean reconstruction error among the anomalous rows.
np.min(u_anormaly), np.max(u_anormaly), np.median(u_anormaly)

(2.9383701994256413, 44.23111308958229, 13.899576917941662)

In [None]:
# Choose the Euclidean-distance threshold that maximises macro F1 on the validation set.
max_score = 0
best_thr = 0

# The per-row reconstruction error does not depend on the candidate threshold, so it
# is computed once here instead of being rebuilt on every iteration of the sweep.
val_dist = np.linalg.norm(input_val_X - pred_val_X, axis=1)
val_true = val_Y['Class'].to_numpy()

# Candidates run in steps of 0.01 from the smallest error seen on a normal row up to
# (but excluding) the largest error seen on an anomalous row.
for thr in np.arange(np.min(u_normal), np.max(u_anormaly), 0.01):
    # A row is predicted anomalous (1) when its error exceeds the candidate.
    thr_pred = (val_dist > thr).astype(int)
    score = f1_score(val_true, thr_pred, average='macro')

    # Strict comparison: on ties the earliest (smallest) threshold is kept.
    if max_score < score:
        max_score = score
        best_thr = thr
        print(f'Max score : {max_score}')
        print(f'Best thr : {best_thr}')
        print('-' * 30)

# Largest error observed on a normal row (not referenced again in the visible notebook).
abs_u_threshold = np.max(u_normal)
u_threshold = best_thr
print(f"임계값 : {u_threshold}")

Max score : 0.0010881344945152598
Best thr : 1.1244463159584879
------------------------------
Max score : 0.0011233393803556284
Best thr : 1.264446315958488
------------------------------
Max score : 0.001158541795224466
Best thr : 1.274446315958488
------------------------------
Max score : 0.0011937417393830074
Best thr : 1.304446315958488
------------------------------
Max score : 0.0012289392130924502
Best thr : 1.324446315958488
------------------------------
Max score : 0.001264134216613956
Best thr : 1.354446315958488
------------------------------
Max score : 0.0012993267502086495
Best thr : 1.364446315958488
------------------------------
Max score : 0.001440072190540509
Best thr : 1.374446315958488
------------------------------
Max score : 0.0015456053493475305
Best thr : 1.384446315958488
------------------------------
Max score : 0.0016159484489217607
Best thr : 1.3944463159584881
------------------------------
Max score : 0.0018269185403642267
Best thr : 1.40444631595848

---
- 코사인 유사도

In [None]:
# Inspect normal rows: |cosine similarity| between each Class == 0 validation row and
# its reconstruction. Each entry is the (1, 1) array returned by cosine_similarity.
# Index labels are used as array positions, which assumes the default 0..n-1 index.
normal_rows = val_Y[val_Y.Class == 0].index
c_normal = np.array([
    abs(cosine_similarity(input_val_X[row].reshape(1, -1), pred_val_X[row].reshape(1, -1)))
    for row in normal_rows
])

In [None]:
# Smallest, largest and median absolute cosine similarity among the normal rows.
np.min(c_normal), np.max(c_normal), np.median(c_normal)

In [None]:
# Inspect anomalies: |cosine similarity| between each Class == 1 validation row and
# its reconstruction. Each entry is the (1, 1) array returned by cosine_similarity.
# Index labels are used as array positions, which assumes the default 0..n-1 index.
anomaly_rows = val_Y[val_Y.Class == 1].index
c_anormaly = np.array([
    abs(cosine_similarity(input_val_X[row].reshape(1, -1), pred_val_X[row].reshape(1, -1)))
    for row in anomaly_rows
])

In [None]:
# Smallest, largest and median absolute cosine similarity among the anomalous rows.
np.min(c_anormaly), np.max(c_anormaly), np.median(c_anormaly)

In [None]:
# Choose the cosine-similarity threshold that maximises macro F1 on the validation set.
max_score = 0
best_thr = 0

# The per-row similarity does not depend on the candidate threshold, so it is computed
# once here instead of being rebuilt on every iteration of the sweep. The same
# cosine_similarity call as the inspection cells is kept; [0, 0] unwraps its (1, 1) result.
val_cos = np.array([
    abs(cosine_similarity(x.reshape(1, -1), x_hat.reshape(1, -1)))[0, 0]
    for x, x_hat in zip(input_val_X, pred_val_X)
])
val_true = val_Y['Class'].to_numpy()

# Candidates run in steps of 0.01 from the lowest similarity seen on an anomalous row
# up to (but excluding) the highest similarity seen on a normal row.
for thr in np.arange(np.min(c_anormaly), np.max(c_normal), 0.01):
    # A row is predicted anomalous (1) when its similarity falls below the candidate.
    thr_pred = (val_cos < thr).astype(int)
    score = f1_score(val_true, thr_pred, average='macro')

    # Strict comparison: on ties the earliest (smallest) threshold is kept.
    if max_score < score:
        max_score = score
        best_thr = thr
        print(f'Max score : {max_score}')
        print(f'Best thr : {best_thr}')
        print('-' * 30)

# Lowest similarity observed on a normal row (not referenced again in the visible notebook).
abs_c_threshold = np.min(c_normal)
c_threshold = best_thr
print(f"임계값 : {c_threshold}")

---
- 맨해튼 거리

In [None]:
# Inspect normal rows: absolute value of the summed reconstruction residual for every
# validation row with Class == 0.
# NOTE(review): this is |sum(x - x_hat)|, where positive and negative residuals cancel;
# a Manhattan (L1) distance would be sum(|x - x_hat|). The thresholding and prediction
# cells use the same formula, so any correction has to be made in all of them together.
# Index labels are used as array positions, which assumes the default 0..n-1 index.
normal_rows = val_Y[val_Y.Class == 0].index
m_normal = np.array([
    abs(np.sum(input_val_X[row] - pred_val_X[row]))
    for row in normal_rows
])

In [None]:
# Smallest, largest and median absolute summed residual among the normal rows.
np.min(m_normal), np.max(m_normal), np.median(m_normal)

In [None]:
# Inspect anomalies: absolute value of the summed reconstruction residual for every
# validation row with Class == 1.
# NOTE(review): this is |sum(x - x_hat)|, where positive and negative residuals cancel;
# a Manhattan (L1) distance would be sum(|x - x_hat|). The thresholding and prediction
# cells use the same formula, so any correction has to be made in all of them together.
# Index labels are used as array positions, which assumes the default 0..n-1 index.
anomaly_rows = val_Y[val_Y.Class == 1].index
m_anormaly = np.array([
    abs(np.sum(input_val_X[row] - pred_val_X[row]))
    for row in anomaly_rows
])

In [None]:
# Smallest, largest and median absolute summed residual among the anomalous rows.
np.min(m_anormaly), np.max(m_anormaly), np.median(m_anormaly)

In [None]:
# Choose the summed-residual threshold that maximises macro F1 on the validation set.
max_score = 0
best_thr = 0

# The per-row score does not depend on the candidate threshold, so it is computed once
# here instead of being rebuilt on every iteration of the sweep.
# NOTE(review): this is |sum(x - x_hat)|, where positive and negative residuals cancel;
# a Manhattan (L1) distance would be sum(|x - x_hat|). The formula is kept as-is because
# the inspection and prediction cells use the same one — change them all together.
val_abs_sum = np.abs(np.sum(input_val_X - pred_val_X, axis=1))
val_true = val_Y['Class'].to_numpy()

# Candidates run in steps of 0.01 from the smallest score seen on a normal row up to
# (but excluding) the largest score seen on an anomalous row.
for thr in np.arange(np.min(m_normal), np.max(m_anormaly), 0.01):
    # A row is predicted anomalous (1) when its score exceeds the candidate.
    thr_pred = (val_abs_sum > thr).astype(int)
    score = f1_score(val_true, thr_pred, average='macro')

    # Strict comparison: on ties the earliest (smallest) threshold is kept.
    if max_score < score:
        max_score = score
        best_thr = thr
        print(f'Max score : {max_score}')
        print(f'Best thr : {best_thr}')
        print('-' * 30)

# Largest score observed on a normal row (not referenced again in the visible notebook).
abs_m_threshold = np.max(m_normal)
m_threshold = best_thr
print(f"임계값 : {m_threshold}")

### 3) 예측하기

In [None]:
# Score every validation row against its reconstruction with the three measures.
val_pairs = list(zip(input_val_X, pred_val_X))

# |cosine similarity|; each entry is the (1, 1) array returned by cosine_similarity.
pred_cosine = [
    abs(cosine_similarity(x.reshape(1, -1), x_hat.reshape(1, -1)))
    for x, x_hat in val_pairs
]
# Euclidean norm of the residual.
pred_distance = [abs(np.linalg.norm(x - x_hat)) for x, x_hat in val_pairs]
# NOTE(review): |sum(x - x_hat)| lets positive and negative residuals cancel; a
# Manhattan (L1) distance would be sum(|x - x_hat|). Kept as-is to match the
# thresholding cell.
pred_manhattan = [abs(np.sum(x - x_hat)) for x, x_hat in val_pairs]

In [None]:
# Flag a row as anomalous (1) when ANY measure crosses its threshold: similarity below
# c_threshold, distance above u_threshold, or summed residual above m_threshold.
pred_Class = [
    1 if (cos < c_threshold) | (dist > u_threshold) | (man > m_threshold) else 0
    for cos, dist, man in zip(pred_cosine, pred_distance, pred_manhattan)
]

### 4) f1_score

In [None]:
# Wrap the predicted labels in a one-column DataFrame named 'Class'.
pred_val_df = pd.DataFrame(pred_Class, columns=['Class'])

In [None]:
# Macro-averaged F1 of the combined rule on the validation labels.
f1_score(val_Y, pred_val_df, average='macro')

In [None]:
# Plain accuracy of the combined rule on the validation labels.
accuracy_score(val_Y, pred_val_df)

In [None]:
# Per-class precision / recall / F1 table for the validation predictions.
print(classification_report(val_Y, pred_val_df))

## Test data Predict

### 1) Test data 정의

In [None]:
# Test features as a NumPy array, restricted to the columns the autoencoder was trained on.
input_test_X = test_df[select_feature].to_numpy()


In [None]:
# Reconstruct the test rows. The prediction is round-tripped through a Python list
# before being rebuilt as an array — presumably to obtain a plain float64 NumPy
# array; confirm before simplifying.
pred_test_X = np.array(autoEncoder.predict(input_test_X).tolist())

### 2) 예측하기

In [None]:
# Score every test row against its reconstruction with the three measures.
pred_cosine = []
pred_distance = []
pred_manhattan = []

# Iterate over the test rows themselves. The loop previously ran for
# len(pred_val_X) iterations, which raises IndexError when the test set is smaller
# than the validation set and silently drops rows (producing a pred_Class that does
# not match the submission file's length) when it is larger.
for x, x_hat in zip(input_test_X, pred_test_X):
    # |cosine similarity|; each entry is the (1, 1) array returned by cosine_similarity.
    pred_cosine.append(abs(cosine_similarity(x.reshape(1, -1), x_hat.reshape(1, -1))))
    # Euclidean norm of the residual.
    pred_distance.append(abs(np.linalg.norm(x - x_hat)))
    # NOTE(review): |sum(x - x_hat)| lets positive and negative residuals cancel; a
    # Manhattan (L1) distance would be sum(|x - x_hat|). Kept as-is so the score stays
    # comparable with m_threshold, which was tuned on this same formula.
    pred_manhattan.append(abs(np.sum(x - x_hat)))

In [None]:
# Flag a test row as anomalous (1) when ANY measure crosses its threshold: similarity
# below c_threshold, distance above u_threshold, or summed residual above m_threshold.
pred_Class = [
    1 if (cos < c_threshold) | (dist > u_threshold) | (man > m_threshold) else 0
    for cos, dist, man in zip(pred_cosine, pred_distance, pred_manhattan)
]

### 3) submit

In [None]:
# Fill the sample submission's 'Class' column with the predicted labels and write the
# result back to the Drive folder without the DataFrame index.
submit = pd.read_csv(path + 'sample_submission.csv').assign(Class=pred_Class)
submit.to_csv(path + 'submit_keras.csv', index=False)