## 데이터 로드

In [1]:
# Locations of the three CSV splits, relative to the notebook.
file_name_train = './data/train.csv'
file_name_val = './data/val.csv'
file_name_test = './data/test.csv'

In [2]:
# Suppress all warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np

# Read each CSV split into its own DataFrame.
train_df = pd.read_csv(file_name_train)
val_df = pd.read_csv(file_name_val)
test_df = pd.read_csv(file_name_test)

## 데이터 분석

### 1) 데이터 시각화 분석

- 피처 선정
  - 제외 : V8, V13, V15, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, V29
  - 애매 : V30 — 시각화만으로는 판단하기 어려워 일단 선정 목록에 포함한다.
  - 선정 : V1, V2, V3, V4, V5, V6, V7, V9, V10, V11, V12, V14, V16, V17, V18, V30

In [4]:
# Feature columns chosen from the data visualisation step.
select_feature = [
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7',
    'V9', 'V10', 'V11', 'V12',
    'V14', 'V16', 'V17', 'V18',
    'V30',
]
# The validation frame keeps the same features plus its label column.
select_feature_val = [*select_feature, 'Class']

### 2) 데이터 확인

In [5]:
# Preview the first rows of the training frame as loaded from CSV.
train_df.head()

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,3,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,4,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972
2,6,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,-0.256131,-0.99496
3,8,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,0.262698,-0.994901
4,9,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.9949,-0.994901


In [6]:
# Preview the first rows of the validation frame (it carries a Class column).
val_df.head()

Unnamed: 0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V22,V23,V24,V25,V26,V27,V28,V29,V30,Class
0,10,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,-0.255991,-0.994878,0
1,22,0.962496,0.328461,-0.171479,2.109204,1.129566,1.696038,0.107712,0.521502,-1.191311,...,0.402492,-0.048508,-1.371866,0.390814,0.199964,0.016371,-0.014605,0.168937,-0.994784,0
2,63,1.145524,0.575068,0.194008,2.598192,-0.09221,-1.04443,0.531588,-0.241888,-0.896287,...,-0.119703,-0.07651,0.69132,0.633984,0.048741,-0.053192,0.016251,0.169496,-0.994502,0
3,69,0.92706,-0.323684,0.387585,0.544474,0.246787,1.650358,-0.427576,0.615371,0.226278,...,0.079359,0.096632,-0.992569,0.085096,0.377447,0.036096,-0.00596,0.331307,-0.994467,0
4,83,-3.005237,2.600138,1.483691,-2.418473,0.306326,-0.824575,2.065426,-1.829347,4.009259,...,-0.181268,-0.163747,0.515821,0.136318,0.460054,-0.251259,-1.105751,-0.287012,-0.994373,0


## AutoEncoder 모델 생성
- 원본 Input과 Decoding된 Output의 코사인 유사도를 비교하여 이상치를 탐지한다.

### 1) AutoEncoder 모델 생성을 위한 패키지 로드

In [7]:
import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

### 2) gpu 설정

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 3) 하이퍼 파라미터 변수 정의

In [9]:
EPOCHS = 50    # number of training epochs
LR = 0.01      # learning rate
BS = 16_384    # batch size
SEED = 41      # value used to fix the random seeds

### 4) Random Seed 고정

In [10]:
def seed_everything(seed):
    """Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-select kernels, which may differ between
    # runs; it has to be off for the deterministic flag above to be effective.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED) # fix the random seeds

### 5) DataSet 생성

In [11]:
class MyDataset(Dataset):
    """Map-style dataset over the rows of a feature DataFrame.

    Args:
        df: DataFrame of numeric feature columns. When ``eval_mode`` is true
            it must also contain a ``Class`` label column.
        eval_mode: If true, items are ``(features, label)`` pairs; otherwise
            each item is the feature tensor alone.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            # Split the label column off so only features reach the model.
            self.labels = df['Class'].values
            df = df.drop(columns=['Class'])
        # The attribute keeps the name `df` but holds a NumPy array from here on.
        self.df = df.values

    def __getitem__(self, index):
        # Build the sample from locals only: reading an item must not leave
        # scratch attributes behind on the dataset object.
        x = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return x, self.labels[index]
        return x

    def __len__(self):
        return len(self.df)

In [12]:
# Narrow both frames to the selected columns; the validation frame keeps its label.
train_df = train_df[select_feature]
val_df = val_df[select_feature_val]

# Training split: feature rows only, reshuffled on every pass.
train_dataset = MyDataset(train_df, eval_mode=False)
train_loader = DataLoader(dataset=train_dataset, batch_size=BS, num_workers=4, shuffle=True)

# Validation split: (features, label) pairs, iterated in a fixed order.
val_dataset = MyDataset(val_df, eval_mode=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BS, num_workers=4, shuffle=False)

### 6) 1D AutoEncoder 모델 정의

In [13]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder over flat feature rows.

    The encoder widens the input to 128 and then 256 units; the decoder maps
    back through 128 units to the input width. Hidden linear layers are each
    followed by batch normalisation and LeakyReLU; the output layer is linear.

    Args:
        in_features: Width of each input row and of the reconstruction.
            Defaults to 16, the number of entries in ``select_feature``.
    """

    def __init__(self, in_features=16):
        super().__init__()
        # Attribute names `Encoder` / `Decoder` are part of the state-dict
        # keys, so they must stay as they are for saved checkpoints to load.
        self.Encoder = nn.Sequential(
            nn.Linear(in_features, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            nn.Linear(128, in_features),
        )

    def forward(self, x):
        """Return the reconstruction of ``x`` (same shape as the input)."""
        return self.Decoder(self.Encoder(x))

## 학습

### 1) 학습 클래스 정의

In [14]:
class Trainer():
    """Train an autoencoder and keep the checkpoint with the best validation F1.

    Args:
        model: Network to train; may be wrapped in ``nn.DataParallel``.
        optimizer: Optimizer over the model's parameters.
        train_loader: Iterable yielding feature batches (no labels).
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Scheduler stepped once per epoch with the validation
            score, or ``None`` to disable scheduling.
        device: Device the model and every batch are moved to.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Reconstruction loss: mean absolute error between input and output.
        self.criterion = nn.L1Loss().to(self.device)

    def fit(self):
        """Run ``EPOCHS`` epochs, validating after each one.

        Prints the mean training loss and validation score per epoch and
        writes the model's state dict to ``./model/best_model.pth`` every
        time the validation score improves on the best seen so far.
        """
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in self.train_loader:
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                # (prediction, target) order; L1 is symmetric so the value is unchanged.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.961)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                # Save the trainer's own model rather than a notebook global,
                # unwrapping DataParallel so the checkpoint loads into a bare
                # AutoEncoder.
                net = self.model.module if isinstance(self.model, nn.DataParallel) else self.model
                # torch.save does not create missing directories.
                os.makedirs('./model', exist_ok=True)
                torch.save(net.state_dict(), './model/best_model.pth', _use_new_zipfile_serialization=False)

    def validation(self, eval_model, thr):
        """Score ``eval_model`` on the validation loader.

        A row is predicted as class 1 when the cosine similarity between the
        row and its reconstruction is strictly below ``thr``, else class 0.

        Args:
            eval_model: Model used for the forward pass; switched to eval mode.
            thr: Cosine-similarity cut-off.

        Returns:
            Macro-averaged F1 score of the predictions against the labels.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in self.val_loader:
                x = x.float().to(self.device)
                similarity = cos(x, eval_model(x)).cpu().tolist()
                pred.extend(1 if s < thr else 0 for s in similarity)
                true += y.tolist()

        return f1_score(true, pred, average='macro')

### 2) 모델 학습

In [15]:
# First-stage training on the full (unfiltered) training split.
model = nn.DataParallel(AutoEncoder())
# NOTE(review): this eval() has no lasting effect — Trainer.fit() puts the
# model back into train mode at the start of every epoch.
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
# mode='max' because the scheduler is stepped with the validation score,
# which is expected to increase.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

Epoch : [0] Train loss : [0.41937497683933805] Val Score : [0.41608091863813823])
Epoch : [1] Train loss : [0.17159425147942134] Val Score : [0.5311641084852974])
Epoch : [2] Train loss : [0.12245482525655202] Val Score : [0.699339777108752])
Epoch : [3] Train loss : [0.09869449053491865] Val Score : [0.8786471773914175])
Epoch : [4] Train loss : [0.08679917348282677] Val Score : [0.8786471773914175])
Epoch : [5] Train loss : [0.07910579868725368] Val Score : [0.9031202878275757])
Epoch : [6] Train loss : [0.07584677317312785] Val Score : [0.9031202878275757])
Epoch : [7] Train loss : [0.07383141879524503] Val Score : [0.9031202878275757])
Epoch : [8] Train loss : [0.07180521956511907] Val Score : [0.9031202878275757])
Epoch : [9] Train loss : [0.0698029420205525] Val Score : [0.9031202878275757])
Epoch : [10] Train loss : [0.06759510508605412] Val Score : [0.8998944850872257])
Epoch : [11] Train loss : [0.06567889345543725] Val Score : [0.7939330251820103])
Epoch : [12] Train loss : [

## remove outlier from train data and retraining

### 1) 모델 불러오기

In [None]:
# Rebuild the bare network and restore the best checkpoint written by training.
model = AutoEncoder()
# map_location='cpu' lets a checkpoint saved from a GPU load on any host; the
# freshly built model lives on the CPU here and is moved to `device` later.
model.load_state_dict(torch.load('./model/best_model.pth', map_location='cpu'))
model = nn.DataParallel(model)
model.eval()

### 2) 예측

In [None]:
def prediction(model, thr, test_loader, device):
    """Flag rows whose reconstruction is dissimilar to the input.

    Args:
        model: Module mapping a feature batch to a reconstruction of the
            same shape. It is moved to ``device`` and set to eval mode.
        thr: Cosine-similarity cut-off; rows scoring strictly below it are
            labelled 1, all others 0.
        test_loader: Iterable yielding feature batches (no labels).
        device: Device used for the forward pass.

    Returns:
        List of 0/1 ints, one per row, in the order the loader yields them.
    """
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in test_loader:
            x = x.float().to(device)
            similarity = cos(x, model(x)).cpu().tolist()
            pred.extend(1 if s < thr else 0 for s in similarity)
    return pred

In [None]:
# train_loader was built with shuffle=True, so its batch order does not follow
# train_df's row order. Predict through an unshuffled loader over the same
# dataset so that preds[i] really belongs to row i.
label_loader = DataLoader(train_dataset, batch_size=BS, shuffle=False, num_workers=4)
preds = prediction(model, 0.99, label_loader, device)

### 3) train data labeling

In [None]:
# Attach the predicted 0/1 flag to each training row; this is positional, so
# preds must be in the same order as train_df's rows.
train_df['Class'] = preds

### 4) remove outlier

In [None]:
# Keep only the rows that were not flagged (Class == 0).
clean_train_df = train_df.loc[train_df['Class'] == 0]

In [None]:
# Back to feature columns only; the temporary Class column is not a model input.
clean_train_df = clean_train_df[select_feature]

In [None]:
# Row count before outlier removal: the unfiltered frame, not the cleaned one.
len(train_df) # before

In [None]:
# Row count once the flagged rows have been dropped.
len(clean_train_df) # after

### 5) dataset

In [None]:
# Dataset and shuffled loader over the training rows that were kept.
clean_train_dataset = MyDataset(clean_train_df, eval_mode=False)
clean_train_loader = DataLoader(dataset=clean_train_dataset, batch_size=BS, num_workers=4, shuffle=True)

### 6) retraining

In [None]:
# Drop the reference to the first-stage model before a new one is built.
del model

In [None]:
# Second-stage training: a freshly initialised network trained on the cleaned
# training split, validated on the same validation loader.
# NOTE(review): Trainer.fit() restarts its best score at 0 and writes to the
# same './model/best_model.pth', so the first-stage checkpoint is overwritten.

model = nn.DataParallel(AutoEncoder())
# NOTE(review): no lasting effect — Trainer.fit() switches back to train mode.
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, clean_train_loader, val_loader, scheduler, device)
trainer.fit()

## 추론

In [None]:
# Rebuild the bare network and restore the best checkpoint for inference.
model = AutoEncoder()
# map_location='cpu' lets a checkpoint saved from a GPU load on any host; the
# freshly built model lives on the CPU here and is moved to `device` later.
model.load_state_dict(torch.load('./model/best_model.pth', map_location='cpu'))
model = nn.DataParallel(model)
model.eval()

In [None]:
# Keep only the selected feature columns of the test frame.
test_df = test_df[select_feature]

In [None]:
# Unlabeled test split, iterated in a fixed order (no shuffling).
test_dataset = MyDataset(df=test_df, eval_mode=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BS, num_workers=4, shuffle=False)

In [None]:
def prediction(model, thr, test_loader, device):
    """Flag rows whose reconstruction is dissimilar to the input.

    Args:
        model: Module mapping a feature batch to a reconstruction of the
            same shape. It is moved to ``device`` and set to eval mode.
        thr: Cosine-similarity cut-off; rows scoring strictly below it are
            labelled 1, all others 0.
        test_loader: Iterable yielding feature batches (no labels).
        device: Device used for the forward pass.

    Returns:
        List of 0/1 ints, one per row, in the order the loader yields them.
    """
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in test_loader:
            x = x.float().to(device)
            similarity = cos(x, model(x)).cpu().tolist()
            pred.extend(1 if s < thr else 0 for s in similarity)
    return pred

In [None]:
# Label the test rows with the restored model.
# NOTE(review): the cut-off here is 0.95, while Trainer.fit() selects the best
# checkpoint using 0.961 — confirm the difference is intentional.
preds = prediction(model, 0.95, test_loader, device)

In [None]:
# Fill the sample submission's Class column with the test predictions.
submit = pd.read_csv('./data/sample_submission.csv')
submit['Class'] = preds
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./submit', exist_ok=True)
submit.to_csv('./submit/submit.csv', index=False)