In [1]:
# Mount Google Drive in the Colab runtime and move into the project's data folder.
from google.colab import drive
# Alternative mount target left by the author (disabled):
# drive.mount('/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data')
drive.mount('/content/drive')

# The path below is relative to the current working directory, so this cell is
# meant to be run once from the runtime's starting directory. Later cells read
# their CSV files with paths relative to this folder (e.g. './train.csv').
%cd drive/MyDrive/IITP/sohyun/creditcard_prediction/data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data


In [2]:
# Install the Weights & Biases client (quiet output), then authenticate this runtime.
!pip install wandb -qqq
import wandb
# Prompts for an API key when the runtime has no stored credentials.
wandb.login()

[K     |████████████████████████████████| 1.9 MB 4.9 MB/s 
[K     |████████████████████████████████| 168 kB 65.6 MB/s 
[K     |████████████████████████████████| 182 kB 76.8 MB/s 
[K     |████████████████████████████████| 62 kB 1.6 MB/s 
[K     |████████████████████████████████| 168 kB 74.4 MB/s 
[K     |████████████████████████████████| 166 kB 76.2 MB/s 
[K     |████████████████████████████████| 166 kB 61.4 MB/s 
[K     |████████████████████████████████| 162 kB 71.1 MB/s 
[K     |████████████████████████████████| 162 kB 81.4 MB/s 
[K     |████████████████████████████████| 158 kB 87.0 MB/s 
[K     |████████████████████████████████| 157 kB 75.5 MB/s 
[K     |████████████████████████████████| 157 kB 86.9 MB/s 
[K     |████████████████████████████████| 157 kB 80.2 MB/s 
[K     |████████████████████████████████| 157 kB 88.1 MB/s 
[K     |████████████████████████████████| 157 kB 88.2 MB/s 
[K     |████████████████████████████████| 157 kB 85.5 MB/s 
[K     |██████████████████

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# wandb.init(project="") # wandb init

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for reproducible results.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomization in interpreters started after this point
    # (e.g. subprocesses); the running interpreter has already fixed its hash seed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (safe no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which undoes the determinism requested on the previous line.
    torch.backends.cudnn.benchmark = False

In [4]:
class MyDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame as float32 tensors.

    Args:
        df: DataFrame with one sample per row. When ``eval_mode`` is true it
            must contain a ``Class`` column, which is split off as the label.
        eval_mode: When true, items are ``(features, label)`` pairs; otherwise
            each item is just the feature tensor.

    Attributes:
        df: 2-D NumPy array of feature values (``Class`` removed in eval mode).
        labels: NumPy array of labels in eval mode, ``None`` otherwise.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            # drop() returns a new frame, so the caller's DataFrame is untouched.
            self.df = df.drop(columns=['Class']).values
        else:
            self.labels = None
            self.df = df.values

    def __getitem__(self, index):
        """Return the sample at ``index`` (with its label in eval mode)."""
        # Build the tensor locally instead of stashing it on ``self``: item
        # access should not leave per-call state behind on the dataset.
        features = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

class AutoEncoder(nn.Module):
    """Fully connected autoencoder that reconstructs its input vector.

    The encoder maps ``in_features -> hidden_features -> latent_features`` and
    the decoder maps back ``latent_features -> hidden_features -> in_features``.
    Every linear layer except the last is followed by BatchNorm1d and LeakyReLU.

    Args:
        in_features: Width of the input (and of the reconstruction). Default 30.
        hidden_features: Width of the intermediate layers. Default 64.
        latent_features: Width of the encoder output. Default 128.

    The defaults reproduce the original fixed 30-64-128-64-30 layout, and the
    layer order inside each ``nn.Sequential`` is unchanged, so state-dict keys
    (``Encoder.<i>.*`` / ``Decoder.<i>.*``) stay compatible with saved weights.
    """

    def __init__(self, in_features=30, hidden_features=64, latent_features=128):
        super().__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.BatchNorm1d(hidden_features),
            nn.LeakyReLU(),
            nn.Linear(hidden_features, latent_features),
            nn.BatchNorm1d(latent_features),
            nn.LeakyReLU(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(latent_features, hidden_features),
            nn.BatchNorm1d(hidden_features),
            nn.LeakyReLU(),
            nn.Linear(hidden_features, in_features),
        )

    def forward(self, x):
        """Encode then decode ``x``; the output has the same shape as ``x``."""
        return self.Decoder(self.Encoder(x))

class Trainer():
    """Trains a reconstruction model and tracks its best validation macro-F1.

    Args:
        model: Network to train; it is called as ``model(x)`` and expected to
            return a reconstruction of ``x``. May be wrapped in
            ``nn.DataParallel``.
        optimizer: Optimizer stepping the model's parameters.
        train_loader: Iterable yielding feature batches.
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Optional scheduler stepped once per epoch with the
            validation score (``None`` disables scheduling).
        device: Device on which training and validation run.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Reconstruction loss: mean absolute error between input and output.
        self.criterion = nn.L1Loss().to(self.device)

    def fit(self, config, modelNum=None):
        """Train for ``config.EPOCHS`` epochs, checkpointing on every improvement.

        After each epoch the model is validated with threshold ``config.thr``,
        the validation score and mean training loss are sent to ``wandb.log``,
        and, whenever the score beats the best one so far, the weights are
        saved to ``./best_model{modelNum}.pth`` (``./best_model.pth`` when
        ``modelNum`` is ``None``).

        Args:
            config: Object exposing ``EPOCHS`` and ``thr`` attributes.
            modelNum: Optional suffix for the checkpoint file name.

        Returns:
            The best validation score reached (0 if no epoch scored above 0,
            in which case no checkpoint was written).
        """
        self.model.to(self.device)

        # Parallel wrappers keep the real network under ``.module``; saving the
        # unwrapped network keeps checkpoint keys free of a ``module.`` prefix
        # and also works when the model was never wrapped.
        wrappers = (nn.DataParallel, nn.parallel.DistributedDataParallel)
        net = self.model.module if isinstance(self.model, wrappers) else self.model

        # ``is None`` rather than truthiness, so a suffix of 0 is honoured.
        suffix = '' if modelNum is None else modelNum
        checkpoint_path = f'./best_model{suffix}.pth'

        best_score = 0
        for epoch in range(config.EPOCHS):
            self.model.train()
            train_loss = []
            for x in self.train_loader:
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                # (prediction, target) order; L1 is symmetric so the value is unchanged.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            # Guard against an empty loader so logging never hits an undefined loss.
            mean_loss = float(np.mean(train_loss)) if train_loss else float('nan')
            score = self.validation(self.model, config.thr)

            # Log the epoch's mean loss as a plain float rather than the last
            # batch's loss tensor.
            wandb.log({
                "validation f1": score,
                "loss": mean_loss
            })
            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                print(f'Epoch : [{epoch}] Train loss : [{mean_loss}] Val Score : [{score}])')
                best_score = score
                torch.save(net.state_dict(), checkpoint_path, _use_new_zipfile_serialization=False)

        return best_score

    def validation(self, eval_model, thr):
        """Score ``eval_model`` on the validation loader.

        A sample is predicted as class 1 when the cosine similarity between
        the input and its reconstruction is below ``thr``, else class 0.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold.

        Returns:
            Macro-averaged F1 between the validation labels and predictions.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in self.val_loader:
                x = x.float().to(self.device)

                # Evaluate the model that was passed in, not unconditionally self.model.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff) < thr, 1, 0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [5]:
def prediction(model, thr, test_loader, device):
    """Flag each sample whose reconstruction diverges from its input.

    Args:
        model: Network called as ``model(batch)``; moved to ``device`` and put
            in eval mode.
        thr: Cosine-similarity threshold; similarity below it yields 1.
        test_loader: Iterable of feature batches.
        device: Device on which inference runs.

    Returns:
        A flat list of 0/1 integers, one per sample, in loader order.
    """
    similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
    model.to(device)
    model.eval()

    flags = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.float().to(device)
            reconstruction = model(batch)
            scores = similarity(batch, reconstruction).cpu().tolist()
            flags.extend(np.where(np.asarray(scores) < thr, 1, 0).tolist())
    return flags

In [None]:
def _make_trainer(train_df, val_loader, lr, batch_size):
    """Build a fresh DataParallel autoencoder and its Trainer for one run.

    Returns:
        ``(trainer, train_dataset)`` so the caller can also build other
        loaders over the same dataset.
    """
    train_dataset = MyDataset(df=train_df, eval_mode=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    model = nn.DataParallel(AutoEncoder())
    optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='max', factor=0.5, patience=5,
        threshold_mode='abs', min_lr=1e-8, verbose=True)
    trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
    return trainer, train_dataset


def main(config):
    """Train a refinement autoencoder, filter the training set, then train an ensemble.

    Stage 1 trains one autoencoder on all of ``./train.csv`` and uses its best
    checkpoint to pseudo-label the training rows; rows predicted as class 1
    are removed. Stage 2 trains ``config.K`` autoencoders, each on a bootstrap
    sample of the filtered rows. Checkpoints are written by ``Trainer.fit`` to
    ``./best_model.pth`` and ``./best_model{1..K}.pth``. Test-set inference is
    not done here.

    Args:
        config: Object exposing ``SEED``, ``BS``, ``LR``, ``K``, ``EPOCHS`` and
            ``thr``. An optional ``N_BOOTSTRAP`` attribute overrides the
            bootstrap sample size (default 50000).

    Returns:
        None.
    """
    seed_everything(config.SEED)  # Seed fix

    #---# DATA #---#
    train_df = pd.read_csv('./train.csv').drop(columns=['ID'])
    val_df = pd.read_csv('./val.csv').drop(columns=['ID'])

    val_dataset = MyDataset(df=val_df, eval_mode=True)
    val_loader = DataLoader(val_dataset, batch_size=config.BS, shuffle=False)

    # ---- Stage 1: refinement model trained on every training row ----
    trainer, train_dataset = _make_trainer(train_df, val_loader, lr=1e-2, batch_size=config.BS)
    trainer.fit(config)

    model = AutoEncoder()
    # map_location lets a checkpoint saved from the GPU load on a CPU-only runtime.
    model.load_state_dict(torch.load('./best_model.pth', map_location='cpu'))
    model = nn.DataParallel(model)
    model.eval()

    # Predict through an UNSHUFFLED loader: predictions come back in loader
    # order, so a shuffled loader would attach each pseudo-label to the wrong row.
    ordered_loader = DataLoader(train_dataset, batch_size=config.BS, shuffle=False)
    preds = prediction(model, config.thr, ordered_loader, device)
    print("<<<없애는 anomay 수>>> ", sum(preds))

    # Keep only rows predicted as class 0. Selecting with a mask builds a new
    # frame, so train_df itself is never given a temporary 'Class' column.
    keep_mask = np.asarray(preds) == 0
    train_df_pseudo = train_df.loc[keep_mask].reset_index(drop=True)
    if train_df_pseudo.empty:
        raise ValueError('every training row was flagged as an anomaly; nothing left to train on')

    # ---- Stage 2: ensemble, each member trained on a bootstrap sample ----
    n_bootstrap = getattr(config, 'N_BOOTSTRAP', 50000)
    for i in range(config.K):
        choose_idx = np.random.choice(train_df_pseudo.shape[0], n_bootstrap, replace=True)
        # Positional selection; equivalent to label lookup after reset_index above.
        train_df_choose = train_df_pseudo.iloc[choose_idx]

        trainer, _ = _make_trainer(train_df_choose, val_loader, lr=config.LR, batch_size=config.BS)
        trainer.fit(config, modelNum=(i + 1))

if __name__ == '__main__':
    # Open a W&B run first so the wandb.log calls made during training have a target.
    wandb.init()
    import easydict

    # Hyperparameters for the whole training pipeline.
    args = easydict.EasyDict({
        "K": 10,           # number of ensemble members
        "EPOCHS": 200,     # other values noted by the author: 65, 400
        "LR": 1e-2,
        "BS": 16384,
        "SEED": 1004,
        "thr": 0.95,       # cosine-similarity threshold
    })
    config = args

    # Echo the options, sorted by name, before training starts.
    print('------------ Options -------------')
    for k, v in sorted(args.items()):
        print(f'{k}: {v}')
    print('-------------- End ----------------')

    pred = main(config)

In [None]:
import easydict
args = easydict.EasyDict({
    "K" : 10,
    "EPOCHS" : 200, #65 ## 400
    "LR" : 1e-2,
    "BS" : 16384, #16384
    "SEED" : 1004,
    "thr" : 0.95
})
config = args

# A row is reported as an anomaly only when MORE than this many of the
# config.K ensemble members flag it (7 with K=10 means at least 8 votes).
VOTE_THRESHOLD = 7

#------------------#
#---# For test #---#
#------------------#
test_df = pd.read_csv('./test.csv')
test_df = test_df.drop(columns=['ID'])
test_dataset = MyDataset(test_df, False)
# Cap the worker count at the CPUs the runtime actually has; asking for more
# makes DataLoader warn and can slow loading down.
test_loader = DataLoader(test_dataset, batch_size=config.BS, shuffle=False,
                         num_workers=min(6, os.cpu_count() or 1))

# One list of 0/1 predictions per ensemble member.
model_preds = []
for i in range(config.K):
    model = AutoEncoder()
    # map_location lets checkpoints saved from the GPU load on a CPU-only runtime;
    # prediction() moves the model to `device` afterwards.
    model.load_state_dict(torch.load(f'./best_model{(i+1)}.pth', map_location='cpu'))
    model = nn.DataParallel(model)
    model.eval()
    preds = prediction(model, config.thr, test_loader, device)
    model_preds.append(preds)

# Rows = test samples, columns = ensemble members.
model_pred_df = pd.DataFrame(model_preds).transpose()
row_sum = model_pred_df.sum(axis=1)
pred = np.where(row_sum > VOTE_THRESHOLD, 1, 0)  # more votes -> anomaly

  cpuset_checked))


In [None]:
#---# For submission #---#
# Put the ensemble's predictions into the template's 'Class' column and save it.
submit = pd.read_csv('./sample_submission.csv')
submit = submit.assign(Class=pred)
submit.to_csv('./submit_autoencoder_with_vote_clean_10000.csv', index=False)