In [1]:
# Mount Google Drive inside the Colab runtime so the dataset files are reachable.
from google.colab import drive
# Earlier attempt that passed the data folder itself as the mount point; kept for reference.
# drive.mount('/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data')
drive.mount('/content/drive')

# Change into the data folder: later cells read and write files via './'-relative paths.
%cd drive/MyDrive/IITP/sohyun/creditcard_prediction/data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data


In [2]:
# Install the Weights & Biases client quietly, then authenticate this runtime.
# The training loop logs metrics through wandb.log, so this cell must run first.
!pip install wandb -qqq
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msohyun[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# wandb.init(project="") # wandb init

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and configures cuDNN for reproducible execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects Python interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running kernel is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Models are wrapped in DataParallel, so seed every visible GPU as well.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick kernels per run, which can
    # change results between runs; it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False

In [4]:
class MyDataset(Dataset):
    """Row-wise tensor view over a pandas DataFrame.

    Args:
        df: Source frame. When ``eval_mode`` is true it must contain a
            ``'Class'`` column, which is split off as the label vector.
        eval_mode: If true, items are ``(features, label)`` pairs; otherwise
            each item is the feature tensor alone.

    Attributes:
        df: Feature matrix as a NumPy array (the ``'Class'`` column removed in
            eval mode).
        labels: Label array; only set when ``eval_mode`` is true.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            # drop() returns a new frame, so the caller's DataFrame is untouched.
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        # Use locals rather than stashing the current row on `self`: the item
        # being fetched is not dataset state.
        features = torch.Tensor(self.df[index])
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        return len(self.df)

class AutoEncoder(nn.Module):
    """Fully connected autoencoder mapping 30 features back to 30 features.

    Layer widths are 30 -> 64 -> 128 (``Encoder``) and 128 -> 64 -> 30
    (``Decoder``). Every hidden linear layer is followed by BatchNorm1d and
    LeakyReLU; the final output layer is a plain linear projection.
    """

    def __init__(self):
        super().__init__()
        # Layers are instantiated encoder-first so parameter initialisation
        # draws from the RNG in the same order as the layer listing.
        encoder_layers = [
            nn.Linear(30, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64, 128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
        ]
        decoder_layers = [
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64, 30),
        ]
        self.Encoder = nn.Sequential(*encoder_layers)
        self.Decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the reconstruction of ``x`` (encode, then decode)."""
        return self.Decoder(self.Encoder(x))

class Trainer():
    """Trains a reconstruction model and checkpoints its best validation epoch.

    Args:
        model: Module to train; may be wrapped in ``nn.DataParallel``.
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable yielding feature batches (no labels).
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Optional scheduler stepped once per epoch with the
            validation score, or ``None``.
        device: Device on which training and validation run.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function: mean absolute reconstruction error.
        self.criterion = nn.L1Loss().to(self.device)

    def fit(self, config, modelNum=None):
        """Train for ``config.EPOCHS`` epochs, saving the best-scoring weights.

        After every epoch the model is scored with ``validation`` at threshold
        ``config.thr``; the score and the epoch's mean training loss are sent
        to wandb. Whenever the score strictly exceeds the best seen so far
        (starting from 0) the weights are written to
        ``./best_model{modelNum}.pth``, or ``./best_model.pth`` when
        ``modelNum`` is falsy.

        Args:
            config: Object exposing ``EPOCHS`` and ``thr``.
            modelNum: Optional suffix for the checkpoint file name.
        """
        self.model.to(self.device)
        # Truthiness test kept on purpose: modelNum=0 maps to the unsuffixed file.
        if modelNum:
            ckpt_path = f'./best_model{modelNum}.pth'
        else:
            ckpt_path = './best_model.pth'

        best_score = 0
        for epoch in range(config.EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                # nn.L1Loss takes (input, target): reconstruction first.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            # Report the epoch mean as a plain float instead of the last
            # batch's loss tensor; NaN marks an empty training loader.
            mean_loss = float(np.mean(train_loss)) if train_loss else float('nan')
            score = self.validation(self.model, config.thr)

            wandb.log({
                "validation f1": score,
                "loss": mean_loss
            })
            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                print(f'Epoch : [{epoch}] Train loss : [{mean_loss}] Val Score : [{score}])')
                best_score = score
                # Save the bare module's weights so the checkpoint loads into
                # an unwrapped model whether or not training used a wrapper.
                if isinstance(self.model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
                    bare_model = self.model.module
                else:
                    bare_model = self.model
                torch.save(bare_model.state_dict(), ckpt_path, _use_new_zipfile_serialization=False)

    def validation(self, eval_model, thr):
        """Score ``eval_model`` on the validation loader.

        A sample is predicted as class 1 when the cosine similarity between
        it and its reconstruction is below ``thr``, otherwise class 0.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold.

        Returns:
            Macro-averaged F1 score of the predictions against the labels.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                # Evaluate the model that was passed in, not self.model.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff) < thr, 1, 0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [5]:
def prediction(model, thr, test_loader, device):
  """Label every sample from ``test_loader`` as anomalous (1) or not (0).

  A sample is labelled 1 when the cosine similarity between it and the
  model's reconstruction is below ``thr``.

  Args:
    model: Reconstruction model; moved to ``device`` and put in eval mode.
    thr: Cosine-similarity threshold.
    test_loader: Iterable yielding feature batches (no labels).
    device: Device used for inference.

  Returns:
    Flat list of 0/1 integers, in the order the loader yielded the samples.
  """
  model.to(device)
  model.eval()
  similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
  labels = []
  with torch.no_grad():
    for batch in iter(test_loader):
      batch = batch.float().to(device)
      reconstructed = model(batch)
      # Compare in float64, as a Python list of floats would be.
      scores = similarity(batch, reconstructed).cpu().numpy().astype(np.float64)
      labels.extend((scores < thr).astype(int).tolist())
  return labels

In [None]:
def main(config):
  """Run the two-stage training pipeline and write the model checkpoints.

  Stage 1 ("refine") trains one autoencoder on the full training set, saved
  as ``./best_model.pth``. That model then pseudo-labels the training rows,
  and rows it flags as anomalous are dropped.
  Stage 2 ("ensemble") trains ``config.K`` autoencoders, each on a bootstrap
  sample of 50000 rows drawn with replacement from the cleaned set, saved as
  ``./best_model1.pth`` ... ``./best_model{K}.pth``.

  Test-time voting over the saved ensemble is done in a separate cell.

  Args:
    config: Object exposing ``SEED``, ``BS``, ``K``, ``LR``, ``EPOCHS`` and
      ``thr``.

  Returns:
    None. The results are the checkpoint files written to the working
    directory.
  """
  seed_everything(config.SEED) # Seed fix

  #---# DATA #---#
  train_df = pd.read_csv('./train.csv')
  train_df = train_df.drop(columns=['ID'])
  val_df = pd.read_csv('./val.csv')
  val_df = val_df.drop(columns=['ID'])

  val_dataset = MyDataset(df = val_df, eval_mode=True)
  val_loader = DataLoader(val_dataset, batch_size=config.BS, shuffle=False)

  # for refine
  train_dataset = MyDataset(df=train_df, eval_mode=False)
  train_loader = DataLoader(train_dataset, batch_size=config.BS, shuffle=True)
  # NOTE(review): this stage uses a fixed lr of 1e-2 rather than config.LR.
  model, optimizer, scheduler = _build_training_stack(1e-2)

  trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
  trainer.fit(config)

  model = AutoEncoder()
  # map_location lets a checkpoint saved from GPU tensors load on any device.
  model.load_state_dict(torch.load('./best_model.pth', map_location=device))
  model = nn.DataParallel(model)
  model.eval()

  # Pseudo-label through an UNSHUFFLED loader: predictions come back in the
  # order the loader yields rows, so a shuffled loader would pair each flag
  # with the wrong row of train_df.
  label_loader = DataLoader(train_dataset, batch_size=config.BS, shuffle=False)
  preds = prediction(model, config.thr, label_loader, device)
  # Message reads "number of anomalies being removed".
  print("<<<없애는 anomay 수>>> ", sum(preds))

  # Keep only rows not flagged as anomalous; train_df itself is not modified.
  flagged = np.asarray(preds, dtype=bool)
  train_df_pseudo = train_df.loc[~flagged].reset_index(drop=True)

  # for ensemble
  for i in range(config.K):
    # Bootstrap sample: 50000 positional row indices drawn with replacement.
    choose_idx = np.random.choice(train_df_pseudo.shape[0], 50000, replace=True)
    train_df_choose = train_df_pseudo.iloc[choose_idx, :]
    train_dataset = MyDataset(df=train_df_choose, eval_mode=False)
    train_loader = DataLoader(train_dataset, batch_size=config.BS, shuffle=True)

    model, optimizer, scheduler = _build_training_stack(config.LR)

    trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
    trainer.fit(config, modelNum=(i+1))


def _build_training_stack(lr):
  """Create a fresh DataParallel autoencoder with its Adam optimizer and LR scheduler."""
  model = nn.DataParallel(AutoEncoder())
  optimizer = torch.optim.Adam(params = model.parameters(), lr = lr)
  # mode='max': the scheduler is stepped with the validation F1 score.
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, threshold_mode='abs', min_lr=1e-8, verbose=True)
  return model, optimizer, scheduler

if __name__ == '__main__':
  # Open a wandb run before training so the per-epoch metrics have a target.
  wandb.init()
  import easydict

  # Hyper-parameters for the run. Trailing notes record other values that were tried.
  args = easydict.EasyDict(dict(
      K=10,
      EPOCHS=200,  # also tried: 65, 400
      LR=1e-2,
      BS=16384,
      SEED=1004,
      thr=0.95,
  ))
  config = args

  # Echo the options in alphabetical key order.
  print('------------ Options -------------')
  for k in sorted(args):
    v = args[k]
    print('%s: %s' % (str(k), str(v)))
  print('-------------- End ----------------')

  pred = main(config)

In [7]:
import easydict
args = easydict.EasyDict({
    "K" : 10,
    "EPOCHS" : 200, # also tried: 65, 400
    "LR" : 1e-2,
    "BS" : 16384,
    "SEED" : 1004,
    "thr" : 0.95
})
config = args

# A test row is flagged as an anomaly only when strictly more than this many
# of the config.K ensemble members vote "anomaly".
VOTE_THRESHOLD = 7

#---# For test #---#
test_df = pd.read_csv('./test.csv')
test_df = test_df.drop(columns=['ID'])
test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=config.BS, shuffle=False, num_workers=6)

# One list of 0/1 predictions per saved ensemble member.
model_preds = []
for i in range(config.K):
  model = AutoEncoder()
  # map_location lets checkpoints saved from GPU tensors load on a CPU-only runtime.
  model.load_state_dict(torch.load(f'./best_model{(i+1)}.pth', map_location=device))
  model = nn.DataParallel(model)
  model.eval()
  preds = prediction(model, config.thr, test_loader, device)
  model_preds.append(preds)

# Transpose to rows = test samples, columns = models, then count votes per sample.
model_pred_df = pd.DataFrame(model_preds).transpose()
row_sum = model_pred_df.sum(axis=1)
pred = np.where(row_sum > VOTE_THRESHOLD, 1, 0) # more votes = more likely an anomaly

  cpuset_checked))


In [8]:
#---# For submission #---#
# Take the sample submission layout, set its 'Class' column to the ensemble
# predictions, and write the result out without the index column.
submit = pd.read_csv('./sample_submission.csv').assign(Class=pred)
submit.to_csv('./submit_autoencoder_with_vote_clean_10000.csv', index=False)

In [None]:
# from mpl_toolkits.mplot3d import Axes3D
# import matplotlib.pyplot as plt
# %matplotlib notebook
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(test_z[:,1],test_z[:,0], test_z[:,2], c=test_labels.astype(int))
# ax.set_xlabel('Encoded')
# ax.set_ylabel('Euclidean')
# ax.set_zlabel('Cosine')
# plt.show()

In [None]:
# Count of test rows flagged as anomalies (pred is 1 for anomaly, 0 otherwise).
sum(pred)

338

In [11]:
# Reload the training and validation frames, discarding the 'ID' column.
train_df = pd.read_csv('./train.csv').drop(columns=['ID'])
val_df = pd.read_csv('./val.csv').drop(columns=['ID'])

In [12]:
# Display the training frame loaded in the previous cell.
train_df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972
2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.081080,-0.256131,-0.994960
3,-0.644269,1.417964,1.074380,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,0.262698,-0.994901
4,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.410430,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.994900,-0.994901
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113837,-12.516732,10.187818,-8.476671,-2.510473,-4.586669,-1.394465,-3.632516,5.498583,4.893089,8.655320,...,-0.944759,-1.565026,0.890675,-1.253276,1.786717,0.320763,2.090712,1.232864,-0.169496,1.034857
113838,1.884849,-0.143540,-0.999943,1.506772,-0.035300,-0.613638,0.190241,-0.249058,0.666458,0.120908,...,0.144008,0.634646,-0.042114,-0.053206,0.316403,-0.461441,0.018265,-0.041068,0.530986,1.034881
113839,-0.241923,0.712247,0.399806,-0.463406,0.244531,-1.343668,0.929369,-0.206210,0.106234,-0.284708,...,-0.228876,-0.514376,0.279598,0.371441,-0.559238,0.113144,0.131507,0.081265,-0.230699,1.034904
113840,0.120316,0.931005,-0.546012,-0.745097,1.130314,-0.235973,0.812722,0.115093,-0.204064,-0.657422,...,-0.314205,-0.808520,0.050343,0.102800,-0.435870,0.124079,0.217940,0.068803,-0.269825,1.034939
