In [1]:
from google.colab import drive
# drive.mount('/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data')
# Mount Google Drive at /content/drive so the project folder is reachable from this runtime.
drive.mount('/content/drive')

# Change into the data folder; the relative './train.csv', './val.csv' and './test.csv'
# paths used later in the notebook resolve against this working directory.
%cd drive/MyDrive/IITP/sohyun/creditcard_prediction/data

Mounted at /content/drive
/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data


In [16]:
# Install the Weights & Biases client quietly (no version is pinned here).
!pip install wandb -qqq
import wandb
# Authenticate this runtime with W&B; per the recorded output the API key is appended to /root/.netrc.
wandb.login()

[K     |████████████████████████████████| 1.8 MB 14.9 MB/s 
[K     |████████████████████████████████| 157 kB 58.5 MB/s 
[K     |████████████████████████████████| 181 kB 59.2 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 157 kB 74.9 MB/s 
[K     |████████████████████████████████| 156 kB 74.9 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [15]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# wandb.init(project="") # wandb init
# Seed passed to seed_everything() further down in this cell.
SEED = 1004

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # The model is wrapped in nn.DataParallel, so seed every visible GPU, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to run,
    # which defeats the deterministic flag above; it must be off for reproducible runs.
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)  # fix all RNG seeds before any stochastic step runs

# Read the three splits from the working directory and discard the 'ID' column from each.
train_df = pd.read_csv('./train.csv').drop(columns=['ID'])
val_df = pd.read_csv('./val.csv').drop(columns=['ID'])
test_df = pd.read_csv('./test.csv').drop(columns=['ID'])

# Refine train_df by taking each feature's distribution into account:
# rows that fall in the extreme tails of many features at once are removed.
contamination = 0.0010551  # tail mass per feature, split evenly between the lower and the upper tail
thrs = []  # thrs[i] = [lower quantile, upper quantile] of the i-th column of train_df
for i in range(30) :
  a = train_df.iloc[:,i].quantile(contamination/2)
  b = train_df.iloc[:,i].quantile(1 - contamination/2)
  thrs.append([a,b])

idxs = []; idx1 = 0; idx2 = 0
for i in range(30) :
  # NOTE(review): thresholds above were taken by column position but are applied by name here;
  # this assumes the first 30 columns are V1..V30 in that order — confirm against the CSV.
  c_name = "V" + str(i+1)
  # Masking with a boolean DataFrame leaves NaN where the condition is false, so the
  # non-NaN positions are the rows at or below the lower threshold.
  idx1 = train_df[(train_df[[c_name]] <= thrs[i][0])][[c_name]].values.flatten()
  idx1 = np.where([np.logical_not(np.isnan(idx1))])[1]
  
  # Same trick for the rows at or above the upper threshold.
  idx2 = train_df[(train_df[[c_name]] >= thrs[i][1])][[c_name]].values.flatten()
  idx2 = np.where([np.logical_not(np.isnan(idx2))])[1]

  # One entry per (row, tail hit); a row shows up once for every feature in which it is extreme.
  idxs.extend(np.concatenate((idx1, idx2)))

from collections import Counter
counter = Counter(idxs)  # row position -> number of tail hits across the 30 features

# Rows with 8 or more tail hits are treated as pseudo-anomalies.
pseudo_anomal = []
for k, v in dict(counter).items():
  if v >= 8 :
    pseudo_anomal.append(k)

# drop() takes index labels while pseudo_anomal holds row positions; the two coincide only
# for a default 0..n-1 index — presumably the case right after read_csv; verify.
train_df = train_df.drop(pseudo_anomal).reset_index(drop=True)

#-------------------#
#---# Normalize #---#
#-------------------#
# case 1 - standardscaler
# from sklearn.preprocessing import StandardScaler
# scaler_n = StandardScaler()
# scaler_n.fit(train_df)

# val_x = val_df.drop(columns=['Class'])
# train_x_scaleN = pd.DataFrame(scaler_n.transform(train_df), columns = train_df.columns) # 확인 : train_x_scaleN.mean(), train_x_scaleN.var()
# val_x_scaleN = pd.DataFrame(scaler_n.transform(val_x), columns = val_x.columns)
# test_x_scaleN = pd.DataFrame(scaler_n.transform(test_df), columns = test_df.columns)

# train_df = train_x_scaleN
# val_df = pd.concat([val_x_scaleN, pd.DataFrame(val_df['Class'])], axis=1)
# test_df = test_x_scaleN

In [18]:
# Training hyper-parameters for this run.
EPOCHS = 400   # number of epochs Trainer.fit() loops over
LR = 0.01      # initial Adam learning rate
BS = 2 ** 14   # batch size (16384) shared by all three DataLoaders
WD = None      # weight decay; a falsy value leaves Adam's default in place (1e-4 was also tried)

class MyDataset(Dataset):
    """Map-style dataset over the rows of a pandas DataFrame.

    Args:
        df: DataFrame whose rows are samples. When ``eval_mode`` is true it
            must contain a 'Class' column, which is split off as the labels.
        eval_mode: If true, items are ``(features, label)`` pairs; otherwise
            items are the feature tensor only.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return the sample at ``index`` as a float32 tensor (plus its label in eval mode)."""
        # Keep the sample in locals: storing it on ``self`` made every access mutate
        # the shared dataset object for no benefit.
        x = torch.Tensor(self.df[index])
        if self.eval_mode:
            return x, self.labels[index]
        return x

    def __len__(self):
        """Number of rows in the underlying feature array."""
        return len(self.df)

# Training data carries no labels (the auto-encoder only reconstructs its input); it is reshuffled every epoch.
train_dataset = MyDataset(df=train_df, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True)

# Validation data keeps the 'Class' column so an F1 score can be computed.
val_dataset = MyDataset(df = val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False)

# The test set is already in memory and each item is a cheap array slice, so worker
# processes add nothing. The previous num_workers=6 exceeded what the runtime offers
# and produced the DataLoader worker-count warning seen in the recorded output.
test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False)

class AutoEncoder(nn.Module):
    """Fully connected auto-encoder over 30 input features.

    Layer widths are 30 -> 64 -> 128 (Encoder) and 128 -> 64 -> 30 (Decoder).
    Every hidden Linear layer is followed by BatchNorm1d and LeakyReLU; the
    final Linear layer has no normalisation or activation.

    Variants that were tried and left out: LayerNorm / ReLU in place of
    BatchNorm1d / LeakyReLU, a 30 -> 15 -> 10 -> 5 bottleneck with a mirrored
    decoder, and a deeper 30 -> 60 -> 120 -> 60 -> 30 -> 5 encoder with a
    5 -> 30 -> 60 -> 120 -> 30 decoder.
    """

    def __init__(self):
        super().__init__()

        def stage(n_in, n_out):
            # One hidden stage: Linear -> BatchNorm1d -> LeakyReLU.
            return [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.LeakyReLU()]

        # The stages are unpacked into flat Sequentials so the sub-module indices
        # (and therefore the state_dict keys) stay Encoder.0..5 / Decoder.0..3.
        self.Encoder = nn.Sequential(*stage(30, 64), *stage(64, 128))
        self.Decoder = nn.Sequential(*stage(128, 64), nn.Linear(64, 30))

    def forward(self, x):
        """Encode ``x`` and decode it back to a tensor with 30 features per row."""
        return self.Decoder(self.Encoder(x))

class Trainer():
    """Trains a reconstruction model and scores it as an anomaly detector.

    Args:
        model: The network to train (optionally wrapped in ``nn.DataParallel``).
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Iterable yielding feature batches only.
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Optional LR scheduler whose ``step`` accepts the validation
            score; ``None`` disables scheduling.
        device: Device the model and the batches are moved to.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)

    def fit(self, ):
        """Train for ``EPOCHS`` epochs, checkpointing whenever the validation score improves.

        After every epoch the model is scored with ``validation`` at a cosine
        threshold of 0.95, the scheduler (if any) is stepped with that score,
        and the score is logged to wandb. The best weights so far are written
        to './best_model.pth'.
        """
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                loss = self.criterion(x, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                # Save the trainer's own model (not whatever the global `model` happens to be),
                # unwrapping a parallel wrapper so the checkpoint loads into a bare network.
                net = self.model
                if isinstance(net, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
                    net = net.module
                torch.save(net.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
            wandb.log({
                "validation f1": score
            })

    def validation(self, eval_model, thr):
        """Return the macro F1 of ``eval_model`` on the validation loader.

        A row is predicted as class 1 when the cosine similarity between the
        input and its reconstruction is below ``thr``, otherwise as class 0.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold below which a row is flagged.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                # Score the model that was passed in; previously self.model was used
                # and the eval_model argument was silently ignored.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

# Start a W&B run, build the model and its optimisation stack, then train.
wandb.init()
model = nn.DataParallel(AutoEncoder())
model.eval()

# A falsy WD (None / 0) leaves Adam's default weight decay of 0 in place.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR, weight_decay=WD or 0)

# Halve the learning rate when the validation score (higher is better) stops improving.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=10,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50, eta_min=0.1)
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.00005, 
#                                               step_size_up=5, max_lr=0.0001, 
#                                               gamma=0.5, mode='exp_range')

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    scheduler=scheduler,
    device=device,
)
trainer.fit()

  cpuset_checked))


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
validation f1,▁▇▇▇█▇██████████████████████████████████

0,1
validation f1,0.53942


Epoch : [0] Train loss : [0.5504933893680573] Val Score : [0.0014752523784167275])
Epoch : [1] Train loss : [0.3603777928011758] Val Score : [0.06484129100843904])
Epoch : [2] Train loss : [0.2681117185524532] Val Score : [0.29602032994949545])
Epoch : [3] Train loss : [0.21450603221143996] Val Score : [0.4111044055839009])
Epoch : [4] Train loss : [0.18204315858227865] Val Score : [0.4465154220118763])
Epoch : [5] Train loss : [0.16024331322738103] Val Score : [0.47455413607102637])
Epoch : [6] Train loss : [0.14460599422454834] Val Score : [0.48724719278117745])
Epoch : [7] Train loss : [0.13275623534406936] Val Score : [0.49429601949436064])
Epoch : [8] Train loss : [0.12325334868260793] Val Score : [0.49842777698275637])
Epoch : [9] Train loss : [0.11725947047982897] Val Score : [0.5045806247968846])
Epoch : [10] Train loss : [0.11233177461794444] Val Score : [0.5067537167017901])
Epoch : [11] Train loss : [0.10849422109978539] Val Score : [0.5095524022499065])
Epoch : [12] Train l

In [None]:
# for test
# Rebuild a bare AutoEncoder and restore the best checkpoint written by Trainer.fit().
model = AutoEncoder()
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved from a GPU run can also be loaded on a CPU-only runtime.
model.load_state_dict(torch.load('./best_model.pth', map_location=device))
model = nn.DataParallel(model)
model.eval()

def prediction(model, thr, test_loader, device):
    """Flag rows whose reconstruction is dissimilar from the input.

    Args:
        model: Reconstruction model; it is moved to ``device`` and put in eval mode.
        thr: Cosine-similarity threshold; rows scoring below it are flagged.
        test_loader: Iterable yielding feature batches.
        device: Device on which inference runs.

    Returns:
        A flat list with one int per row, in loader order: 1 where the cosine
        similarity between input and reconstruction is below ``thr``, else 0.
    """
    model.to(device)
    model.eval()
    similarity = nn.CosineSimilarity(dim=1, eps=1e-6)

    flags = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.float().to(device)
            scores = np.array(similarity(batch, model(batch)).cpu().tolist())
            flags.extend((scores < thr).astype(int).tolist())
    return flags

# Flag test rows whose input/reconstruction cosine similarity is below 0.97.
# NOTE(review): Trainer.fit() validates with a 0.95 threshold — confirm the different value here is intentional.
preds = prediction(model, 0.97, test_loader, device)

  cpuset_checked))


In [None]:
# Number of test rows flagged as anomalies, and their share of the test set.
n_flagged = sum(preds)
n_flagged, n_flagged / len(test_df)

(322, 0.002259601552248023)

In [None]:
# Put the predictions into the sample submission's 'Class' column and write the result next to the data.
submit = pd.read_csv('./sample_submission.csv').assign(Class=preds)
submit.to_csv('./submit_autoencoder2.csv', index=False)

In [None]:
# from mpl_toolkits.mplot3d import Axes3D
# import matplotlib.pyplot as plt
# %matplotlib notebook
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(test_z[:,1],test_z[:,0], test_z[:,2], c=test_labels.astype(int))
# ax.set_xlabel('Encoded')
# ax.set_ylabel('Euclidean')
# ax.set_zlabel('Cosine')
# plt.show()

In [2]:
import pandas as pd
import numpy as np
# Re-read the raw splits and discard the 'ID' column. This repeats the load done in the
# setup cell, so running it rebinds train_df to the unfiltered training data.
train_df = pd.read_csv('./train.csv').drop(columns=['ID'])
val_df = pd.read_csv('./val.csv').drop(columns=['ID'])
test_df = pd.read_csv('./test.csv').drop(columns=['ID'])