In [47]:
from google.colab import drive
# drive.mount('/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data')
drive.mount('/content/drive')

%cd drive/MyDrive/IITP/sohyun/creditcard_prediction/data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive/IITP/sohyun/creditcard_prediction/data'
/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data


In [48]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

EPOCHS = 400
LR = 1e-2
BS = 16384
SEED = 41

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED) # Seed 고정

train_df = pd.read_csv('./train.csv')
train_df = train_df.drop(columns=['ID'])
val_df = pd.read_csv('./val.csv')
val_df = val_df.drop(columns=['ID'])

class MyDataset(Dataset):
    def __init__(self, df, eval_mode):
        self.df = df
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.eval_mode:
            self.x = self.df[index]
            self.y = self.labels[index]
            return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)

train_dataset = MyDataset(df=train_df, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True) #num_workers=6

val_dataset = MyDataset(df = val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False) #num_workers=6

class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(30,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(128,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,30),
        )
        
    def forward(self, x):
        x = self.Encoder(x)
        x = self.Decoder(x)
        return x


class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        
    def fit(self, ):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                loss = self.criterion(x, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

model = nn.DataParallel(AutoEncoder())
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

Epoch : [0] Train loss : [0.5370450190135411] Val Score : [0.0025295113983675546])
Epoch : [1] Train loss : [0.353203011410577] Val Score : [0.11469736766126254])
Epoch : [2] Train loss : [0.2670533231326512] Val Score : [0.30696384850452524])
Epoch : [3] Train loss : [0.2174352662903922] Val Score : [0.3769625130322644])
Epoch : [4] Train loss : [0.18637913465499878] Val Score : [0.44575724386350246])
Epoch : [5] Train loss : [0.16547850412981852] Val Score : [0.4716149901712482])
Epoch : [6] Train loss : [0.1505056279046195] Val Score : [0.4853230261800434])
Epoch : [7] Train loss : [0.14029540973050253] Val Score : [0.49219101905022994])
Epoch : [8] Train loss : [0.1320484642471586] Val Score : [0.4971218579894749])
Epoch : [9] Train loss : [0.12731147131749562] Val Score : [0.500237456548652])
Epoch : [10] Train loss : [0.12142255795853478] Val Score : [0.5019615756952024])
Epoch : [11] Train loss : [0.11733524394886834] Val Score : [0.5039959969348181])
Epoch : [12] Train loss : [

KeyboardInterrupt: ignored

In [None]:
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()

test_df = pd.read_csv('./test.csv')
test_df = test_df.drop(columns=['ID'])

test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False, num_workers=6)

def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in iter(test_loader):
            x = x.float().to(device)
            print(x)
            _x = model(x)
            
            diff = cos(x, _x).cpu().tolist()
            print(diff)
            print(len(diff))
            break
            batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
            pred += batch_pred
    return pred

preds = prediction(model, 0.95, test_loader, device)

  cpuset_checked))


tensor([[-1.3598, -0.0728,  2.5363,  ..., -0.0211,  1.7833, -0.9950],
        [ 1.1919,  0.2662,  0.1665,  ...,  0.0147, -0.2698, -0.9950],
        [-1.1582,  0.8777,  1.5487,  ...,  0.2152,  0.6706, -0.9950],
        ...,
        [ 0.8734, -1.2072,  0.9606,  ...,  0.0467,  1.9982, -0.5605],
        [ 1.1561, -0.2976,  1.1062,  ...,  0.0120, -0.1467, -0.5605],
        [-0.4090,  0.5839,  1.5979,  ...,  0.0677, -0.2269, -0.5604]])
[0.9981145262718201, 0.9989241361618042, 0.9983135461807251, 0.9977964758872986, 0.9977865219116211, 0.9995219707489014, 0.9991946220397949, 0.9988970756530762, 0.997672975063324, 0.9986920952796936, 0.9986172318458557, 0.9981147646903992, 0.9986519813537598, 0.9981908798217773, 0.9982603788375854, 0.9980898499488831, 0.9980898499488831, 0.999238133430481, 0.9982222318649292, 0.9984400272369385, 0.998162031173706, 0.9991767406463623, 0.9980661869049072, 0.9990675449371338, 0.9989785552024841, 0.9983569383621216, 0.9977784752845764, 0.9991705417633057, 0.997244

In [None]:
sum(preds),sum(preds)/len(test_df)

(317, 0.0022245145716230535)

In [49]:
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()

test_df = pd.read_csv('./test.csv')
test_df = test_df.drop(columns=['ID'])

test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False, num_workers=6)

from sklearn.preprocessing import StandardScaler
def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    pred = []
    with torch.no_grad():
        for x in iter(test_loader):
            x = x.float().to(device)
            
            _x = model(x)
            
            # diff = cos(x, _x).cpu().tolist()
            diff = np.linalg.norm(x - _x, axis=1)
            
            scaler = StandardScaler()
            scaled_diff = scaler.fit_transform(diff.reshape(-1,1))
            # diff = preprocessing.normalize(diff)
            
            thr = pd.DataFrame(scaled_diff).quantile(q=0.995)[0]
            thr2 = pd.DataFrame(scaled_diff).quantile(q=0.005)[0]
            batch_pred = np.where(np.logical_or((np.array(scaled_diff)>thr), (np.array(scaled_diff)<thr2)), 1,0).tolist()
            pred += batch_pred
    return pred

preds = prediction(model, 0.95, test_loader, device)

  cpuset_checked))


In [50]:
preds = sum(preds, [])
sum(preds),sum(preds)/len(test_df)

(1428, 0.010020841666491231)

In [51]:
submit = pd.read_csv('./sample_submission.csv')
submit['Class'] = preds
submit.to_csv('./submit_autoencoder_ed.csv', index=False)