In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import wandb
from sklearn.model_selection import train_test_split, StratifiedKFold
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

from model import CustomModel

In [2]:
def seed_everything(seed) :
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch
    (CPU, current CUDA device and all CUDA devices), then switches cuDNN
    to its deterministic mode with benchmarking turned off.

    Args:
        seed: integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Locations of the input CSV files.
train_path = "./dataset/train.csv"
test_path = "./dataset/test.csv"

# Global seed and mini-batch size used by the rest of the notebook.
seed = 2021
batch_size = 32

seed_everything(seed)

In [3]:
# Load the training table and split off the target column.
train_df = pd.read_csv(train_path)
y_train = train_df["quality"]

# Remove the target and the row identifier so that only features remain.
# A non-mutating drop keeps the freshly loaded frame's lineage explicit.
train_df = train_df.drop(columns=["quality", "id"])

# Final feature order: every remaining column except the categorical "type",
# followed by its integer encoding "type_int" (created in a later cell).
# "type" is removed by name rather than by position, so the list stays
# correct even when "type" is not the last column of the CSV.
column_list = [col for col in train_df.columns if col != "type"]
column_list.append("type_int")

In [4]:
# Keep the raw categorical column aside; it cannot be z-scored.
train_type = train_df["type"]

# Standardise only the numeric features (zero mean, unit sample std).
# Calling mean()/std() on the full frame would include the string column
# "type", which raises TypeError on pandas >= 2.0 where numeric_only
# defaults to False.
numeric_df = train_df.select_dtypes(include="number")
train_df = (numeric_df - numeric_df.mean()) / numeric_df.std()

# Encode the wine type as -1 for "white" and +1 for anything else.
train_df["type_int"] = train_type.map(lambda x : -1 if x == "white" else 1)

# Enforce the feature order fixed by column_list.
train_df = train_df[column_list]
train_df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,type_int
0,0.613069,-0.973720,0.346267,-0.862693,-0.444824,-0.896278,-0.363288,-1.135394,-0.952385,0.524733,1.342665,-1
1,0.227116,0.274060,-0.202226,-0.115200,-0.976715,0.426538,-0.058404,-1.443214,-0.828832,-0.343750,2.096552,-1
2,-0.853552,-0.412219,0.072020,-0.675819,-0.976715,-0.263627,0.013333,-1.986812,-0.581725,-1.546265,1.677726,-1
3,-0.621981,-0.100274,1.237568,1.089095,-0.248864,1.519300,1.609486,0.698430,-0.828832,0.457927,-0.918995,-1
4,0.921832,0.211671,0.757636,-0.655056,0.199044,-1.068820,-1.744229,0.272721,-0.272842,1.860860,0.588779,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3226,-0.621981,-1.098498,-0.065103,-0.032145,-0.360841,0.656593,0.730705,-0.038374,0.839138,-0.410556,0.002422,-1
3227,-0.621981,-0.973720,-0.270788,2.407589,-0.248864,1.289245,0.407887,0.842516,0.221371,-1.212233,0.002422,-1
3228,-0.930743,-0.349830,-0.476472,-0.758874,-0.388835,-1.298875,-0.148076,-0.578697,-0.828832,0.123895,-1.254056,-1
3229,0.381498,-0.349830,-0.202226,0.300074,-0.416830,-0.091086,0.838311,0.174480,1.395128,0.457927,0.002422,-1


In [5]:
# Shift the labels down by 4 so they can serve as class indices.
# NOTE(review): presumably the lowest quality grade in the data is 4, which
# would make the classes start at 0 - confirm against the CSV.
# Not idempotent: re-running this cell shifts the labels again.
y_train = y_train.sub(4)

In [6]:
# Plain NumPy views of the features and labels, as consumed by
# StratifiedKFold.split and CustomDataset further down.
X = train_df.to_numpy()
y = y_train.to_numpy()

In [7]:
class CustomDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Args:
        x: indexable collection of feature rows (e.g. a 2-D NumPy array).
        y: indexable collection of class labels, aligned with ``x``.
    """

    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        Features are returned as float32. The label is cast to int64
        because ``F.nll_loss`` (used by the loss in this notebook) only
        accepts long class indices; without the cast an int32 label array
        would produce int32 targets and fail inside the loss.
        """
        features = torch.tensor(self.x[idx]).float()
        label = torch.tensor(self.y[idx]).long()
        return features, label

    def __len__(self):
        """Number of samples (length of the first axis of ``x``)."""
        return len(self.x)

In [8]:
class FocalLoss(nn.Module) :
    """Multi-class focal loss computed from raw logits.

    Each class log-probability ``log(p)`` is scaled by ``(1 - p) ** gamma``
    before being handed to ``F.nll_loss``, so confidently classified
    samples contribute less. With ``gamma = 0`` the scaling factor is 1 and
    the result is ordinary cross-entropy.

    Args:
        weight: optional per-class weight tensor forwarded to ``F.nll_loss``.
        gamma: exponent of the ``(1 - p)`` modulating factor.
        reduction: reduction mode forwarded to ``F.nll_loss``.
    """

    def __init__(self, weight = None, gamma = 2., reduction = 'mean') :
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction

    def forward(self, input_tensor, target_tensor) :
        """Return the focal loss of logits ``input_tensor`` against class indices ``target_tensor``."""
        log_p = F.log_softmax(input_tensor, dim = -1)   # log(p) per class
        p = log_p.exp()                                 # p per class
        modulating = (1 - p).pow(self.gamma)            # down-weights easy samples
        # nll_loss picks the target-class entry and negates it, i.e.
        # -(1 - p_t) ** gamma * log(p_t) per sample before reduction.
        return F.nll_loss(
            modulating * log_p,
            target_tensor,
            weight = self.weight,
            reduction = self.reduction,
        )

In [9]:
def evaluate(model, dataloader):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to eval mode (and left there) and gradients are
    disabled while iterating. The predicted class is the argmax over
    dimension 1 of the model output.

    Args:
        model: callable module mapping a feature batch to per-class scores.
        dataloader: iterable yielding ``(features, labels)`` batches.

    Returns:
        Fraction of correctly classified samples, as a 0-d tensor.
    """
    model.eval()
    n_seen = 0
    n_hit = 0
    with torch.no_grad():
        for features, labels in dataloader:
            predicted = model(features).argmax(dim=1)
            hits = predicted.eq(labels).int()
            n_seen += hits.size(0)
            n_hit += hits.sum()
    return n_hit / n_seen

In [10]:
# Stratified 5-fold cross-validation: for every fold a fresh model is trained
# for `epochs` epochs, evaluated on the train and validation splits after each
# epoch, checkpointed, and logged to its own Weights & Biases run.
epochs = 100

config = {"epoch":epochs, "batch_size":batch_size}

stratified_kfold = StratifiedKFold(n_splits=5)

# fold_num is kept as a module-level counter (it ends at the number of folds)
# so that any later cell relying on it keeps working.
fold_num = 0
# test_idx holds the indices of this fold's validation split.
for train_idx, test_idx in stratified_kfold.split(X, y):
    # NOTE(review): presumably CustomModel(12, 5) means 12 input features and
    # 5 output classes - confirm against model.py.
    model = CustomModel(12 ,5)
    model.init_weights()
    criterion = FocalLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Per-fold checkpoint directory; created up front because torch.save
    # does not create missing parent directories.
    PATH = f"fold{fold_num}"
    os.makedirs(PATH, exist_ok=True)
    wandb.init(project="wine_quality_classification", config=config, name=f"fold_{fold_num}")
    train_x, val_x, train_y, val_y = X[train_idx], X[test_idx], y[train_idx], y[test_idx]

    train_dataset = CustomDataset(train_x, train_y)
    val_dataset = CustomDataset(val_x, val_y)

    # Shuffled loader for optimisation; unshuffled loaders for evaluation.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

    for epoch in tqdm(range(epochs)):
        # evaluate() leaves the model in eval mode, so training mode is
        # re-enabled once at the start of every epoch.
        model.train()
        epoch_loss = 0.0
        for batch_x, batch_y in train_dataloader:
            preds = model(batch_x)
            loss = criterion(preds, batch_y)
            optimizer.zero_grad()
            loss.backward()

            optimizer.step()
            # Accumulate a plain Python float: adding the loss tensor itself
            # would keep autograd history alive across the whole epoch and
            # pickle a graph-attached tensor into every checkpoint.
            epoch_loss += loss.item()
        train_acc = evaluate(model, eval_train_dataloader)
        val_acc = evaluate(model, val_dataloader)
        torch.save({"model_state_dict": model.state_dict(), "loss":epoch_loss}, f"{PATH}/model_{epoch}.pt")
        wandb.log({"train_accuracy": train_acc, "validation_accuracy": val_acc ,"loss": epoch_loss})
    # Close this fold's run explicitly so the final fold is not left open.
    wandb.finish()
    fold_num += 1

[34m[1mwandb[0m: Currently logged in as: [33mai_esg[0m (use `wandb login --relogin` to force relogin)


100%|██████████| 100/100 [00:33<00:00,  2.98it/s]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,█▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁
train_accuracy,▂▁▂▃▃▃▃▃▃▃▃▃▃▃▄▅▄▅▅▆▅▅▅▆▆▆▆▆▅▆▇▇▇▇▇█▆███
validation_accuracy,▃▁▅▆▅▇▇▂▃▃▆▄▆▂▃▅▅▄▃▅▄▃▇▇▇██▇▃▅▄▇▆▄▆▇▅▇█▆

0,1
loss,14.91979
train_accuracy,0.81772
validation_accuracy,0.54096


100%|██████████| 100/100 [00:25<00:00,  3.93it/s]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,█▆▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁
train_accuracy,▁▁▂▂▁▂▂▂▃▃▄▃▃▃▄▅▄▅▄▅▅▆▅▆▆▅▇▆▅▆▆█▆████▇█▇
validation_accuracy,▃▂▅▅▁▅▅▅▅▇▇▇▄▃▅█▇█▆▆▆▆█▇▅▄▆▃▇▂▇▇▅▇█▇▇▆▅▅

0,1
loss,11.6805
train_accuracy,0.80735
validation_accuracy,0.56347


100%|██████████| 100/100 [00:25<00:00,  3.90it/s]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,█▆▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▃▃▂▂▂▃▂▂▂▂▂▂▂▁▁▁▂▁▁▁
train_accuracy,▁▁▂▂▃▃▃▃▃▃▃▃▄▃▄▄▄▅▅▆▅▆▆▅▆▆▇▆▅▅▇▇▇▆█▇█▇██
validation_accuracy,▇▂▄▇▅▃▆▄▆▆▂▁▆▂▆▅▆▅▅▆▁█▇▅▅▅▆▄▁▄▆▇█▅▆▅▃█▇█

0,1
loss,11.69016
train_accuracy,0.81547
validation_accuracy,0.55263


100%|██████████| 100/100 [00:25<00:00,  3.92it/s]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,█▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁
train_accuracy,▁▁▁▁▂▂▂▂▃▂▃▃▃▃▃▄▄▄▄▄▅▄▄▄▅▆▆▆▅▆▆▅▆▇▆▇▇▇▇█
validation_accuracy,▅▄▂▃▄▃▆▃▆▄▄▅▆▃▇▆▃▄▄▇▇▁▅▃█▄▅▆▃▄▅▄▁▅▅▃█▅▅▆

0,1
loss,11.86505
train_accuracy,0.86499
validation_accuracy,0.56347


100%|██████████| 100/100 [00:25<00:00,  3.86it/s]
