In [25]:
import warnings
warnings.simplefilter('ignore')
import math
import pandas as pd
import numpy as np
import sys
import time
import datetime
from contextlib import contextmanager
import logging
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, average_precision_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold,StratifiedGroupKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder
from torch.nn import TransformerDecoder
from torch.nn import LayerNorm
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.optim import lr_scheduler, AdamW
from transformers import get_linear_schedule_with_warmup
import gc
import random
import os
from types import SimpleNamespace
%matplotlib inline
import logging
pd.set_option('display.max_columns', 300)

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = False


In [2]:
debug = False
exp = "003"
if not os.path.exists(f"../out/exp/exp{exp}"):
    os.makedirs(f"../out/exp/exp{exp}")
    os.makedirs(f"../out/exp/exp{exp}/exp{exp}_model")
logger_path = f"../out/exp/exp{exp}/exp_{exp}.txt"
model_path =f"../out/exp/exp{exp}/exp{exp}_model/exp{exp}.pth"
LOGGER = logging.getLogger(__name__)
file_handler = logging.FileHandler(logger_path)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
LOGGER.addHandler(file_handler)

# config
seed = 0
shuffle = True
n_splits = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model config
batch_size = 32
n_epochs = 5
lr = 1e-4
weight_decay = 0.05
num_warmup_steps = 10

In [3]:
train_feature_path = f"../out/fe/fe004/train_feature.npy"
train_target_path = f"../out/fe/fe004/train_target.npy"
val_feature_path = f"../out/fe/fe004/val_feature.npy"
val_target_path = f"../out/fe/fe004/val_target.npy"


train_feature = np.load(train_feature_path)
train_target = np.load(train_target_path)
val_feature = np.load(val_feature_path)
val_target = np.load(val_target_path)

train_feature = train_feature.astype(np.float32)
val_feature = val_feature.astype(np.float32)
train_target = train_target.astype(np.float32)
val_target = val_target.astype(np.float32)

In [4]:
class SwingDataset(Dataset):
    def __init__(self, X, 
                 train = True, y = None):
        self.X = X
        self.train = train
        self.y = y
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.train:
            return self.X[idx], self.y[idx]
        else:
            return self.X[idx]

In [22]:
class SwingGRU(nn.Module):
    def __init__(
        self, dropout=0.2,
        input_dim = 24,
        hidden_dim = 64,
        model_dim = 128,
        out_size = 11
        ):
        super(SwingGRU, self).__init__()
        self.numerical_linear  = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.LayerNorm(hidden_dim)
            )
        
        self.rnn = nn.GRU(hidden_dim, model_dim,
                            num_layers = 2, 
                            batch_first=True,
                            bidirectional=True)
                
        self.linear_out  = nn.Sequential(
                nn.Linear(model_dim * 2, 
                          model_dim),
                nn.LayerNorm(model_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(model_dim, 
                          out_size),
                # nn.Sigmoid(),
        )
        self._reinitialize()
        
    def _reinitialize(self):
        """
        Tensorflow/Keras-like initialization
        """
        for name, p in self.named_parameters():
            if 'rnn' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(p.data)
                elif 'bias_ih' in name:
                    p.data.fill_(0)
                    # Set forget-gate bias to 1
                    n = p.size(0)
                    p.data[(n // 4):(n // 2)].fill_(1)
                elif 'bias_hh' in name:
                    p.data.fill_(0)
    
    def forward(self, numerical_array):
        
        numerical_embedding = self.numerical_linear(numerical_array)
        output,_ = self.rnn(numerical_embedding)
        # last = output[:, -1, :]
        last = torch.mean(output, dim=1)
        output = self.linear_out(last)
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=1000):
        super().__init__()
        
        # Create sinusoidal positional encoding
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        
        pe = torch.zeros(1, max_len, d_model)
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        
        # Register as buffer (not a parameter but should be saved and loaded with the model)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # x: [batch_size, seq_len, d_model]
        x = x + self.pe[:, :x.size(1), :]
        return x

class EncoderOnlyClassifier(nn.Module):
    def __init__(self, input_dim=24, n_enc=2, nhead=8, d_model=64, max_seq_len=1000):
        super().__init__()
        # Initialize Transformer model
        self.input_proj = nn.Sequential(
            nn.Linear(input_dim, d_model),
        )
        
        # Positional encoding
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=d_model * 4,
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_enc)
        self.classifier = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 11),
            # nn.Sigmoid(),
        )

    def forward(self, src):
        # Project input to d_model dimension
        x = self.input_proj(src)  # -> (batch_size, seq_len, d_model)
        
        # Add positional encoding
        # x = self.pos_encoder(x)
        
        # Pass through transformer encoder
        memory = self.encoder(x)
        
        # Use the last time-step from encoder output
        # last = torch.mean(memory, dim=1)
        last = memory[:, -1, :]  # shape: (batch_size, d_model)
        logits = self.classifier(last)  # shape: (batch_size, 11)
        
        return logits



In [6]:
def to_numpy(p: torch.Tensor):
    if p.requires_grad:
        return p.detach().cpu().numpy()
    else:
        return p.cpu().numpy()

def metric_report(y_batch, out_batch):
    cut = [0, 2, 4, 7, 11]
    classes = ['gender', 'hand', 'year', 'level']
    for start, end, cls in zip(cut, cut[1:], classes):
        micro_roc_score = roc_auc_score(y_batch[:, start:end], out_batch[:, start:end], average='micro', multi_class='ovr')
        macro_roc_score = roc_auc_score(y_batch[:, start:end], out_batch[:, start:end], average='macro', multi_class='ovr')
        micro_presicion_score = average_precision_score(y_batch[:, start:end], out_batch[:, start:end], average='micro')
        macro_presicion_score = average_precision_score(y_batch[:, start:end], out_batch[:, start:end], average='macro')
        
        print(f"{cls} micro roc: {micro_roc_score:.4f}, macro roc: {macro_roc_score:.4f}, micro presci: {micro_presicion_score:.4f}, macro presci: {macro_presicion_score:.4f}")


In [14]:
    #AUC SCORE: 0.792(gender) + 0.998(hold) + 0.660(years) + 0.822(levels)


In [7]:
def get_loss(loss_tpye, class_weights):
    criterions = []
    cut = [0, 2, 4, 7, 11]
    classes = ['gender', 'hand', 'year', 'level']

    for start, end in zip(cut, cut[1:]):
        partial_weights = torch.tensor(class_weights[start:end]).to(device)
        if loss_tpye == 'CE':
            criterion = nn.CrossEntropyLoss(weight=partial_weights)
        elif loss_tpye == 'BCE':
            criterion = nn.BCEWithLogitsLoss(weight=partial_weights)
        else:
            raise ValueError(f"no such loss {loss_tpye}")
        criterions.append(criterion)
    
    return criterions

In [27]:
config = SimpleNamespace(
    batch_size = 32,
    n_epochs = 10,
    lr = 1e-5,
    weight_decay = 0.05,
    main_loss_weight = 0.6,
    loss_type = 'BCE',
    model_type = 'gru',
)

In [26]:
batch_size = 32
n_epochs = 10
lr = 1e-5
weight_decay = 0.05
num_warmup_steps = 10
main_loss_weight = 4/11
loss_type = 'BCE'

set_seed(42)
cut = [0, 2, 4, 7, 11]
classes = ['gender', 'hand', 'year', 'level']

train_weights = 1 / np.sum(train_target, axis=0)
final_weights = np.zeros((11,))
for start, end, cls in zip(cut, cut[1:], classes):
    class_weights = train_weights[start:end] / np.sum(train_weights[start:end])
    final_weights[start:end] = class_weights
    print(f"{cls} weights: {class_weights}")

train_ds = SwingDataset(train_feature, train=True, y = train_target)
val_ds = SwingDataset(val_feature, train=True, y = val_target)
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
val_loader = DataLoader(val_ds, batch_size=batch_size, pin_memory=True, num_workers=4)

# model = SwingGRU()
model = EncoderOnlyClassifier()
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters,
                    lr=lr,
                    weight_decay=weight_decay,
                    )

num_train_optimization_steps = int(len(train_loader) * n_epochs)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=num_warmup_steps,
                                            num_training_steps=num_train_optimization_steps)

criterions = get_loss(loss_type, final_weights)

best_loss = 9999.9
early_stop_cnt = 0

for epoch in range(n_epochs):
    train_out_batch = []
    val_out_batch = []
    train_y_batch = []
    val_y_batch = []
    train_loss = 0.0
    train_main_loss = 0.0
    train_aux_loss = 0.0
    val_loss = 0.0
    val_main_loss = 0.0
    val_aux_loss = 0.0
    
    model.train() 
    pbar = tqdm(train_loader, total=len(train_loader), leave = False)
    for d in pbar:
        X, y = d
        b, t, c = X.size()
        X, y = X.to(device), y.to(device)
        
        optimizer.zero_grad()
        
        out = model(X)
        
        aux_loss = 0
        main_loss = 0
        aux_loss += criterions[0](out[:, :2], y[:, :2])
        aux_loss += criterions[1](out[:, 2:4], y[:, 2:4])
        aux_loss += criterions[2](out[:, 4:7], y[:, 4:7])
        main_loss += criterions[3](out[:, 7:], y[:, 7:])
        loss = (1 - main_loss_weight) * aux_loss + main_loss_weight * main_loss
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        if loss_type == 'CE':
            train_out_batch.append(to_numpy(F.softmax(out)))
        elif loss_type == 'BCE':
            train_out_batch.append(to_numpy(F.sigmoid(out)))

        train_y_batch.append(to_numpy(y))
        train_loss += loss.item()
        train_main_loss += main_loss.item()
        train_aux_loss += aux_loss.item()
    
    train_out_batch = np.concatenate(train_out_batch)
    train_y_batch = np.concatenate(train_y_batch)
    
    train_loss = train_loss/len(train_loader)
    train_main_loss = train_main_loss/len(train_loader)
    train_aux_loss = train_aux_loss/len(train_loader)
    print(f"Train loss: {train_loss:.4f} Aux loss: {train_aux_loss:.4f} Main loss: {train_main_loss:.4f}")
    
    model.eval()
    pbar = tqdm(val_loader, total=len(val_loader), leave = False)
    with torch.no_grad():
        for d in pbar:
            X, y = d
            X, y = X.to(device), y.to(device)
            
            out = model(X)
            aux_loss = 0
            main_loss = 0
            aux_loss += criterions[0](out[:, :2], y[:, :2])
            aux_loss += criterions[1](out[:, 2:4], y[:, 2:4])
            aux_loss += criterions[2](out[:, 4:7], y[:, 4:7])
            main_loss += criterions[3](out[:, 7:], y[:, 7:])
            loss = (1 - main_loss_weight) * aux_loss + main_loss_weight * main_loss

            if loss_type == 'CE':
                val_out_batch.append(to_numpy(F.softmax(out)))
            elif loss_type == 'BCE':
                val_out_batch.append(to_numpy(F.sigmoid(out)))

            val_y_batch.append(to_numpy(y))
            val_loss += loss.item()
            val_main_loss += main_loss.item()
            val_aux_loss += aux_loss.item()


    val_out_batch = np.concatenate(val_out_batch)
    val_y_batch = np.concatenate(val_y_batch)

    val_loss = val_loss/len(val_loader)
    val_main_loss = val_main_loss/len(val_loader)
    val_aux_loss = val_aux_loss/len(val_loader)
    print(f"Val loss: {val_loss:.4f} Aux loss: {val_aux_loss:.4f} Main loss: {val_main_loss:.4f}")
    metric_report(val_y_batch, val_out_batch)
    
    if val_main_loss < best_loss:
        best_loss = val_main_loss
        print(f"✨ best Val loss at epoch {epoch}, loss {val_main_loss:.4f}")
        early_stop_cnt = 0
    else:
        early_stop_cnt += 1
    
    if early_stop_cnt == 3:
        print(f"No imporvement for {early_stop_cnt} Epochs, stopping")
        break


gender weights: [0.1936848 0.8063152]
hand weights: [0.13666478 0.8633352 ]
year weights: [0.43455723 0.3082604  0.2571824 ]
level weights: [0.08001771 0.31609336 0.53850025 0.06538874]


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.5408 Aux loss: 0.7710 Main loss: 0.1380


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.5481 Aux loss: 0.7890 Main loss: 0.1265
gender micro roc: 0.9290, macro roc: 0.4963, micro presci: 0.9167, macro presci: 0.5001
hand micro roc: 0.6158, macro roc: 0.5559, micro presci: 0.5721, macro presci: 0.5305
year micro roc: 0.6291, macro roc: nan, micro presci: 0.4222, macro presci: 0.3311
level micro roc: 0.6351, macro roc: 0.5175, micro presci: 0.3393, macro presci: 0.2554
✨ best Val loss at epoch 0, loss 0.1265


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4700 Aux loss: 0.6801 Main loss: 0.1023


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.5372 Aux loss: 0.7768 Main loss: 0.1179
gender micro roc: 0.9374, macro roc: 0.5327, micro presci: 0.9247, macro presci: 0.5050
hand micro roc: 0.6366, macro roc: 0.5944, micro presci: 0.5917, macro presci: 0.5596
year micro roc: 0.6147, macro roc: nan, micro presci: 0.3963, macro presci: 0.3336
level micro roc: 0.6617, macro roc: 0.5184, micro presci: 0.3662, macro presci: 0.2575
✨ best Val loss at epoch 1, loss 0.1179


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4526 Aux loss: 0.6582 Main loss: 0.0927


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.5310 Aux loss: 0.7680 Main loss: 0.1163
gender micro roc: 0.9416, macro roc: 0.5619, micro presci: 0.9294, macro presci: 0.5097
hand micro roc: 0.6530, macro roc: 0.6280, micro presci: 0.6056, macro presci: 0.5876
year micro roc: 0.6134, macro roc: nan, micro presci: 0.3914, macro presci: 0.3364
level micro roc: 0.6688, macro roc: 0.5201, micro presci: 0.3742, macro presci: 0.2597
✨ best Val loss at epoch 2, loss 0.1163


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4421 Aux loss: 0.6439 Main loss: 0.0891


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.5229 Aux loss: 0.7555 Main loss: 0.1159
gender micro roc: 0.9446, macro roc: 0.5851, micro presci: 0.9333, macro presci: 0.5139
hand micro roc: 0.6686, macro roc: 0.6600, micro presci: 0.6192, macro presci: 0.6178
year micro roc: 0.6123, macro roc: nan, micro presci: 0.3920, macro presci: 0.3397
level micro roc: 0.6722, macro roc: 0.5220, micro presci: 0.3780, macro presci: 0.2619
✨ best Val loss at epoch 3, loss 0.1159


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4323 Aux loss: 0.6293 Main loss: 0.0876


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.5169 Aux loss: 0.7460 Main loss: 0.1160
gender micro roc: 0.9475, macro roc: 0.6093, micro presci: 0.9379, macro presci: 0.5187
hand micro roc: 0.6825, macro roc: 0.6887, micro presci: 0.6329, macro presci: 0.6491
year micro roc: 0.6066, macro roc: nan, micro presci: 0.3903, macro presci: 0.3437
level micro roc: 0.6746, macro roc: 0.5247, micro presci: 0.3814, macro presci: 0.2651


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4215 Aux loss: 0.6127 Main loss: 0.0870


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.5085 Aux loss: 0.7329 Main loss: 0.1160
gender micro roc: 0.9502, macro roc: 0.6322, micro presci: 0.9423, macro presci: 0.5235
hand micro roc: 0.6954, macro roc: 0.7151, micro presci: 0.6470, macro presci: 0.6820
year micro roc: 0.6107, macro roc: nan, micro presci: 0.3986, macro presci: 0.3484
level micro roc: 0.6764, macro roc: 0.5292, micro presci: 0.3841, macro presci: 0.2693


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4107 Aux loss: 0.5956 Main loss: 0.0869


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.4977 Aux loss: 0.7160 Main loss: 0.1157
gender micro roc: 0.9523, macro roc: 0.6498, micro presci: 0.9459, macro presci: 0.5271
hand micro roc: 0.7070, macro roc: 0.7382, micro presci: 0.6606, macro presci: 0.7157
year micro roc: 0.6196, macro roc: nan, micro presci: 0.4126, macro presci: 0.3527
level micro roc: 0.6782, macro roc: 0.5348, micro presci: 0.3859, macro presci: 0.2733
✨ best Val loss at epoch 6, loss 0.1157


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.4024 Aux loss: 0.5825 Main loss: 0.0871


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.4933 Aux loss: 0.7092 Main loss: 0.1156
gender micro roc: 0.9541, macro roc: 0.6653, micro presci: 0.9491, macro presci: 0.5305
hand micro roc: 0.7151, macro roc: 0.7535, micro presci: 0.6696, macro presci: 0.7365
year micro roc: 0.6194, macro roc: nan, micro presci: 0.4168, macro presci: 0.3556
level micro roc: 0.6798, macro roc: 0.5392, micro presci: 0.3877, macro presci: 0.2767
✨ best Val loss at epoch 7, loss 0.1156


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.3959 Aux loss: 0.5723 Main loss: 0.0870


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.4902 Aux loss: 0.7043 Main loss: 0.1155
gender micro roc: 0.9550, macro roc: 0.6740, micro presci: 0.9509, macro presci: 0.5323
hand micro roc: 0.7203, macro roc: 0.7629, micro presci: 0.6756, macro presci: 0.7486
year micro roc: 0.6180, macro roc: nan, micro presci: 0.4183, macro presci: 0.3573
level micro roc: 0.6809, macro roc: 0.5420, micro presci: 0.3890, macro presci: 0.2790
✨ best Val loss at epoch 8, loss 0.1155


  0%|          | 0/444 [00:00<?, ?it/s]

Train loss: 0.3929 Aux loss: 0.5677 Main loss: 0.0870


  0%|          | 0/108 [00:00<?, ?it/s]

Val loss: 0.4899 Aux loss: 0.7038 Main loss: 0.1155
gender micro roc: 0.9553, macro roc: 0.6770, micro presci: 0.9515, macro presci: 0.5330
hand micro roc: 0.7219, macro roc: 0.7656, micro presci: 0.6773, macro presci: 0.7517
year micro roc: 0.6177, macro roc: nan, micro presci: 0.4189, macro presci: 0.3579
level micro roc: 0.6813, macro roc: 0.5429, micro presci: 0.3896, macro presci: 0.2798
✨ best Val loss at epoch 9, loss 0.1155


gru BCE main weight 0.6
gender micro roc: 0.8772, macro roc: 0.5145, micro presci: 0.8666, macro presci: 0.5027
hand micro roc: 0.9836, macro roc: 0.9998, micro presci: 0.9860, macro presci: 0.9999
year micro roc: 0.7048, macro roc: nan, micro presci: 0.5178, macro presci: 0.3901
level micro roc: 0.7452, macro roc: 0.6534, micro presci: 0.5440, macro presci: 0.4111
✨ best Val loss at epoch 9, loss 0.1065

gru CE main weight 0.6
gender micro roc: 0.7335, macro roc: 0.7485, micro presci: 0.7244, macro presci: 0.5415
hand micro roc: 0.9855, macro roc: 0.9858, micro presci: 0.9852, macro presci: 0.9852
year micro roc: 0.6026, macro roc: nan, micro presci: 0.4718, macro presci: 0.3886
level micro roc: 0.5973, macro roc: 0.6532, micro presci: 0.3167, macro presci: 0.4098
✨ best Val loss at epoch 0, loss 0.2248

encoder CE main weight 0.6
gender micro roc: 0.4246, macro roc: 0.5286, micro presci: 0.4889, macro presci: 0.5041
hand micro roc: 0.5058, macro roc: 0.4927, micro presci: 0.5479, macro presci: 0.5466
year micro roc: 0.5587, macro roc: nan, micro presci: 0.3985, macro presci: 0.3384
level micro roc: 0.5122, macro roc: 0.4933, micro presci: 0.2539, macro presci: 0.2658
✨ best Val loss at epoch 0, loss 0.2379

encoder BCE main weight 0.6
gender micro roc: 0.9370, macro roc: 0.5172, micro presci: 0.9355, macro presci: 0.5067
hand micro roc: 0.5816, macro roc: 0.4799, micro presci: 0.5629, macro presci: 0.5166
year micro roc: 0.6991, macro roc: nan, micro presci: 0.4950, macro presci: 0.3312
level micro roc: 0.6710, macro roc: 0.4723, micro presci: 0.3606, macro presci: 0.2458
✨ best Val loss at epoch 0, loss 0.1142
