In [1]:
import warnings
warnings.simplefilter('ignore')
import math
import pandas as pd
import numpy as np
import sys
import time
import datetime
from contextlib import contextmanager
import logging
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, average_precision_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold,StratifiedGroupKFold
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder
from torch.nn import TransformerDecoder
from torch.nn import LayerNorm
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.optim import lr_scheduler
from transformers import AdamW, get_linear_schedule_with_warmup
import gc
import random
import os
%matplotlib inline
import logging
pd.set_option('display.max_columns', 300)

def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = False


In [2]:
debug = False
exp = "001"
if not os.path.exists(f"../out/exp/exp{exp}"):
    os.makedirs(f"../out/exp/exp{exp}")
    os.makedirs(f"../out/exp/exp{exp}/exp{exp}_model")
logger_path = f"../out/exp/exp{exp}/exp_{exp}.txt"
model_path =f"../out/exp/exp{exp}/exp{exp}_model/exp{exp}.pth"
LOGGER = logging.getLogger(__name__)
file_handler = logging.FileHandler(logger_path)
file_handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(formatter)
LOGGER.addHandler(file_handler)

# config
seed = 0
shuffle = True
n_splits = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model config
batch_size = 24
n_epochs = 5
lr = 1e-3
weight_decay = 0.05
num_warmup_steps = 10

In [3]:
id_path = f"../out/fe/fe001/id_list.npy"
player_path = f"../out/fe/fe001/player_list.npy"
feature_arr_path = f"../out/fe/fe001/feature_arr.npy"
target_arr_path = f"../out/fe/fe001/target_arr.npy"
mask_arr_path = f"../out/fe/fe001/mask_arr.npy"
target_mask_arr_path = f"../out/fe/fe001/target_mask_arr.npy"

In [4]:
feature_arr = np.load(feature_arr_path)
target_arr = np.load(target_arr_path)
mask_arr = np.load(mask_arr_path)
target_mask_arr =np.load(target_mask_arr_path)
id_list = np.load(id_path)
player_list = np.load(player_path)

In [5]:
print(feature_arr.shape)
print(target_arr.shape)
print(mask_arr.shape)
print(target_mask_arr.shape)
print(id_list.shape)
print(player_list.shape)
player_list[:100]

(18361, 500, 24)
(18361, 500, 11)
(18361, 500)
(18361, 500, 11)
(18361,)
(18361,)


array([41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41,
       41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41])

In [6]:
# early mix (current)
# mid mix
# late mix (patchTST)
# patch transfromer (GRU + transformer), (transformer + GRU) ?

In [6]:
class SwingDataset(Dataset):
    def __init__(self, feature_arr, 
                 mask_arr,
                 train = True, y = None, target_mask = None):
        self.feature_arr = feature_arr
        self.mask_arr = mask_arr
        self.train = train
        self.y = y
        self.target_mask = target_mask
    
    def __len__(self):
        return len(self.feature_arr)

    def __getitem__(self, idx):
        attention_mask = self.mask_arr[idx] == 0

        if self.train : 
            return {
              'feature_arr': torch.tensor(self.feature_arr[idx],dtype=torch.float32),
              'mask_arr':torch.tensor(self.mask_arr[idx], dtype=torch.long),  
            #   'attention_mask': torch.tensor(attention_mask, dtype=torch.bool),
              "y":torch.tensor(self.y[idx], dtype=torch.long)
               }
        else:
            return {
              'feature_arr': torch.tensor(self.feature_arr[idx],dtype=torch.float32),
              'mask_arr':torch.tensor(self.mask_arr[idx], dtype=torch.long),  
            #   'attention_mask': torch.tensor(attention_mask, dtype=torch.bool),
               }


In [7]:
class SwingGRU(nn.Module):
    def __init__(
        self, dropout=0.2,
        input_numerical_size = 24,
        numeraical_linear_size = 64,
        model_size = 128,
        linear_out = 128,
        out_size=11):
        super(SwingGRU, self).__init__()
        self.numerical_linear  = nn.Sequential(
                nn.Linear(input_numerical_size, numeraical_linear_size),
                nn.LayerNorm(numeraical_linear_size)
            )
        
        self.rnn = nn.GRU(numeraical_linear_size, model_size,
                            num_layers = 2, 
                            batch_first=True,
                            bidirectional=True)
        
        
                
        self.linear_out  = nn.Sequential(
                nn.Linear(model_size*2, 
                          linear_out),
                nn.LayerNorm(linear_out),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(linear_out, 
                          out_size))
        self._reinitialize()
        
    def _reinitialize(self):
        """
        Tensorflow/Keras-like initialization
        """
        for name, p in self.named_parameters():
            if 'rnn' in name:
                if 'weight_ih' in name:
                    nn.init.xavier_uniform_(p.data)
                elif 'weight_hh' in name:
                    nn.init.orthogonal_(p.data)
                elif 'bias_ih' in name:
                    p.data.fill_(0)
                    # Set forget-gate bias to 1
                    n = p.size(0)
                    p.data[(n // 4):(n // 2)].fill_(1)
                elif 'bias_hh' in name:
                    p.data.fill_(0)
    
    def forward(self, numerical_array):
        
        numerical_embedding = self.numerical_linear(numerical_array)
        output,_ = self.rnn(numerical_embedding)
        output = self.linear_out(output[:, -1, :])
        return output


In [8]:
@contextmanager
def timer(name):
    t0 = time.time()
    yield 
    LOGGER.info(f'[{name}] done in {time.time() - t0:.0f} s')


In [9]:
# group by level
target_labels = target_arr[:, 0, -4:]
target_labels = np.argmax(target_labels, axis=1)
# group by year
# target_labels = target_arr[:, 0, 4:7]
# target_labels = np.argmax(target_labels, axis=1)


In [10]:
gkf = StratifiedGroupKFold(n_splits=5,shuffle=True,random_state = seed)
iterator = gkf.split(feature_arr, y = target_labels, groups= player_list)

In [11]:
train_ds = SwingDataset(feature_arr, mask_arr, train=True, y = target_arr, target_mask=target_mask_arr)
train_loader = DataLoader(train_ds, batch_size=2, pin_memory=True, num_workers=4)
X = next(iter(train_loader))

In [12]:
print(X.keys())
print(X['feature_arr'].shape)
print(X['mask_arr'].shape)
# print(X['attention_mask'].shape)
# print(X['target_mask'].shape)
print(X['y'].shape)

dict_keys(['feature_arr', 'mask_arr', 'y'])
torch.Size([2, 500, 24])
torch.Size([2, 500])
torch.Size([2, 500, 11])


In [14]:
(0.62496989 - 0.5) * 4 + 0.5

0.9998795600000001

In [13]:
model = SwingGRU()
X = torch.randn((2, 100, 24))
y = model(X)
y.shape


torch.Size([2, 11])

In [14]:
d = next(iter(train_loader))

In [15]:
X, X_mask, y = d['feature_arr'], d['mask_arr'], d['y']

In [18]:
X, X_mask, y = d['feature_arr'], d['mask_arr'], d['y']
criterion = nn.BCEWithLogitsLoss()
output = model(X)
loss = criterion(output[:, 100:], y[:, 100:].float())
print(loss)
prob = F.sigmoid(output).flatten(end_dim=1).detach().numpy()
y = y.flatten(end_dim=1).detach().numpy()
print(y[:10, -4:])
print(prob[:10, -4:])

level_score = roc_auc_score(y[:, -4:], prob[:, -4:], average='micro', multi_class='ovr')
print(level_score)

tensor(0.7225, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
[[0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]]
[[0.6139204  0.48410693 0.461842   0.65950394]
 [0.629697   0.53958386 0.50846887 0.55952424]
 [0.5945234  0.5363443  0.5661934  0.5703213 ]
 [0.624775   0.5706702  0.44688225 0.5274387 ]
 [0.6150585  0.4953904  0.5827365  0.59241873]
 [0.7358353  0.6104891  0.4725673  0.5590229 ]
 [0.60574275 0.5496781  0.50589293 0.55996925]
 [0.64458054 0.5704017  0.57075834 0.43976256]
 [0.67575556 0.5435472  0.4838261  0.5121095 ]
 [0.5768638  0.480682   0.4759126  0.4649096 ]]
0.5362476666666667


In [None]:
with timer('TTGRU'):
    for fold, (train_idx, val_idx) in enumerate(iterator):
        LOGGER.info(f"start fold:{fold}, train size: {len(train_idx)}, val size: {len(val_idx)}")
        with timer(f"fold {fold}"):
            
            train_feature = feature_arr[train_idx]
            train_target = target_arr[train_idx]
            train_mask = mask_arr[train_idx]
            
            val_feature = feature_arr[val_idx]
            val_target = target_arr[val_idx]
            val_mask = mask_arr[val_idx]

            train_ds = SwingDataset(train_feature, train_mask, train=True, y = train_target)
            val_ds = SwingDataset(val_feature, val_mask, train=True, y = val_target)
            train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)
            val_loader = DataLoader(val_ds, batch_size=batch_size, pin_memory=True, num_workers=4)
            
            
            model = SwingGRU()
            model = model.to(device)
            param_optimizer = list(model.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': weight_decay},
                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
            optimizer = AdamW(optimizer_grouped_parameters,
                              lr=lr,
                              weight_decay=weight_decay,
                              )
            num_train_optimization_steps = int(len(train_loader) * n_epochs)
            scheduler = get_linear_schedule_with_warmup(optimizer,
                                                        num_warmup_steps=num_warmup_steps,
                                                        num_training_steps=num_train_optimization_steps)
            criterion = nn.BCEWithLogitsLoss()
            best_val_score = 0
            
            for epoch in range(n_epochs):                
                model.train() 
                train_losses_batch = []
                train_score_batch = []
                val_losses_batch = []
                val_score_batch = []
                epoch_loss = 0
                
                pbar = tqdm(train_loader, total=len(train_loader), leave = False)
                for d in pbar:
                    X, X_mask, y = d['feature_arr'].to(device), d['mask_arr'].to(device), d['y'].to(device)
                    seq_len = X.shape[1]
                    if seq_len <= 100:
                        continue
                    
                    optimizer.zero_grad()
                    
                    output = model(X)
                    loss = criterion(output[:, 100:], y[:, 100:].float())
                    loss.backward()
                    optimizer.step()
                    scheduler.step()
                    
                    with torch.no_grad():
                        prob = F.sigmoid(output).flatten(end_dim=1).detach().cpu().numpy()
                        y = y.flatten(end_dim=1).detach().cpu().numpy()
                        
                        level_score = roc_auc_score(y[:, -4:], prob[:, -4:], average='micro', multi_class='ovr')

                    
                    train_losses_batch.append(loss.item())
                    train_score_batch.append(level_score)
                    
                train_loss = np.mean(train_losses_batch)
                train_score = np.mean(train_score_batch)
                print(f"train loss: {train_loss:.4f}, level score: {train_score:.4f}")
                
                model.eval()
                ys = []
                probs = []
                pbar = tqdm(val_loader, total=len(val_loader), leave=False)
                with torch.no_grad():
                    for d in pbar:
                        X, X_mask, y = d['feature_arr'].to(device), d['mask_arr'].to(device), d['y'].to(device)
                        seq_len = X.shape[1]
                        if seq_len <= 100:
                            continue
                                                
                        output = model(X)
                        loss = criterion(output[:, 100:], y[:, 100:].float())
                        
                        prob = F.sigmoid(output).flatten(end_dim=1).cpu().numpy()
                        y = y.flatten(end_dim=1).cpu().numpy()
                        ys.append(y)
                        probs.append(prob)
                        
                        level_score = roc_auc_score(y[:, -4:], prob[:, -4:], average='micro', multi_class='ovr')
                        val_losses_batch.append(loss.item())
                        val_score_batch.append(level_score)
                        
                val_loss = np.mean(val_losses_batch)
                val_score = np.mean(val_score_batch)
                ys = np.concatenate(ys)
                probs = np.concatenate(probs)
                print(ys.shape, probs.shape)
                cm = confusion_matrix(ys[:, -4:].argmax(axis=1), probs[:, -4:].argmax(axis=1))


                

  0%|          | 0/594 [00:00<?, ?it/s]

train loss: 0.1699, level score: 0.9475


  0%|          | 0/172 [00:00<?, ?it/s]

val loss: 0.6603, level score: 0.7958
(2060000, 11) (2060000, 11)
Confusion Matrix:
[[443699   9852  62427 340656]
 [     0      0      0      0]
 [128486     35    852  50168]
 [140870   4593   1383 876979]]


  0%|          | 0/594 [00:00<?, ?it/s]

train loss: 0.0196, level score: 0.9992


  0%|          | 0/172 [00:00<?, ?it/s]

val loss: 0.8931, level score: 0.7970
(2060000, 11) (2060000, 11)
Confusion Matrix:
[[465342  13166  32039 346087]
 [     0      0      0      0]
 [145469    138    417  33517]
 [138139  12978   4761 867947]]


  0%|          | 0/594 [00:00<?, ?it/s]

train loss: 0.0066, level score: 0.9999


  0%|          | 0/172 [00:00<?, ?it/s]

val loss: 0.8548, level score: 0.8115
(2060000, 11) (2060000, 11)
Confusion Matrix:
[[596569  17383  43789 198893]
 [     0      0      0      0]
 [163847    184     34  15476]
 [189512  59650   7531 767132]]


  0%|          | 0/594 [00:00<?, ?it/s]

train loss: 0.0028, level score: 1.0000


  0%|          | 0/172 [00:00<?, ?it/s]

val loss: 0.9791, level score: 0.7842
(2060000, 11) (2060000, 11)
Confusion Matrix:
[[493106  38292  46208 279028]
 [     0      0      0      0]
 [147305    485     72  31679]
 [158644  17863   2909 844409]]


  0%|          | 0/594 [00:00<?, ?it/s]

train loss: 0.0016, level score: 1.0000


  0%|          | 0/172 [00:00<?, ?it/s]

val loss: 1.0044, level score: 0.8032
(2060000, 11) (2060000, 11)
Confusion Matrix:
[[532840  17704  42629 263461]
 [     0      0      0      0]
 [153606    131      4  25800]
 [179564  12300   2690 829271]]


In [30]:
ys = np.concatenate(ys)
probs = np.concatenate(probs)
print(ys.shape, probs.shape)
cm = confusion_matrix(ys[:, -4:].argmax(axis=1), probs[:, -4:].argmax(axis=1))
print(cm)

ValueError: zero-dimensional arrays cannot be concatenated