In [1]:
import os
import time
import math
import random
import numpy as np
import pandas as pd
import torch
from torch import nn, einsum
import torch.nn.functional as F
import torch_optimizer as optim
from torch.utils.data import TensorDataset,DataLoader
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from einops import rearrange
from einops.layers.torch import Rearrange

In [2]:
# Debug switch: when True, a later cell keeps only the first 1000 entries of
# X_tr, y_tr and X_test, and the final cell skips writing the submission CSV.
DEBUG = False

In [3]:
# Run identifier: names the checkpoint directory and the output files.
MODEL_NAME = 'transformer_v19'
# Prefix prepended to the CSV file names when the input files are read.
base_dir = "./"
# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of `if not os.path.exists(...)`.
os.makedirs(f'models/{MODEL_NAME}', exist_ok=True)

In [4]:
# Read the training and test tables; file names are appended to base_dir as-is.
train_df, test_df = (
    pd.read_csv(f"{base_dir}{csv_name}")
    for csv_name in ('train.csv', 'test.csv')
)

In [5]:
# Replace each distinct pressure value by an integer class id. `le` is kept at
# module level because later code maps ids back with le.inverse_transform.
le = LabelEncoder()
le.fit(train_df['pressure'])
train_df['pressure'] = le.transform(train_df['pressure'])

In [6]:
# Integer code for every (R, C) pair, keyed by the string "R_C".
rc_codes = {'20_50':0, '20_20':1, '50_20':2, '50_50':3, '5_50':4, '5_20':5, '50_10':6, '20_10':7, '5_10':8}

# Identical feature engineering for the train and the test frame.
for frame in (train_df, test_df):
    rc_key = frame['R'].astype(str) + '_' + frame['C'].astype(str)
    frame['RC'] = rc_key.map(rc_codes)
    # frame['u_in_cat'] = frame['u_in'].round().astype(int)
    # 0/1 indicators: u_in rounds to exactly 0 / exactly 5.
    u_in_rounded = frame['u_in'].round()
    frame['u_in_0'] = (u_in_rounded == 0).astype(int)
    frame['u_in_5'] = (u_in_rounded == 5).astype(int)

In [7]:
from sklearn.preprocessing import RobustScaler, StandardScaler

# Standardise u_in (after a shifted log1p) and time_step with statistics
# computed over the train and test rows together.
RS = StandardScaler()
all_df = pd.concat([train_df,test_df])

# Subtracting the combined minimum keeps the log1p argument at or above zero.
u_in_floor = all_df['u_in'].min()
for frame in (train_df, test_df, all_df):
    frame['u_in'] = np.log1p(frame['u_in'] - u_in_floor)

scaled_cols = ['u_in','time_step']
RS.fit(all_df[scaled_cols])
for frame in (train_df, test_df):
    frame[scaled_cols] = RS.transform(frame[scaled_cols])

# from sklearn.preprocessing import RobustScaler, StandardScaler
# RS = StandardScaler()
# all_df = pd.concat([train_df,test_df])
# RS.fit(all_df[['u_in','time_step']])
# train_df[['u_in','time_step']] = RS.transform(train_df[['u_in','time_step']])
# test_df[['u_in','time_step']] = RS.transform(test_df[['u_in','time_step']])

In [8]:
# Pack the selected columns into arrays of shape (n_sequences, 80, n_features):
# every 80 consecutive rows of a frame become one sequence.
feature_cols = ['RC','u_in','u_out','u_in_0','u_in_5','time_step']
X_tr, X_test = (
    frame[feature_cols].values.reshape(-1, 80, len(feature_cols))
    for frame in (train_df, test_df)
)
# X_tr = train_df[features].values.reshape(-1,80,18)
# X_test = test_df[features].values.reshape(-1,80,18)

In [9]:
# Encoded pressure targets, one row of 80 class ids per sequence.
y_tr = train_df['pressure'].to_numpy().reshape(-1, 80)

In [10]:
# Shape check of the packed arrays (displayed as the cell output).
(X_tr.shape, X_test.shape, y_tr.shape)

((75450, 80, 6), (50300, 80, 6), (75450, 80))

In [11]:
# In debug mode keep only the first 1000 sequences of every array.
if DEBUG:
    X_tr, y_tr, X_test = (arr[:1000] for arr in (X_tr, y_tr, X_test))

In [12]:
def train_one_epoch(model, optimizer, train_dataloader, epoch, device = torch.device('cpu')):
    """Run one optimisation pass over ``train_dataloader`` and return the mean loss.

    Every target class id is expanded into a soft 950-way target (0.8 on the
    true class, 0.1 on each directly adjacent class) and the model is trained
    with a per-class binary cross-entropy on sigmoid outputs. Only time steps
    whose feature ``X[:, :, 2]`` equals 0 contribute to the loss.

    Args:
        model: module mapping ``X`` of shape (B, L, C) to logits (B, L, 950).
        optimizer: optimizer stepping ``model``'s parameters.
        train_dataloader: iterable of ``(X, y)`` batches; ``y`` holds class ids.
        epoch: current epoch number (unused; kept for call compatibility).
        device: device the batches are moved to.

    Returns:
        The loss averaged over all contributing time steps of the epoch.
    """
    # Training mode: dropout and batch-norm statistics must be active here.
    # (Previously the model was put in eval mode, which disabled both.)
    model.train()
    n_classes = 950
    MA_loss = 0
    count = 0
    for X, y in train_dataloader:
        X = X.to(device)
        y = y.to(device)

        # F.one_hot requires an integer tensor; cast in case labels arrive as float.
        one_hot = F.one_hot(y.long(), n_classes).float()
        # Build the soft target from the untouched one-hot tensor. Shifting the
        # same tensor in place twice leaked 0.01 back onto the true class, so
        # it ended up at 1.01 and was never reset to 0.8.
        target = 0.8 * one_hot
        target[:, :, :-1] += 0.1 * one_hot[:, :, 1:]
        target[:, :, 1:] += 0.1 * one_hot[:, :, :-1]

        optimizer.zero_grad()
        keep = X[:, :, 2] == 0
        pred = model(X)

        pred = torch.sigmoid(pred[keep].reshape(-1, n_classes))
        target = target[keep].reshape(-1, n_classes)

        # Binary cross-entropy summed over classes, averaged over time steps;
        # the 1e-8 terms keep log() away from zero.
        loss = -torch.sum(target * torch.log(1e-8 + pred) + (1 - target) * torch.log(1 - pred + 1e-8), dim=-1).mean()
        loss.backward()
        optimizer.step()

        MA_loss += loss.item() * len(target)
        count += len(target)
    MA_loss /= count
    return MA_loss

def evaluation(model, val_dataloader, device = torch.device('cpu')):
    """Return the mean absolute error of ``model`` on ``val_dataloader``.

    The predicted class is the argmax of the model output. Predicted and true
    class ids are both mapped back to their original values through the
    module-level LabelEncoder ``le`` before the absolute error is taken. Only
    time steps where ``X[:, :, 2] == 0`` are scored.

    Args:
        model: module mapping ``X`` (B, L, C) to per-class scores (B, L, n_classes).
        val_dataloader: iterable of ``(X, y)`` batches; ``y`` holds class ids.
        device: device the batches are moved to.

    Returns:
        Absolute error averaged over all scored time steps.
    """
    model.eval()
    abs_err_total = 0
    n_scored = 0
    with torch.no_grad():
        for X, y in val_dataloader:
            X, y = X.to(device), y.to(device)
            keep = X[:,:,2] == 0

            # predicted class ids -> original values
            pred_ids = torch.argmax(model(X), dim=-1)[keep].reshape(-1).cpu().numpy()
            pred_vals = torch.Tensor(le.inverse_transform(pred_ids)).to(device)

            # true class ids -> original values
            true_ids = y[keep].reshape(-1).cpu().long().numpy()
            true_vals = torch.Tensor(le.inverse_transform(true_ids)).to(device)

            batch_mae = F.l1_loss(pred_vals, true_vals)
            # weight the batch mean by its number of scored steps
            abs_err_total += batch_mae.item() * len(true_vals)
            n_scored += len(true_vals)
        abs_err_total /= n_scored
    return abs_err_total

def inference(model, test_dataloader, device = torch.device('cpu'), istest = False):
    """Run ``model`` over ``test_dataloader`` and collect its predictions.

    Args:
        model: module mapping ``X`` (B, L, C) to per-class scores (B, L, n_classes).
        test_dataloader: yields ``(X,)`` batches when ``istest`` is truthy,
            otherwise ``(X, y)`` batches (``y`` is ignored).
        device: device the inputs are moved to.
        istest: truthy -> return the raw scores as a float16 array;
            ``False`` -> return argmax class ids mapped back through the
            module-level LabelEncoder ``le``.

    Returns:
        numpy array with the batches concatenated along the first axis.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch in test_dataloader:
            if istest:
                features = batch[0]
            else:
                features, _ = batch
            scores = model(features.to(device)).cpu()
            if istest:
                # keep the full score tensor, stored in half precision
                # scores = torch.argmax(scores,dim=-1)
                collected.append(scores.half())
            else:
                collected.append(torch.argmax(scores, dim=-1))
    stacked = torch.cat(collected, dim=0).numpy()
    if istest == False:
        flat_values = le.inverse_transform(stacked.reshape(-1))
        return flat_values.reshape(stacked.shape)
    return stacked

In [13]:
import torch
from torch import nn, einsum
import torch.nn.functional as F

from einops import rearrange
from einops.layers.torch import Rearrange

# helper functions

def exists(val):
    """Return True when ``val`` is anything other than ``None``."""
    if val is None:
        return False
    return True

def default(val, d):
    """Return ``val`` unless it is ``None``; otherwise return the fallback ``d`` unchanged."""
    if exists(val):
        return val
    return d

def calc_same_padding(kernel_size):
    """Return ``(left, right)`` padding that keeps a stride-1 convolution's length unchanged.

    For an even ``kernel_size`` the right side gets one element less than the left.
    """
    left = kernel_size // 2
    right = left - (kernel_size + 1) % 2
    return (left, right)

# helper classes

class Swish(nn.Module):
    """Swish activation: the input multiplied by its own sigmoid."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate

class GLU(nn.Module):
    """Gated linear unit: split the input in two along ``dim`` and gate the
    first half with the sigmoid of the second half."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        value, gate = torch.chunk(x, 2, dim=self.dim)
        return value * torch.sigmoid(gate)

class DepthWiseConv1d(nn.Module):
    """1-D convolution with ``groups = chan_in`` that pads its input explicitly.

    Args:
        chan_in: number of input channels (also the number of groups).
        chan_out: number of output channels.
        kernel_size: convolution kernel width.
        padding: ``(left, right)`` amounts handed to ``F.pad`` on the last axis.
    """

    def __init__(self, chan_in, chan_out, kernel_size, padding):
        super().__init__()
        self.padding = padding
        self.conv = nn.Conv1d(
            in_channels = chan_in,
            out_channels = chan_out,
            kernel_size = kernel_size,
            groups = chan_in,
        )

    def forward(self, x):
        padded = F.pad(x, self.padding)
        return self.conv(padded)

# attention, feedforward, and conv module

class Scale(nn.Module):
    """Wrap ``fn`` and multiply its output by the constant ``scale``."""

    def __init__(self, scale, fn):
        super().__init__()
        self.fn = fn
        self.scale = scale

    def forward(self, x, **kwargs):
        inner = self.fn(x, **kwargs)
        return inner * self.scale

class PreNorm(nn.Module):
    """Apply LayerNorm over the last ``dim`` features, then call ``fn`` on the result.

    Extra keyword arguments are passed through to ``fn`` untouched.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        return self.fn(normed, **kwargs)

class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: input and output feature size.
        mult: the hidden layer has ``dim * mult`` features.
        dropout: dropout probability used after the activation and after the
            output projection.
    """

    def __init__(
        self,
        dim,
        mult = 4,
        dropout = 0.
    ):
        super().__init__()
        hidden = dim * mult
        layers = [
            nn.Linear(dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, dim),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class ConformerConvModule(nn.Module):
    """Convolution sub-block operating on (batch, length, dim) input.

    Pipeline: LayerNorm -> pointwise conv to ``2 * inner_dim`` channels -> GLU
    -> depthwise conv -> BatchNorm (skipped when ``causal``) -> Swish ->
    pointwise conv back to ``dim`` -> Dropout.

    Args:
        dim: feature size of the input and output.
        causal: when True, pad only on the left (``kernel_size - 1``) and
            replace the BatchNorm with an identity.
        expansion_factor: ``inner_dim = dim * expansion_factor``.
        kernel_size: width of the depthwise convolution.
        dropout: dropout probability applied to the output.
    """

    def __init__(
        self,
        dim,
        causal = False,
        expansion_factor = 2,
        kernel_size = 31,
        dropout = 0.):
        super().__init__()

        inner_dim = dim * expansion_factor
        if causal:
            padding = (kernel_size - 1, 0)
        else:
            padding = calc_same_padding(kernel_size)

        stages = [
            nn.LayerNorm(dim),
            # convolutions expect channels before length
            Rearrange('b n c -> b c n'),
            nn.Conv1d(dim, inner_dim * 2, 1),
            GLU(dim=1),
            DepthWiseConv1d(inner_dim, inner_dim, kernel_size = kernel_size, padding = padding),
            nn.Identity() if causal else nn.BatchNorm1d(inner_dim),
            Swish(),
            nn.Conv1d(inner_dim, dim, 1),
            Rearrange('b c n -> b n c'),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)

In [14]:
class CustomAttention(nn.Module):
    """Multi-head attention with an additive score term derived from pairwise
    differences of a per-token ``position`` value.

    The difference ``position[i] - position[j]`` is passed through a small MLP
    to produce a ``dim_head``-sized embedding for every (i, j) pair; its dot
    product with the query is added to the scaled query-key scores.

    Args:
        dim: input and output feature size.
        heads: number of attention heads.
        dim_head: feature size of each head.
        dropout: dropout probability applied to the projected output.
        max_pos_emb: stored on the module; ``forward`` does not read it.
        causal: when True, a fixed triangular penalty is subtracted from the
            scores (see the note in ``forward``).
    """

    def __init__(
        self,
        dim,
        heads = 8,
        dim_head = 64,
        dropout = 0.,
        max_pos_emb = 512,
        causal = False
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads= heads
        self.scale = dim_head ** -0.5
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.max_pos_emb = max_pos_emb
        # self.rel_pos_emb1 = nn.Linear(1, heads, bias=False)
        # self.rel_pos_emb2 = nn.Linear(1, heads, bias=False)
        # MLP mapping a scalar position difference to a dim_head-sized vector.
        self.rel_pos_emb = nn.Sequential(nn.Linear(1, dim_head),nn.GELU(),nn.Linear(dim_head, dim_head))
        self.dropout = nn.Dropout(dropout)

        self.causal = causal

    def forward(self, x, position, context = None, mask = None, context_mask = None):
        """Attend from ``x`` to ``context`` (or to ``x`` itself).

        Args:
            x: query source, shape (batch, n, dim).
            position: per-token scalar, shape (batch, n); pairwise differences
                feed the positional score term.
            context: optional key/value source; defaults to ``x``.
            mask: optional (batch, n) mask over query positions, truthy = keep.
            context_mask: optional mask over key positions, truthy = keep.
                Without a context it defaults to ``mask``.

        Returns:
            Tensor of shape (batch, n, dim).
        """
        device, h, has_context = x.device, self.heads, exists(context)
        context = default(context, x)

        q = self.to_q(x)
        k, v = self.to_kv(context).chunk(2, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))

        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        # positional score term built from pairwise position differences
        dist = rearrange(position, 'b i -> b i () ()') - rearrange(position, 'b j -> b () j ()')
        # rel_pos_emb = self.rel_pos_emb1(F.relu(dist)) + self.rel_pos_emb2(F.relu(-dist))   #(bijh)
        # pos_attn = rel_pos_emb.permute(0,3,1,2)
        rel_pos_emb = self.rel_pos_emb(dist)
        pos_attn = einsum('b h n d, b n r d -> b h n r', q, rel_pos_emb) * self.scale
        dots = dots + pos_attn

        if exists(mask) or exists(context_mask):
            # Build missing masks directly as boolean tensors. The `default`
            # helper returns its fallback as-is, so handing it a lambda left a
            # function object where a tensor was expected.
            if not exists(mask):
                mask = torch.ones(*x.shape[:2], device = device, dtype = torch.bool)
            if not exists(context_mask):
                if has_context:
                    context_mask = torch.ones(*context.shape[:2], device = device, dtype = torch.bool)
                else:
                    context_mask = mask
            mask_value = -torch.finfo(dots.dtype).max
            # Cast to bool so `~` and masked_fill_ also accept 0/1 integer or float masks.
            pair_mask = rearrange(mask.bool(), 'b i -> b () i ()') & rearrange(context_mask.bool(), 'b j -> b () () j')
            dots.masked_fill_(~pair_mask, mask_value)

        if self.causal:
            # NOTE(review): the transposed lower-triangular matrix has ones on
            # AND above the diagonal, so each position's own score is penalised
            # as well — confirm this is intended before enabling causal=True.
            causal_penalty = torch.tril(torch.ones(dots.shape[-2:],device=dots.device)).T
            causal_penalty = rearrange(causal_penalty, 'n r -> () () n r')
            dots = dots - causal_penalty * 999

        attn = dots.softmax(dim = -1)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return self.dropout(out)


class CustomConformerBlock(nn.Module):
    """One encoder block: half-scaled feed-forward, position-aware attention,
    convolution module, and a second half-scaled feed-forward, each wrapped in
    a residual connection.

    Args:
        dim: feature size of the block input and output.
        dim_head: per-head feature size of the attention.
        heads: number of attention heads.
        ff_mult: hidden-size multiplier of both feed-forward modules.
        conv_expansion_factor: channel expansion inside the convolution module.
        conv_kernel_size: depthwise kernel width of the convolution module.
        attn_dropout, ff_dropout, conv_dropout: dropout rates of the sub-modules.
        causal: forwarded to the attention and the convolution module.
    """

    def __init__(
        self,
        *,
        dim,
        dim_head = 64,
        heads = 8,
        ff_mult = 4,
        conv_expansion_factor = 2,
        conv_kernel_size = 31,
        attn_dropout = 0.,
        ff_dropout = 0.,
        conv_dropout = 0.,
        causal = False
    ):
        super().__init__()
        # Sub-modules are created and registered in the order ff1, attn, conv, ff2.
        self.ff1 = Scale(0.5, PreNorm(dim, FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)))
        self.attn = PreNorm(
            dim,
            CustomAttention(dim = dim, dim_head = dim_head, heads = heads, dropout = attn_dropout, causal = causal),
        )
        self.conv = ConformerConvModule(
            dim = dim,
            causal = causal,
            expansion_factor = conv_expansion_factor,
            kernel_size = conv_kernel_size,
            dropout = conv_dropout,
        )
        self.ff2 = Scale(0.5, PreNorm(dim, FeedForward(dim = dim, mult = ff_mult, dropout = ff_dropout)))

    def forward(self, x, pos, mask = None):
        """Apply the four residual sub-layers to ``x``; ``pos`` and ``mask`` go to the attention."""
        ff_first = self.ff1(x)
        x = ff_first + x
        attended = self.attn(x, position = pos, mask = mask)
        x = attended + x
        convolved = self.conv(x)
        x = convolved + x
        ff_second = self.ff2(x)
        x = ff_second + x
        return x

In [15]:
class PositionalEncoding(nn.Module):
    """Fixed sinusoidal table stored as the buffer ``pe`` of shape (1, max_len, d_model).

    Even feature indices hold sines and odd indices hold cosines of
    ``position * 10000 ** (-k / d_model)`` for ``k = 0, 2, 4, ...``.
    """

    def __init__(self, d_model, max_len = 5000):
        super().__init__()
        steps = torch.arange(max_len).unsqueeze(1)
        freq_scale = -math.log(10000.0) / d_model
        div_term = torch.exp(torch.arange(0, d_model, 2) * freq_scale)
        angles = steps * div_term
        table = torch.zeros(1, max_len, d_model)
        table[0, :, 0::2] = torch.sin(angles)
        table[0, :, 1::2] = torch.cos(angles)
        self.register_buffer('pe', table)

    def forward(self):
        """Return the complete table; no input is taken."""
        return self.pe

class BrainModel(nn.Module):
    """Sequence model mapping (B, L, 6) feature tensors to (B, L, 950) class scores.

    Feature 0 is looked up in an embedding with 9 entries, features 1..5 go
    through a two-layer MLP, and the two results are concatenated and projected
    to the model width. Four ``CustomConformerBlock`` layers follow, using the
    last feature as the per-step position, and a two-layer head produces 950
    scores per step.
    """

    def __init__(self):
        super().__init__()
        width = 256
        depth = 4
        self.input_layer = nn.Sequential(
            nn.Linear(5, width), nn.Mish(),
            nn.Linear(width, width), nn.Mish(),
        )
        self.emb_RC = nn.Embedding(9, width)
        # Not referenced in forward(); it is still registered, so it is part of
        # the state_dict that load_state_dict matches against checkpoints.
        self.emb_u_in = nn.Embedding(101, width)
        self.scale_layer = nn.Linear(2 * width, width)
        block_kwargs = dict(
            dim = width,
            dim_head = width // 8,
            heads = 8,
            ff_mult = 4,
            conv_expansion_factor = 2,
            conv_kernel_size = 5,
            attn_dropout = 0.1,
            ff_dropout = 0.2,
            conv_dropout = 0.05,
            causal = False,
        )
        self.encoder = nn.ModuleList([CustomConformerBlock(**block_kwargs) for _ in range(depth)])
        self.fc = nn.Sequential(nn.Linear(width, width), nn.Mish(), nn.Linear(width, 950))

    def forward(self, X):
        """Return per-step class scores for ``X`` of shape (B, L, C)."""
        pos = X[:,:,-1]  # last feature doubles as the position fed to attention
        dense = self.input_layer(X[:,:,1:])
        rc_emb = self.emb_RC(X[:,:,0].long())
        hidden = self.scale_layer(torch.cat([dense, rc_emb], dim=-1))
        for block in self.encoder:
            hidden = block(hidden, pos)
        return self.fc(hidden)

In [16]:
# N_EPOCHS = 300
# N_FOLDS = 5
# BATCH_SIZE = 64
# oof_path = f"oofs/oof_{MODEL_NAME}"
# device = torch.device('cuda:1')
# kf = StratifiedKFold(N_FOLDS,shuffle=True, random_state=42)
# test_dataset = TensorDataset(torch.Tensor(X_test))
# test_dataloader = DataLoader(test_dataset, batch_size=3*BATCH_SIZE, shuffle=False, drop_last=False, num_workers=0)
# oof = np.zeros(y_tr.shape)
# y_true = le.inverse_transform(y_tr.reshape(-1)).reshape(y_tr.shape)
# test_pred = np.zeros([len(X_test),y_tr.shape[1],950])
# for fold, (train_index, val_index) in enumerate(kf.split(X_tr[:,0,0], X_tr[:,0,0])):
#     print("fold:",fold)
#     model_path = f'models/{MODEL_NAME}/model_{fold}.pt'
#     train_dataset = TensorDataset(torch.Tensor(X_tr[train_index]),torch.Tensor(y_tr[train_index]))
#     val_dataset = TensorDataset(torch.Tensor(X_tr[val_index]),torch.Tensor(y_tr[val_index]))
#     train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=0)
#     val_dataloader = DataLoader(val_dataset, batch_size=3*BATCH_SIZE, shuffle=False, drop_last=False, num_workers=0)
#     model = BrainModel().to(device)
#     model.load_state_dict(torch.load(model_path))
#     val_pred = inference(model, val_dataloader, device, False)
#     oof[val_index] = val_pred
#     test_pred += inference(model, test_dataloader, device, True)
#     mask = X_tr[val_index,:,2]==0
#     print(f"fold {fold}, score:",mean_absolute_error(y_true[val_index][mask], oof[val_index][mask]))
# mask = X_tr[:,:,2]==0
# print("CV score:",mean_absolute_error(y_true[mask][oof[mask]!=0], oof[mask][oof[mask]!=0]))
# test_pred = test_pred.argmax(-1)
# test_pred = le.inverse_transform(test_pred.reshape(-1)).reshape(test_pred.shape)
# np.save(oof_path,oof)

fold: 0
fold 0, score: 0.1420778559669055
fold: 1
fold 1, score: 0.1386840213427894
fold: 2
fold 2, score: 0.139729647541404
fold: 3
fold 3, score: 0.1389837594911228
fold: 4
fold 4, score: 0.1429318176443991
CV score: 0.14048080780127353


In [17]:
# if not DEBUG:
#     sub_path = f"subs/submission_logits_{MODEL_NAME}.csv"
#     sub = pd.read_csv(base_dir+"sample_submission.csv")
#     sub['pressure'] = test_pred.reshape(-1)
#     sub.to_csv(sub_path,index=False)

In [16]:
# Out-of-fold and test inference with one fine-tuned checkpoint per (fold, RC).
N_EPOCHS = 300
N_FOLDS = 5
BATCH_SIZE = 64
oof_path = f"oofs/oof_finetune_{MODEL_NAME}"
# np.save does not create missing parent directories.
os.makedirs(os.path.dirname(oof_path), exist_ok=True)
# NOTE(review): the device index is hard-coded; adjust it for the machine in use.
device = torch.device('cuda:1')
kf = StratifiedKFold(N_FOLDS,shuffle=True, random_state=42)
oof = np.zeros(y_tr.shape)
y_true = le.inverse_transform(y_tr.reshape(-1)).reshape(y_tr.shape)
# Raw class scores for the test set, summed over folds before the argmax.
test_pred = np.zeros([len(X_test),y_tr.shape[1],950])
# RC code of every sequence (feature 0 of its first time step).
rc_tr = X_tr[:,0,0]
rc_test = X_test[:,0,0]
for fold, (train_index, val_index) in enumerate(kf.split(rc_tr, rc_tr)):
    print("fold:",fold)
    for RC in tqdm(range(9)):
        model_path = f'models/{MODEL_NAME}/model_{fold}_RC_{RC}.pt'

        # np.where returns a tuple; take its index array explicitly.
        val_idx = np.intersect1d(val_index, np.where(rc_tr == RC)[0])
        test_idx = rc_test == RC

        val_dataset = TensorDataset(torch.Tensor(X_tr[val_idx]),torch.Tensor(y_tr[val_idx]))
        test_dataset = TensorDataset(torch.Tensor(X_test[test_idx]))
        val_dataloader = DataLoader(val_dataset, batch_size=3*BATCH_SIZE, shuffle=False, drop_last=False, num_workers=0)
        test_dataloader = DataLoader(test_dataset, batch_size=3*BATCH_SIZE, shuffle=False, drop_last=False, num_workers=0)

        model = BrainModel().to(device)
        # map_location loads the checkpoint onto `device` regardless of the
        # device it was saved from.
        model.load_state_dict(torch.load(model_path, map_location=device))

        val_pred = inference(model, val_dataloader, device, False)
        oof[val_idx] = val_pred
        test_pred[test_idx] += inference(model, test_dataloader, device, True)
    mask = X_tr[val_index,:,2]==0
    print(f"fold {fold}, score:",mean_absolute_error(y_true[val_index][mask], oof[val_index][mask]))
mask = X_tr[:,:,2]==0
# Positions whose out-of-fold prediction is exactly 0 are excluded from the CV score.
print("CV score:",mean_absolute_error(y_true[mask][oof[mask]!=0], oof[mask][oof[mask]!=0]))
np.save(oof_path,oof)
test_pred = test_pred.argmax(-1)
test_pred = le.inverse_transform(test_pred.reshape(-1)).reshape(test_pred.shape)

fold: 0


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


fold 0, score: 0.13952747260305273
fold: 1


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


fold 1, score: 0.13633791152930647
fold: 2


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


fold 2, score: 0.13688889797250622
fold: 3


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


fold 3, score: 0.1368432903358686
fold: 4


HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))


fold 4, score: 0.1407616392015034
CV score: 0.13807126093594196


In [17]:
# Write the submission file (skipped in debug mode, where test_pred is truncated).
if not DEBUG:
    sub_path = f"subs/submission_finetune_logits_{MODEL_NAME}.csv"
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(sub_path), exist_ok=True)
    sub = pd.read_csv(base_dir+"sample_submission.csv")
    sub['pressure'] = test_pred.reshape(-1)
    sub.to_csv(sub_path,index=False)