In [1]:
!pip install einops -q
!pip install torch_optimizer -q
!python -m pip install git+https://github.com/lessw2020/Ranger21.git

Collecting git+https://github.com/lessw2020/Ranger21.git
  Cloning https://github.com/lessw2020/Ranger21.git to /tmp/pip-req-build-muxlxlht
  Running command git clone -q https://github.com/lessw2020/Ranger21.git /tmp/pip-req-build-muxlxlht
  Resolved https://github.com/lessw2020/Ranger21.git to commit 0a906ef9df4a4c394a48e5778b2b94f2c8e1ce8e
Building wheels for collected packages: ranger21
  Building wheel for ranger21 (setup.py) ... [?25l- \ done
[?25h  Created wheel for ranger21: filename=ranger21-0.0.1-py3-none-any.whl size=24730 sha256=ec1bd76486053341cf50c038f00493691349ed36e8cec786bd48637d58edeb82
  Stored in directory: /tmp/pip-ephem-wheel-cache-7cim0fq8/wheels/31/42/43/ea29fca967e54e6001d53a9c7acf2071d33154f26a27b18cb6
Successfully built ranger21
Installing collected packages: ranger21
Successfully installed ranger21-0.0.1


In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch import nn, einsum
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import GroupKFold
from tqdm.auto import tqdm, trange
from einops import rearrange
from einops.layers.torch import Rearrange
import torch_optimizer as optim
from ranger21 import Ranger21

In [3]:
asset_info = pd.read_csv("../input/g-research-crypto-forecasting/asset_details.csv").set_index('Asset_ID').sort_index()

In [4]:
target_weight = asset_info['Weight'].values.reshape(1,-1)

In [5]:
# ['Asset_ID','Count', 'Open', 'High', 'Low', 'Close','Volume', 'VWAP','time_sin','time_cos']
X = np.load("../input/crypto-dataset-preparation-v2/crypto_X.npy")
y = np.load("../input/crypto-dataset-preparation-v2/crypto_y.npy")

In [6]:
date = pd.to_datetime(pd.Series(np.arange(len(X))*60+1514764860),unit='s').dt.date.astype(str)

In [7]:
# mask = date >= '2019-01-01'
# X = X[mask]
# y = y[mask]
# date = date[mask].reset_index(drop=True)

In [8]:
X = X.astype(np.float32)
y = y.astype(np.float32)

X[:,:,-4] = np.log1p(X[:,:,-4])
X[:,:,-3] = np.log1p(X[:,:,-3])

X[np.isnan(X)|np.isinf(X)] = 0

  """


In [9]:
class CryptoDataset(Dataset):
    def __init__(self, X,y,ts=60):
        self.ts = ts
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        indices = np.arange(idx-self.ts+1,idx+1)
        X = self.X[indices]
        y = self.y[idx]
        return X,y

# def corr_metric(y_true, y_pred):
#     score = 0
#     for i in range(y_pred.shape[1]):
#         mask = (~np.isnan(y_true[:,i])) & (y_pred[:,i]!=0)
#         y = y_true[:,i][mask]
#         pred = y_pred[:,i][mask]
#         a = (y - y.mean()) / y.std()
#         b = (pred - pred.mean()) / pred.std()
#         corr = (a * b).mean()
#         score += target_weight[i] * corr
#     score /= target_weight.sum()
#     return score
    
def wmean(x, w):
    return np.sum(x * w) / np.sum(w)

def wcov(x, y, w):
    return np.sum(w * (x - wmean(x, w)) * (y - wmean(y, w))) / np.sum(w)

def wcorr(x, y, w):
    return wcov(x, y, w) / np.sqrt(wcov(x, x, w) * wcov(y, y, w))
    
def corr_metric(y_true, y_pred):
    w = target_weight.repeat(len(y_true),1).reshape(-1)
    yt = y_true.reshape(-1)
    yp = y_pred.reshape(-1)
    mask = (~np.isnan(yt)) & (yp!=0)
    w = w[mask]
    yt = yt[mask]
    yp = yp[mask]
    return wcorr(yp, yt, w)

# def criterion(pred, label):
#     y1 = pred
#     y2 = (label - label.mean()) / label.std()
#     loss = -(y1 * y2).mean()
#     return loss
    
# def criterion(pred, label):
#     y1 = pred
#     y2 = (label - label.mean()) / label.std()
#     loss = -(y1 * y2).mean()
#     return loss
    
def criterion(a, b, w):
    sum_w = torch.sum(w)
    mean_a = torch.sum(a * w) / sum_w
    mean_b = torch.sum(b * w) / sum_w
    var_a = torch.sum(w * torch.square(a - mean_a)) / sum_w
    var_b = torch.sum(w * torch.square(b - mean_b)) / sum_w

    cov = torch.sum((a * b * w)) / torch.sum(w) - mean_a * mean_b
    corr = cov / torch.sqrt(var_a * var_b)
    loss = -corr
    return loss
    
def get_dataloader(dataset, indices, batch_size,istrain=True):
    d = Subset(dataset, indices)
    d = DataLoader(d, batch_size=batch_size, num_workers=2,shuffle=istrain,drop_last=istrain)
    return d

def train_one_epoch(model, optimizer, scheduler, train_dataloader, device = torch.device('cpu')):
    model.train()
    MA_loss = 0
    count = 0
    for X,y in train_dataloader:
        X = X.to(device)
        y = y.to(device)
        w = torch_target_weight.repeat(len(y),1)
        optimizer.zero_grad()
        
        mixup_mask = (torch.rand(len(y)) < 0.2).to(device)
        random_samples = torch.randint(0,len(y),(len(y),)).to(device)
        X2 = X.clone()
        X2[mixup_mask] = X2[random_samples[mixup_mask]]
        y[mixup_mask] = (y[mixup_mask] + y[random_samples[mixup_mask]]) / 2
        
        pred = model(X, X2)
        mask = ~torch.isnan(y.reshape(-1))
        loss = criterion(pred.reshape(-1)[mask],y.reshape(-1)[mask],w.reshape(-1)[mask])
        loss.backward()
        optimizer.step()
        scheduler.step()
        MA_loss += loss.item() * len(y)
        count += len(y)
    MA_loss /= count
    return MA_loss

def evaluation(model, val_dataloader, device = torch.device('cpu')):
    model.eval()
    MA_loss = 0
    count = 0
    predictions = []
    labels = []
    with torch.no_grad():
        for X,y in val_dataloader:
            X = X.to(device)
            y = y.to(device)
            w = torch_target_weight.repeat(len(y),1)
            pred = model(X)
            mask = ~torch.isnan(y.reshape(-1))
            loss = criterion(pred.reshape(-1)[mask],y.reshape(-1)[mask],w.reshape(-1)[mask])
            MA_loss += loss.item() * len(y)
            count += len(y)
            predictions.append(pred.cpu())
            labels.append(y.cpu())
        MA_loss /= count
    predictions = torch.cat(predictions,dim=0).numpy()
    labels = torch.cat(labels,dim=0).numpy()
    score = corr_metric(labels, predictions)
    return MA_loss, score, predictions, labels

In [10]:
def exists(val):
    return val is not None

def default(val, d):
    return val if exists(val) else d

class Attention(nn.Module):
    def __init__(
        self,
        dim,
        heads = 8,
        dim_head = 64,
        dropout = 0.,
        max_pos_emb = 512
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads= heads
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_kv = nn.Linear(dim, inner_dim * 2, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)

        self.rel_pos_emb = nn.Parameter(torch.zeros([1,heads,max_pos_emb,max_pos_emb]))
        self.scale = nn.Parameter(torch.full([1,heads,max_pos_emb,max_pos_emb] ,dim_head ** -0.5))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context = None, mask = None, context_mask = None):
        n, device, h, has_context = x.shape[-2], x.device, self.heads, exists(context)
        context = default(context, x)

        q, k, v = (self.to_q(x), *self.to_kv(context).chunk(2, dim = -1))
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))

        dots = einsum('b h i d, b h j d -> b h i j', q, k)
        dots = dots * self.scale + self.rel_pos_emb

        if exists(mask) or exists(context_mask):
            mask = default(mask, lambda: torch.ones(*x.shape[:2], device = device))
            context_mask = default(context_mask, mask) if not has_context else default(context_mask, lambda: torch.ones(*context.shape[:2], device = device))
            mask_value = -torch.finfo(dots.dtype).max
            mask = rearrange(mask, 'b i -> b () i ()') * rearrange(context_mask, 'b j -> b () () j')
            dots.masked_fill_(~mask, mask_value)

        attn = dots.softmax(dim = -1)

        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        out = self.to_out(out)
        return self.dropout(out)

In [11]:
class TransformerLayer2d(nn.Module):
    def __init__(self, d_model, nheads, d_hidden, dropout=0.0, nts=90, nasset=14):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
#         self.mlp1 = nn.Sequential(nn.Linear(d_ts,d_hidden),nn.GELU(),nn.Dropout(dropout),
#                                   nn.Linear(d_hidden,d_ts),nn.Dropout(dropout))
        self.attn1 = Attention(d_model, nheads, d_model//nheads, dropout, max_pos_emb=nts)
        self.norm2 = nn.LayerNorm(d_model)
#         self.mlp2 = nn.Sequential(nn.Linear(d_asset,d_hidden),nn.GELU(),nn.Dropout(dropout),
#                                   nn.Linear(d_hidden,d_asset),nn.Dropout(dropout))
        self.attn2 = Attention(d_model, nheads, d_model//nheads, dropout, max_pos_emb=nasset)
        
        self.norm3 = nn.LayerNorm(d_model)
        self.mlp3 = nn.Sequential(nn.Linear(d_model,d_hidden),nn.GELU(),nn.Dropout(dropout),
                                  nn.Linear(d_hidden,d_model),nn.Dropout(dropout))
    def forward(self, x):
        #(B,ts,asset,D)
        B,T,A,D = x.shape
        
        x2 = self.norm1(x)
        x2 = rearrange(x2, 'b t a d -> (b a) t d')
        x2 = self.attn1(x2)
        x2 = rearrange(x2, '(b a) t d -> b t a d', b=B)
        x = x + x2
        
        x2 = self.norm2(x)
        x2 = rearrange(x2, 'b t a d -> (b t) a d')
        x2 = self.attn2(x2)
        x2 = rearrange(x2, '(b t) a d -> b t a d', b=B)
        x = x + x2
        
        x2 = self.norm3(x)
        x2 = self.mlp3(x2)
        x = x + x2
        return x

class TransformerLayer1d(nn.Module):
    def __init__(self, d_model, nheads, d_hidden, dropout=0.0, nasset=14):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.attn1 = Attention(d_model, nheads, d_model//nheads, dropout, max_pos_emb=nasset)
        self.norm2 = nn.LayerNorm(d_model)
        self.mlp2 = nn.Sequential(nn.Linear(d_model,d_hidden),nn.GELU(),nn.Dropout(dropout),
                                  nn.Linear(d_hidden,d_model),nn.Dropout(dropout))
    def forward(self, x):
        #(B,asset,D)
        x2 = self.norm1(x)
        x2 = self.attn1(x2)
        x = x + x2
        
        x2 = self.norm2(x)
        x2 = self.mlp2(x2)
        x = x + x2
        return x
    
class CryptoModel(nn.Module):
    def __init__(self, SEQ_LENGTH=60):
        super().__init__()
        d_model = 64
        n_layers = 2
        nheads = 4
        self.norm = nn.InstanceNorm1d(7)
        self.asset_emb = nn.Embedding(14,d_model)
        self.dense_emb = nn.Linear(9,d_model)
        self.encoder1 = nn.ModuleList()
        for i in range(n_layers):
            self.encoder1.append(TransformerLayer2d(d_model,nheads,4*d_model,0.2,SEQ_LENGTH,14))
        self.post_norm1 = nn.LayerNorm(d_model)
        self.decoder1 = nn.Sequential(nn.Linear(SEQ_LENGTH,d_model),nn.GELU(),nn.Dropout(0.2),
                                  nn.Linear(d_model,1))
        self.encoder2 = nn.ModuleList()
        for i in range(n_layers):
            self.encoder2.append(TransformerLayer1d(d_model,nheads,4*d_model,0.2,14))
        self.fc = nn.Sequential(nn.Linear(d_model,d_model),nn.GELU(),nn.Dropout(0.0),
                                  nn.Linear(d_model,1))
        self.out_norm = nn.BatchNorm1d(1, eps=0, affine=False)
        
        self.pe1 = nn.Parameter(torch.empty([1,SEQ_LENGTH,1,d_model]))
        self.pe2 = nn.Parameter(torch.empty([1,1,14,d_model]))
        nn.init.normal_(self.pe1, 0.0, 0.2)
        nn.init.normal_(self.pe2, 0.0, 0.2)
        
        self.missing_emb = nn.Parameter(torch.empty([1,SEQ_LENGTH,14,d_model]))
        nn.init.normal_(self.missing_emb)
            
    def forward(self, inp, inp2=None):
        B,T,A,D = inp.shape
        mask = inp.abs().sum(-1) == 0
        inp = rearrange(inp,'b t a d -> (b a) d t')
        inp[:,1:8] = self.norm(inp[:,1:8])
        inp = rearrange(inp,'(b a) d t -> b t a d', b=B)
        X = self.asset_emb(inp[:,:,:,0].long()) + self.dense_emb(inp[:,:,:,1:])
        X[mask] = self.missing_emb.repeat([len(X),1,1,1])[mask]
        if not (inp2 is None):
            mask = inp2.abs().sum(-1) == 0
            inp2 = rearrange(inp2,'b t a d -> (b a) d t')
            inp2[:,1:8] = self.norm(inp2[:,1:8])
            inp2 = rearrange(inp2,'(b a) d t -> b t a d', b=B)
            X2 = self.asset_emb(inp2[:,:,:,0].long()) + self.dense_emb(inp2[:,:,:,1:])
            X2[mask] = self.missing_emb.repeat([len(X2),1,1,1])[mask]
            X = (X + X2) / 2
            
        X = X + self.pe1 + self.pe2
        for layer in self.encoder1:
            X = layer(X)
        X = self.post_norm1(X)
        X = rearrange(X,'b t a d -> b a d t')
        X = self.decoder1(X).squeeze(-1)
        for layer in self.encoder2:
            X = layer(X)
        y = self.fc(X).squeeze(-1)
        y = self.out_norm(y.unsqueeze(1)).squeeze(1)
        return y

In [12]:
BATCH_SIZE = 256
EPOCHS = 10
GROUPS = 3
SEQ_LENGTH = 60
FREQ = 5

device = torch.device('cuda')
kf = GroupKFold(GROUPS)
dataset = CryptoDataset(X,y,ts=SEQ_LENGTH)
Feasible_indices = np.arange(len(X))
Feasible_indices = Feasible_indices[(Feasible_indices>=SEQ_LENGTH-1)&(y.sum(-1).sum(-1)!=0)]
Evaluation_indices = np.where(date >= '2021-01-01')[0]
torch_target_weight = torch.Tensor(target_weight).to(device)

In [13]:
oof = np.zeros(y.shape)
group = date.values
for fold, (train_index, val_index) in enumerate(kf.split(group,group,group)):
    print("fold:",fold)
    model_path = f"model_fold{fold}.pt"

    train_indices = np.intersect1d(train_index, Feasible_indices)
    val_indices = np.intersect1d(val_index, Feasible_indices)
    
    val_dataloader = get_dataloader(dataset, val_indices, 3*BATCH_SIZE,False)

    model = CryptoModel(SEQ_LENGTH).to(device)
    optimizer = optim.Ranger(model.parameters(),lr=1e-3,alpha=0.5,k=5,weight_decay=0)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer,
                                                                     10*(len(train_indices)//(BATCH_SIZE*FREQ)),
                                                                     T_mult=2)

    best_score = -1.0
    for epoch in trange(EPOCHS):
        train_dataloader = get_dataloader(dataset, train_indices[epoch%FREQ::FREQ], BATCH_SIZE,True)
        train_loss = train_one_epoch(model, optimizer, scheduler, train_dataloader, device)
        val_loss, val_score, val_pred, val_label = evaluation(model, val_dataloader, device)
        if val_score > best_score:
            best_score = val_score
            oof[val_indices] = val_pred
            torch.save(model.state_dict(), model_path)
        print(f"epoch {epoch}")
        print(f"train_loss {train_loss}")
        print(f"val_loss {val_loss}")
        print(f"val_score {val_score}")
        print(f"best_score {best_score}")
        print("***************************************")
    model.load_state_dict(torch.load(model_path))

cv_score = corr_metric(y, oof)
print(cv_score)

fold: 0


  0%|          | 0/10 [00:00<?, ?it/s]

	addcmul_(Number value, Tensor tensor1, Tensor tensor2)
Consider using one of the following signatures instead:
	addcmul_(Tensor tensor1, Tensor tensor2, *, Number value) (Triggered internally at  /usr/local/src/pytorch/torch/csrc/utils/python_arg_parser.cpp:1025.)
  exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)


epoch 0
train_loss -0.0095979242854378
val_loss -0.024616821818120357
val_score 0.024295377403437488
best_score 0.024295377403437488
***************************************
epoch 1
train_loss -0.031040421483136904
val_loss -0.03713430004808695
val_score 0.04236545247976503
best_score 0.04236545247976503
***************************************
epoch 2
train_loss -0.042827530654653695
val_loss -0.04953923125317797
val_score 0.05761638228986366
best_score 0.05761638228986366
***************************************
epoch 3
train_loss -0.05494587890329346
val_loss -0.05384623700142948
val_score 0.06052104064952357
best_score 0.06052104064952357
***************************************
epoch 4
train_loss -0.06017173481712025
val_loss -0.0507497262822069
val_score 0.06700044418704965
best_score 0.06700044418704965
***************************************
epoch 5
train_loss -0.06771268759804187
val_loss -0.05671807538993149
val_score 0.06619823338333394
best_score 0.06700044418704965
***********

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0
train_loss -0.013952853519771347
val_loss -0.033096833157092706
val_score 0.03219820443712779
best_score 0.03219820443712779
***************************************
epoch 1
train_loss -0.03515472811821435
val_loss -0.03194087541268036
val_score 0.04421963830920167
best_score 0.04421963830920167
***************************************
epoch 2
train_loss -0.04559441931225409
val_loss -0.04574335133555722
val_score 0.05233805538620182
best_score 0.05233805538620182
***************************************
epoch 3
train_loss -0.054228797635051125
val_loss -0.0536048407293668
val_score 0.06686732360504972
best_score 0.06686732360504972
***************************************
epoch 4
train_loss -0.061505582213796005
val_loss -0.057436306999842014
val_score 0.07314032463840756
best_score 0.07314032463840756
***************************************
epoch 5
train_loss -0.0696675685632478
val_loss -0.05811928883191787
val_score 0.07487323488909509
best_score 0.07487323488909509
***********

  0%|          | 0/10 [00:00<?, ?it/s]

epoch 0
train_loss -0.01246610851484223
val_loss -0.03112404441602569
val_score 0.03497705495511551
best_score 0.03497705495511551
***************************************
epoch 1
train_loss -0.03635235929888389
val_loss -0.03462866421903896
val_score 0.04784166051836377
best_score 0.04784166051836377
***************************************
epoch 2
train_loss -0.04512244218496704
val_loss -0.04639056418432649
val_score 0.05858172509235797
best_score 0.05858172509235797
***************************************
epoch 3
train_loss -0.0569938071881243
val_loss -0.04949289650472971
val_score 0.06043060002612944
best_score 0.06043060002612944
***************************************
epoch 4
train_loss -0.06340005871208218
val_loss -0.054713658895336144
val_score 0.06977355755099848
best_score 0.06977355755099848
***************************************
epoch 5
train_loss -0.07082378154403282
val_loss -0.055365825682665334
val_score 0.07038015453226418
best_score 0.07038015453226418
*************

In [14]:
np.save("oof",oof)