In [1]:
# library

import os
import re
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, SGD

## **Build Dataset**

In [2]:
# load base dataset

de_train = pd.read_parquet("/home/aiuser/taeuk/open-problems-single-cell-perturbations/de_train.parquet")
id_map = pd.read_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/id_map.csv")
submission = pd.read_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/sample_submission.csv")

In [3]:
gene_means = de_train.iloc[:, 5:].mean(axis=0).values
gene_stds = de_train.iloc[:, 5:].std(axis=0).values

print("means :", gene_means[:5])
print(" stds :", gene_stds[:5])

means : [0.36676917 0.25095126 0.1482586  0.15573331 0.68427495]
 stds : [1.63469675 1.18865995 2.27540959 2.18409453 3.18920633]


In [4]:
from scipy.stats import shapiro

alpha = 0.05
is_normals = []
for i in tqdm(range(5, de_train.shape[1])):
    _, p_value = shapiro(de_train.iloc[:, 5])
    is_normals.append(int(p_value >= alpha))
sum(is_normals) / len(is_normals) # 정규성은 없는듯

  0%|          | 0/18211 [00:00<?, ?it/s]

0.0

In [5]:
# cell & compound dictionary

cell_type_de_train = sorted(de_train.cell_type.unique())
sm_name_de_train = sorted(de_train.sm_name.unique())

cell_type_dict = {cell_type_de_train[i]:i for i in range(len(cell_type_de_train))}
sm_name_dict = {sm_name_de_train[i]:i for i in range(len(sm_name_de_train))}

In [3]:
# compound decomposition 

# smiles = list(de_train.SMILES.unique())
# voc = []

# r = re.compile(".")
# for sm in smiles:
#     voc += list(set(r.findall(sm)))
# voc = list(set(voc))
# voc.sort()

# smile = pd.DataFrame(np.zeros((len(smiles), len(voc))).astype(int))
# smile.columns = voc
# for i in range(smile.shape[0]):
#     for ele in r.findall(smiles[i]):
#         smile[ele][i] += 1
# smile = pd.DataFrame(smiles).join(smile)
# smile.columns = ["SMILES"] + list(smile.columns[1:])
# sm_id = de_train.iloc[:, 1:4].drop("sm_lincs_id", axis=1).drop_duplicates()
# smile = sm_id.merge(smile, how="left", on="SMILES")
# smile.drop("SMILES", axis=1, inplace=True)
# smile.to_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/smile.csv", header=True, index=False)
smile = pd.read_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/smile.csv")
display(de_train)
smile.head()


Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.104720,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.884380,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.704780,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.213550,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.224700,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,T regulatory cells,Atorvastatin,LSM-5771,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...,False,-0.014372,-0.122464,-0.456366,-0.147894,-0.545382,...,-0.549987,-2.200925,0.359806,1.073983,0.356939,-0.029603,-0.528817,0.105138,0.491015,-0.979951
610,NK cells,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,-0.455549,0.188181,0.595734,-0.100299,0.786192,...,-1.236905,0.003854,-0.197569,-0.175307,0.101391,1.028394,0.034144,-0.231642,1.023994,-0.064760
611,T cells CD4+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.338168,-0.109079,0.270182,-0.436586,-0.069476,...,0.077579,-1.101637,0.457201,0.535184,-0.198404,-0.005004,0.552810,-0.209077,0.389751,-0.337082
612,T cells CD8+,Riociguat,LSM-45758,COC(=O)N(C)c1c(N)nc(-c2nn(Cc3ccccc3F)c3ncccc23...,False,0.101138,-0.409724,-0.606292,-0.071300,-0.001789,...,0.005951,-0.893093,-1.003029,-0.080367,-0.076604,0.024849,0.012862,-0.029684,0.005506,-1.733112


Unnamed: 0,sm_name,#,(,),+,-,/,1,2,3,...,S,[,\,],c,l,n,o,r,s
0,Clotrimazole,0,2,2,0,0,0,8,0,0,...,0,0,0,0,21,1,2,0,0,0
1,Mometasone Furoate,0,8,8,0,0,0,4,2,2,...,0,8,0,8,4,2,0,1,0,0
2,Idelalisib,0,3,3,0,1,0,6,4,0,...,0,2,0,2,19,0,6,0,0,0
3,Vandetanib,0,3,3,0,0,0,4,2,2,...,0,0,0,0,14,0,2,0,1,0
4,Bosutinib,1,6,6,0,0,0,2,2,2,...,0,0,0,0,15,2,1,0,0,0


In [7]:
# correlation

# gene_names = list(de_train.columns[5:])
# de_melt = de_train.melt(id_vars=["sm_name", "cell_type"], value_vars=gene_names)

# de_pivot_cell = de_melt.pivot(index=["sm_name","variable"], columns="cell_type", values="value")
# de_pivot_comp = de_melt.pivot(index=["cell_type","variable"], columns="sm_name", values="value")
# de_corr_cell = de_pivot_cell.corr()
# de_corr_comp = de_pivot_comp.corr()

# de_corr_cell.to_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/de_corr_cell.csv", header=True, index=True)
# de_corr_comp.to_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/de_corr_comp.csv", header=True, index=True)

de_corr_cell = pd.read_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/de_corr_cell.csv", header=0, index_col=0)
de_corr_comp = pd.read_csv("/home/aiuser/taeuk/open-problems-single-cell-perturbations/de_corr_comp.csv", header=0, index_col=0)

display(de_corr_cell.head())
display(de_corr_comp.head())

Unnamed: 0_level_0,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells
cell_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
B cells,1.0,0.647261,0.788213,0.485822,-0.138323,0.066281
Myeloid cells,0.647261,1.0,0.609587,0.323018,-0.194258,0.191862
NK cells,0.788213,0.609587,1.0,0.512614,0.00334,0.238791
T cells CD4+,0.485822,0.323018,0.512614,1.0,0.302101,0.123189
T cells CD8+,-0.138323,-0.194258,0.00334,0.302101,1.0,-0.075527


Unnamed: 0_level_0,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,ABT-199 (GDC-0199),ABT737,AMD-070 (hydrochloride),AT 7867,AT13387,AVL-292,AZ628,AZD-8330,AZD3514,...,Tivozanib,Topotecan,Tosedostat,Trametinib,UNII-BXU45ZH6LI,Vandetanib,Vanoxerine,Vardenafil,Vorinostat,YK 4-279
sm_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine,1.0,-0.188602,0.038083,0.190366,0.36348,0.560509,0.217774,0.269561,0.289016,0.074848,...,-0.153586,-0.13505,0.340408,0.404999,0.003311,-0.116056,0.15422,-0.07097,0.281518,-0.028914
ABT-199 (GDC-0199),-0.188602,1.0,0.284557,0.004126,-0.163023,-0.177109,-0.004786,-0.120617,-0.139353,0.067591,...,0.099148,0.093487,-0.168061,0.001327,0.173189,-0.12612,-0.04665,0.100247,-0.281967,0.206045
ABT737,0.038083,0.284557,1.0,-0.063961,0.071608,0.061823,-0.102238,0.186619,0.15849,0.267674,...,0.025967,0.000807,0.081146,0.072904,0.642046,-0.083797,0.192162,-0.0444,-0.01619,0.13245
AMD-070 (hydrochloride),0.190366,0.004126,-0.063961,1.0,0.021369,0.177195,0.163277,-0.026261,-0.00519,-0.077422,...,-0.010943,0.033453,0.027434,0.276627,-0.205314,-0.115135,-0.027755,0.094514,-0.020659,0.180129
AT 7867,0.36348,-0.163023,0.071608,0.021369,1.0,0.294058,0.028044,0.284242,0.289464,0.134293,...,-0.068767,-0.072275,0.307853,0.096939,0.162107,0.121511,0.235875,-0.09762,0.306202,-0.095428


In [8]:
# build custom dataset

class SCPpretrain(Dataset):
    def __init__(self, dataset, smile, cell_type_dict,
                 de_corr_cell, de_corr_comp):
        super(SCPpretrain, self).__init__()
        self.x = dataset.iloc[:, :2]
        self.smile = smile
        self.cell_type_dict = cell_type_dict
        self.de_corr_cell = de_corr_cell
        self.de_corr_comp = de_corr_comp
        
    def __getitem__(self, idx):
        cell, name = self.x.iloc[idx]
        x_cell = self.cell_type_dict[cell]
        
        ele_val = self.smile.loc[self.smile.sm_name==name, :].values[0]
        ele_bool = ele_val != 0
        ele_idx = [i if ele_bool[i]==True else 0 for i in np.arange(ele_val.shape[0])]
        x = [x_cell] + list(ele_idx[1:]) + list(ele_val[1:])
        
        y_cell = de_corr_cell.loc[cell].values
        y_comp = de_corr_comp.loc[name].values
        y = list(y_cell) + list(y_comp)
        
        return torch.tensor(x, dtype=torch.int64),\
               torch.tensor(y, dtype=torch.float32)
    
    def __len__(self):
        return self.x.shape[0]
        
        
class SCPset(Dataset):
    def __init__(self, dataset, smile, cell_type_dict, normalize):
        super(SCPset, self).__init__()
        if dataset is None:
            self.x = id_map.iloc[:, 1:]
            self.y = None
        else:
            self.x = dataset.iloc[:, :2]
            self.y = dataset.iloc[:, 5:]
            if normalize:
                self.means = self.y.mean(axis=0).values
                self.stds = self.y.std(axis=0).values
                #
                #
                self.y = (self.y - self.means)/self.stds
        self.smile = smile
        self.cell_type_dict = cell_type_dict
   
    def __getitem__(self, idx):
        cell, name = self.x.iloc[idx]
        x_cell = self.cell_type_dict[cell]
        
        ele_val = self.smile.loc[self.smile.sm_name==name, :].values[0]
        ele_bool = ele_val != 0
        ele_idx = [i if ele_bool[i]==True else 0 for i in np.arange(ele_val.shape[0])]
        x = [x_cell] + list(ele_idx[1:]) + list(ele_val[1:])
        
        if self.y is None:
            return torch.tensor(x, dtype=torch.int64)
        else:
            y = self.y.iloc[idx, :]
            return torch.tensor(x, dtype=torch.int64),\
                    torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return self.x.shape[0] 

In [9]:
# dataset check

datap = DataLoader(SCPpretrain(de_train, smile, cell_type_dict,
                 de_corr_cell, de_corr_comp), 128, False)
xp, yp = next(iter(datap))
print(xp.shape)
print(yp.shape)

data = DataLoader(SCPset(de_train, smile, cell_type_dict, True), 128, False)
x, y = next(iter(data))
print(x.shape)
print(y.shape)

torch.Size([128, 65])
torch.Size([128, 152])
torch.Size([128, 65])
torch.Size([128, 18211])


  torch.tensor(y, dtype=torch.float32)


## **Build Model**

In [10]:
# Define Model

class FinalLayer(nn.Module):
    def __init__(self, dim_embed, device):
        super(FinalLayer, self).__init__()
        self.fc_v1 = nn.Linear(dim_embed, 18211, device=device)
        self.fc_w1 = nn.Linear(dim_embed, 18211, device=device)
        self.fc_v2 = nn.Linear(dim_embed, 18211, device=device)
        self.fc_w2 = nn.Linear(dim_embed, 18211, device=device)
    def forward(self, x):
        v1 = self.fc_v1(x).tanh()
        v2 = self.fc_v2(x).tanh()
        w1 = self.fc_w1(x).sigmoid()
        w2 = self.fc_w2(x).sigmoid()
        return v1*w1 + v2*w2
    
class InterLayer(nn.Module):
    def __init__(self, dim_embed, num_layers, drop_prob, device):
        super(InterLayer, self).__init__()
        self.main_encoder = nn.Sequential(
            *[nn.Sequential(
                nn.Linear(dim_embed*2*i, dim_embed*2*(i+1), device=device),
                nn.BatchNorm1d(dim_embed*2*(i+1), device=device),
                nn.ReLU(),
                nn.Dropout(drop_prob)
            ) for i in range(1, num_layers+1)] )
        
        self.mu = nn.Linear(dim_embed*2*(num_layers+1),
                            dim_embed*2*(num_layers+1), device=device)
        self.var = nn.Linear(dim_embed*2*(num_layers+1),
                             dim_embed*2*(num_layers+1), device=device)
        
    def forward(self, x):
        xout = self.main_encoder(x)
        mu = self.mu(xout)
        logvar = self.var(xout)
        
        std = torch.exp(logvar / 2)
        eps = torch.randn_like(std)
        z = mu + eps * std
        return xout, z, mu, logvar
        
class SCP2model(nn.Module):
    def __init__(self, device, dim_embed, num_layers, drop_prob, log, pretrain):
        super(SCP2model, self).__init__()
        self.embed_cell = nn.Embedding(6, dim_embed, device=device)
        self.embed_comp = nn.Embedding(33, dim_embed, padding_idx=0, device=device)
        self.conv_comp = nn.Conv1d(32, 1, 3, 1, 1, bias=True, device=device)
        self.inter_encoder = InterLayer(dim_embed, num_layers, drop_prob, device=device)
        self.final_layer = FinalLayer(dim_embed*2*(num_layers+1), device=device)
        self.log = log
        self.pretrain = pretrain
        
    def calc_cell_vector(self, cell_idx):
        return self.embed_cell(cell_idx)
      
    def calc_comp_vector(self, comp_idx, comp_val):
        comp_val = (comp_val + 1).log() if self.log else comp_val
        comp = self.embed_comp(comp_idx)
        comp = comp * comp_val.unsqueeze(2)
        comp = self.conv_comp(comp).squeeze()
        return comp
    
    def forward(self, x):
        hs = x.shape[1]-1
        cell = self.calc_cell_vector(x[:, 0])
        comp = self.calc_comp_vector(x[:, 1:hs//2+1], x[:, hs//2+1:])

        if self.pretrain:
            return cell, comp
        else:
            x_concat = torch.cat([cell, comp], dim=1)
            xout, z, mu, logvar = self.inter_encoder(x_concat)
            pred_x = self.final_layer(xout)
            pred_z = self.final_layer(z)
            return pred_x, pred_z, mu, logvar

In [11]:
# model check

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# dim_embed = 44
# log = True
# pretrain = False

# model = SCP2model(device, dim_embed, log, pretrain)
# pred = model(x.to(device))
# pred.shape
# emb = model.embed_cell(torch.arange(6, device=device))
# sim = cell @ emb.T
# sim[:5]

## **Define Functions**

In [12]:
# Define util function

def MRRMSE(pred, y, means=None, stds=None):
      pred = pred.detach().cpu().numpy()
      y = y.detach().cpu().numpy()
      if means is not None:
            pred = pred * stds + means
            y = y * stds + means
      return np.sqrt(np.square(y - pred).mean(axis=1)).mean()    

def mrrmse_loss(pred, y):
      return torch.sqrt(torch.square(pred - y).mean(dim=1)).mean()

def compose_loss(pred, y):
      return mrrmse_loss(pred, y) + F.smooth_l1_loss(pred, y)

def fix_random_seed(seed):
    #random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

In [13]:
# Define pre-training function

def pretrain_model(device, dim_embed, num_layers, drop_prob, log, pretrain, optimizer, 
                   criterion, learning_rate, weight, num_epochs, batch_size, verbose):
      # train loader
      train_loader = DataLoader(SCPpretrain(de_train, smile, cell_type_dict, de_corr_cell, de_corr_comp),
                                batch_size=batch_size, shuffle=True)
      
      # model & optimizer & criterion
      model = SCP2model(device, dim_embed, num_layers, drop_prob, log, pretrain)
      
      if optimizer.lower()=="sgd":
            optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.0)
      elif optimizer.lower()=='adam':
            optimizer = Adam(model.parameters(), lr=learning_rate)
      else:
            print("Wrong optimizer!")
            return
      
      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
      
      if criterion.lower()=="l1":
            criterion = nn.SmoothL1Loss(0.5)
      elif criterion.lower()=="l2":
            criterion = nn.MSELoss(reduce="sum")
      elif criterion.lower()=="MRRMSE":
            criterion = mrrmse_loss
      elif criterion.lower() == "compose":
            criterion = compose_loss
      else:
            print("Wrong criterion!")
            return
      
      # fixed values 
      ele_vals = []
      ele_idxs = []
      for i in range(smile.shape[0]):
            ele_val = smile.iloc[i, :].values
            ele_bool = ele_val != 0
            ele_idx = [i if ele_bool[i]==True else 0 for i in np.arange(ele_val.shape[0])]
            ele_vals.append(list(ele_val[1:]))
            ele_idxs.append(list(ele_idx[1:]))
      fixed_comp_vals = torch.tensor(ele_vals, device=device)
      fixed_comp_idxs = torch.tensor(ele_idxs, device=device)
      fixed_cells = torch.arange(6, device=device)
      
      model.train()
      for epoch in tqdm(range(1, num_epochs+1)):
            
            train_loss = 0.
            for x, y in train_loader:
                  x, y = x.to(device), y.to(device)
                  optimizer.zero_grad()
                  
                  x_cell, x_comp = model(x)
                  cells = model.calc_cell_vector(fixed_cells)
                  comps = model.calc_comp_vector(fixed_comp_idxs, fixed_comp_vals)
                  sims = torch.cat([x_cell @ cells.T,
                                    x_comp @ comps.T], dim=1)
                  if weight is not None:
                        B_idx = (x[:, 0] == 0) | (x[:, 0] == 1)
                        loss_B = criterion(sims[B_idx], y[B_idx])
                        loss_T = criterion(sims[~B_idx], y[~B_idx])
                        loss = loss_B * weight + loss_T * (1-weight)
                  else:
                        loss = criterion(sims, y)
                  loss.backward()
                  optimizer.step()
                  
                  train_loss += loss.item()
                  
            scheduler.step()      
            # print loss per epoch
            if verbose:
                  print("[Epoch : %3d] [Loss : %.6f ]"%(epoch, train_loss/len(train_loader)))
                              
      return model

# Define training function

def train_model(device, train_size, normalize, model, dim_embed, num_layers, drop_prob, log, pretrain, 
                optimizer, criterion, learning_rate, weight, num_epochs, batch_size, verbose):
      # train & valid loader
      train_idx = np.random.choice(de_train.shape[0], int(de_train.shape[0]*train_size), replace=False)
      train_loader = DataLoader(SCPset(de_train.iloc[train_idx, :], smile, cell_type_dict, normalize),
                                batch_size=batch_size, shuffle=True)
      valid_loader = [1]
      if train_size < 1.:
            valid_idx = list(set(np.arange(de_train.shape[0])) - set(train_idx))
            valid_loader = DataLoader(SCPset(de_train.iloc[valid_idx, :], smile, cell_type_dict, normalize),
                                      batch_size=batch_size, shuffle=True)
      # model & optimizer & criterion
      if model is None:
            model = SCP2model(device, dim_embed, num_layers, drop_prob, log, pretrain)  
      if optimizer.lower()=="sgd":
            optimizer = SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.01)
      elif optimizer.lower()=='adam':
            optimizer = Adam(model.parameters(), lr=learning_rate)
      elif optimizer.lower()=="adamw":
            optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
      else:
            print("Wrong optimizer!")
            return
      
      scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_epochs)
      
      if criterion.lower()=="l1":
            criterion = nn.SmoothL1Loss(0.5)
      elif criterion.lower()=="l2":
            criterion = nn.MSELoss()
      elif criterion.lower()=="MRRMSE":
            criterion = mrrmse_loss
      elif criterion.lower() == "compose":
            criterion = compose_loss
      else:
            print("Wrong criterion!")
            return
      
      best_mrrmse = 10
      model.pretrain = False
      for epoch in tqdm(range(1, num_epochs+1)):
            train_loss = 0.
            valid_loss = 0.
            train_mrrmse = 0.
            valid_mrrmse = 0.
            unnormal_mrrmse = 0.
            
            model.train()
            for x, y in train_loader:
                  x, y = x.to(device), y.to(device)
                  optimizer.zero_grad()
                  pred_x, pred_z, mu, logvar = model(x)
                  # if verbose and F.mse_loss(pred[0,:], pred[1,:]).item() < 0.0001:
                  #       print("trivial solution !, mse of prediction is %.10f"%F.mse_loss(pred[0,:], pred[1,:]).item())
                  loss_x = criterion(pred_x, y)
                  loss_z = criterion(pred_z, y)
                  loss = loss_x + loss_z
                  loss.backward()
                  optimizer.step()
                  
                  train_loss += loss.item()
                  train_mrrmse += MRRMSE(pred_x, y)
            if train_size < 1.:
                  model.eval()
                  for x, y in valid_loader:
                        x, y = x.to(device), y.to(device)
                        with torch.no_grad():
                              pred, _, _, _ = model(x)
                              loss = criterion(pred, y)
                        
                        valid_loss += loss.item()
                        valid_mrrmse += MRRMSE(pred, y)
                        if normalize:
                              unnormal_mrrmse += MRRMSE(pred, y, valid_loader.dataset.means,
                                                        valid_loader.dataset.stds)
            scheduler.step()      
            # print loss per epoch
            if verbose:
                  print("[Epoch : %3d] [Loss : %.4f / %.4f] [MRRMSE : %.3f / %.3f / %.3f]"%(
                        epoch, train_loss/len(train_loader), valid_loss/len(valid_loader),
                        train_mrrmse/len(train_loader), valid_mrrmse/len(valid_loader), unnormal_mrrmse/len(valid_loader)))
            if train_size < 1. and best_mrrmse > valid_mrrmse/len(valid_loader):
                  best_mrrmse = valid_mrrmse/len(valid_loader)
            
      return model, best_mrrmse

# Define inferring function

def infer_model(device, model, normalize):
      gene_means = de_train.iloc[:, 5:].mean(axis=0).values
      gene_stds = de_train.iloc[:, 5:].std(axis=0).values
      test_loader = DataLoader(SCPset(None, smile, cell_type_dict, normalize),
                               batch_size=255, shuffle=False)
      model.eval()
      for x in test_loader:
            x = x.to(device)
            with torch.no_grad():
                  pred,_,_,_ = model(x)
                  pred = pd.DataFrame(pred.detach().cpu().numpy())
      if normalize:
            pred = pred * gene_stds + gene_means
      return pred

## **Parameter Tuning**

In [19]:
# training

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      
# parameters

fix_random_seed(231155)

dim_embed = 128
num_layers = 10
log = True
pretrain = True
drop_prob = 0.3
train_size = 0.65

normalize = True
optimizer = "SGD"
criterion = "Compose"
learning_rate = 0.025
weight = None
num_epochs = 100
batch_size = 32

optimizer_pre = "Adam" # Adam fix !
criterion_pre = "L2"
learning_rate_pre = 0.02
weight_pre = None
num_epochs_pre = 200
batch_size_pre = 128

model_pretrained = pretrain_model(device, dim_embed, num_layers, drop_prob, log, pretrain, optimizer_pre, 
                                  criterion_pre, learning_rate_pre, weight_pre, num_epochs_pre, batch_size_pre, True)

model, bestmrrmse = train_model(device, train_size, normalize, model_pretrained, dim_embed, num_layers, drop_prob, log, pretrain, 
                                optimizer, criterion, learning_rate, weight, num_epochs, batch_size, True)
# pretrain : N | normal : Y  MRRMSE : 1.491
# pretrain : Y | normal : Y  MRRMSE : 1.334
# pretrain : N | normal : N  MRRMSE : 1.451
# pretrain : Y | normal : N  MRRMSE : 1.338



  0%|          | 0/200 [00:00<?, ?it/s]

[Epoch :   1] [Loss : 419.039014 ]
[Epoch :   2] [Loss : 139.513788 ]
[Epoch :   3] [Loss : 92.183400 ]
[Epoch :   4] [Loss : 54.513024 ]
[Epoch :   5] [Loss : 36.471241 ]
[Epoch :   6] [Loss : 26.783702 ]
[Epoch :   7] [Loss : 20.093834 ]
[Epoch :   8] [Loss : 15.289434 ]
[Epoch :   9] [Loss : 11.930021 ]
[Epoch :  10] [Loss : 9.580056 ]
[Epoch :  11] [Loss : 7.878072 ]
[Epoch :  12] [Loss : 6.595677 ]
[Epoch :  13] [Loss : 5.606435 ]
[Epoch :  14] [Loss : 4.826329 ]
[Epoch :  15] [Loss : 4.196658 ]
[Epoch :  16] [Loss : 3.684660 ]
[Epoch :  17] [Loss : 3.258793 ]
[Epoch :  18] [Loss : 2.901282 ]
[Epoch :  19] [Loss : 2.599534 ]
[Epoch :  20] [Loss : 2.340067 ]
[Epoch :  21] [Loss : 2.117100 ]
[Epoch :  22] [Loss : 1.922408 ]
[Epoch :  23] [Loss : 1.753173 ]
[Epoch :  24] [Loss : 1.603361 ]
[Epoch :  25] [Loss : 1.470900 ]
[Epoch :  26] [Loss : 1.353913 ]
[Epoch :  27] [Loss : 1.248714 ]
[Epoch :  28] [Loss : 1.155594 ]
[Epoch :  29] [Loss : 1.071221 ]
[Epoch :  30] [Loss : 0.996100 ]

  0%|          | 0/100 [00:00<?, ?it/s]

  torch.tensor(y, dtype=torch.float32)


[Epoch :   1] [Loss : 2.2533 / 0.8802] [MRRMSE : 0.803 / 0.660 / 1.332]
[Epoch :   2] [Loss : 2.2309 / 0.8876] [MRRMSE : 0.797 / 0.665 / 1.348]
[Epoch :   3] [Loss : 2.2097 / 0.8859] [MRRMSE : 0.790 / 0.664 / 1.345]
[Epoch :   4] [Loss : 2.1716 / 0.8999] [MRRMSE : 0.778 / 0.672 / 1.372]
[Epoch :   5] [Loss : 2.1489 / 0.9009] [MRRMSE : 0.772 / 0.672 / 1.365]
[Epoch :   6] [Loss : 2.1150 / 0.8890] [MRRMSE : 0.761 / 0.664 / 1.344]
[Epoch :   7] [Loss : 2.1313 / 0.8910] [MRRMSE : 0.765 / 0.665 / 1.348]
[Epoch :   8] [Loss : 2.1030 / 0.8924] [MRRMSE : 0.757 / 0.666 / 1.349]
[Epoch :   9] [Loss : 2.1167 / 0.8781] [MRRMSE : 0.761 / 0.658 / 1.313]
[Epoch :  10] [Loss : 2.0934 / 0.8700] [MRRMSE : 0.753 / 0.653 / 1.299]
[Epoch :  11] [Loss : 2.0469 / 0.8833] [MRRMSE : 0.739 / 0.660 / 1.323]
[Epoch :  12] [Loss : 2.0765 / 0.8837] [MRRMSE : 0.749 / 0.661 / 1.320]
[Epoch :  13] [Loss : 2.0595 / 0.8900] [MRRMSE : 0.745 / 0.663 / 1.320]
[Epoch :  14] [Loss : 2.0457 / 0.8718] [MRRMSE : 0.740 / 0.653 /

In [20]:
pred = infer_model(device, model, normalize)

df = pred.reset_index()
df.columns = submission.columns
display(df.head())

# save

title = f"scp2_Dim{dim_embed}_Lay{num_layers}_Nor{normalize}_Log{log}_Opt{optimizer}_Cri{criterion}_lr{learning_rate}_B{batch_size}_E{num_epochs}"
#df.to_csv("/home/aiuser/taeuk/%s.csv"%title, header=True, index=False)

Unnamed: 0,id,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,0,0.296504,0.211665,0.124898,0.135206,0.563002,0.761984,-0.013693,0.302938,-0.009029,...,0.05647,0.123769,-0.021091,0.218706,0.433305,0.326586,0.214578,0.147317,-0.136896,-0.073958
1,1,0.329205,0.2289,0.142581,0.145403,0.603104,0.828216,-0.010491,0.320782,-0.001178,...,0.059914,0.138882,-0.009893,0.233539,0.453226,0.352735,0.227589,0.15449,-0.145921,-0.071046
2,2,0.369808,0.255562,0.148743,0.154635,0.681103,0.938575,-0.000567,0.347575,0.010046,...,0.0687,0.157901,0.015156,0.266242,0.500297,0.383189,0.242517,0.160406,-0.162571,-0.069888
3,3,0.16265,0.128836,0.076603,0.09057,0.374698,0.47424,-0.033362,0.221444,-0.044989,...,0.024654,0.072491,-0.079114,0.131992,0.318572,0.228841,0.177377,0.121536,-0.086626,-0.084828
4,4,0.368679,0.253313,0.150887,0.153933,0.677635,0.93419,-0.001269,0.347952,0.010284,...,0.069004,0.157996,0.016334,0.264738,0.497681,0.381771,0.24192,0.159662,-0.161785,-0.069368


In [21]:
# submission

# model_pretrained = pretrain_model(device, dim_embed, num_layers, drop_prob, log, pretrain, optimizer_pre, 
#                                   criterion_pre, learning_rate_pre, weight_pre, num_epochs_pre, batch_size_pre, True)
# submit, _ = train_model(device, train_size, normalize, model_pretrained, dim_embed, num_layers, drop_prob, log, pretrain, 
#                                 optimizer, criterion, learning_rate, weight, num_epochs, batch_size, True)
# pred = infer_model(device, submit, normalize)

# df_submit = pred.reset_index()
# df_submit.columns = submission.columns
# display(df_submit.head())

# save

# title = f"scp2_Dim{dim_embed}_Lay{num_layers}_Nor{normalize}_Log{log}_Opt{optimizer}_Cri{criterion}_lr{learning_rate}_B{batch_size}_E{num_epochs}"
# df_submit.to_csv("/home/aiuser/taeuk/%s.csv"%title, header=True, index=False)