In [1]:
import pandas as pd

In [2]:
# (cell_type, sm_name, gene, d0_val, d1_val, d2_val) -> (de_val)

In [6]:
# DE labels
kaggle_train_de_df = pd.read_parquet('data/de_train.parquet')
genes = sorted(list(set(kaggle_train_de_df.columns.tolist()) - set(["cell_type", "sm_name", "sm_lincs_id", "SMILES", "control"])))
del kaggle_train_de_df

In [7]:
ddde = pd.read_parquet('data/ddde.parquet')

In [8]:
cell_types = sorted(ddde.cell_type.unique().tolist())
sm_names = sorted(ddde.sm_name.unique().tolist())

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

gene_count = len(genes)

class DDDEDataset(Dataset):
    def __init__(self, dfr):
        self.data = []
        self.labels = []
        cols = dfr.columns.tolist()
        n2i = {name: i for i, name in enumerate(cols)}
        for d in dfr.to_records(index=False):
            ci = cell_types.index(d[n2i["cell_type"]])
            si = sm_names.index(d[n2i["sm_name"]])
            for gi, g in enumerate(genes):
                self.data.append((ci, si, gi, d[n2i["%s_d2" % g]]))
                self.labels.append(d[n2i[g + "_de"]])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ci, si, gi, exp = self.data[idx]
        sample = {'cell_type': ci, 'sm_name': si, 'gene': gi, 'exp': exp, 'label': self.labels[idx]}
        return sample


In [14]:

# Create an instance of the custom dataset
custom_dataset = DDDEDataset(ddde)


In [15]:
batch_size = 1024
shuffle = True
data_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=8)

In [17]:
# for batch in data_loader:
#     print(batch)
#     break

In [49]:
import torch
import torch.nn as nn
import torch.optim as optim

class DENet(nn.Module):
    def __init__(self, cell_type_size, sm_name_size, gene_size, cell_type_dim=16, sm_name_dim=32, gene_dim=64, exp_dim=16, hidden_size=256):
        super(DENet, self).__init__()

        self.cell_type_embedding = nn.Embedding(cell_type_size, cell_type_dim)
        self.sm_name_embedding = nn.Embedding(sm_name_size, sm_name_dim)
        self.gene_embedding = nn.Embedding(gene_size, gene_dim)

        # Fully connected layers for the continuous input
        self.fc_cont = nn.Linear(1, exp_dim)

        # Fully connected layers
        self.fc1 = nn.Linear(cell_type_dim + sm_name_dim + gene_dim + exp_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, cell_type, sm_name, gene, exp):
        # Embedding categorical variable
        x_cell_type = self.cell_type_embedding(cell_type)
        x_sm_name = self.sm_name_embedding(sm_name)
        x_gene = self.gene_embedding(gene)

        # Apply fully connected layer to continuous input
        x_cont = torch.relu(self.fc_cont(exp.unsqueeze(dim=1)))

        # Concatenate embeddings with continuous input
        x = torch.cat([x_cell_type, x_sm_name, x_gene, x_cont], dim=1)

        # Feedforward layers
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)

        return x

In [50]:
from torch.nn.parallel import DataParallel

net = DENet(cell_type_size=len(cell_types), sm_name_size=len(sm_names), gene_size=len(genes))
net = DataParallel(net).cuda()

In [51]:
net

DataParallel(
  (module): DENet(
    (cell_type_embedding): Embedding(6, 16)
    (sm_name_embedding): Embedding(146, 32)
    (gene_embedding): Embedding(18211, 64)
    (fc_cont): Linear(in_features=1, out_features=16, bias=True)
    (fc1): Linear(in_features=128, out_features=256, bias=True)
    (fc2): Linear(in_features=256, out_features=1, bias=True)
  )
)

In [52]:
import tqdm as tq

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [55]:
# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    net.train()  # Set the model to training mode
    for batch in tq.tqdm(data_loader):
        optimizer.zero_grad()  # Zero the gradients
        cell_type = batch['cell_type'].cuda()
        sm_name = batch['sm_name'].cuda()
        gene = batch['gene'].cuda()
        exp = batch['exp'].to(torch.float32).cuda()
        targets = batch['label'].to(torch.float32).cuda()
        
        outputs = net(cell_type, sm_name, gene, exp)
        loss = criterion(outputs.squeeze(), targets)
        loss.backward()  # Backward pass
        optimizer.step()  # Update weights
        
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}')

100%|██████████| 10920/10920 [02:21<00:00, 77.09it/s] 


Epoch 1/3, Loss: 2.217395782470703


100%|██████████| 10920/10920 [02:30<00:00, 72.41it/s] 


Epoch 2/3, Loss: 0.6326473951339722


100%|██████████| 10920/10920 [02:00<00:00, 90.29it/s] 

Epoch 3/3, Loss: 0.6056039333343506





In [23]:
torch.save(net.state_dict(), f"models/dnet_epoch_{epoch + 1}.pt")

In [40]:
ddd_test = pd.read_parquet('data/ddd_test.parquet')

In [41]:
ddd_test

Unnamed: 0,index_d0,cell_type,sm_name,CRYBG1_d0,SH3BP4_d0,ANTXRLP1_d0,HIST1H2AJ_d0,EZR_d0,ALKBH7_d0,POM121_d0,...,KMT2B,SULF2,AC116158.2,GOSR2,PRNCR1,AL731568.1,RIMKLA,PIGC,FBXL18,AC010327.4
0,0,B cells,5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-...,0.000008,0.0,0.000000,0.0,0.000172,0.000193,0.000001,...,0.000000,0.000014,0.000000,0.000002,0.0,0.0,0.0,0.000000,0.0,0.0
1,1,B cells,ABT-199 (GDC-0199),0.000008,0.0,0.000000,0.0,0.000173,0.000197,0.000003,...,0.000002,0.000020,0.000000,0.000000,0.0,0.0,0.0,0.000003,0.0,0.0
2,2,B cells,ABT737,0.000014,0.0,0.000000,0.0,0.000172,0.000200,0.000004,...,0.000002,0.000012,0.000000,0.000003,0.0,0.0,0.0,0.000000,0.0,0.0
3,3,B cells,AMD-070 (hydrochloride),0.000010,0.0,0.000000,0.0,0.000167,0.000183,0.000003,...,0.000002,0.000017,0.000000,0.000000,0.0,0.0,0.0,0.000003,0.0,0.0
4,4,B cells,AT 7867,0.000010,0.0,0.000000,0.0,0.000166,0.000189,0.000003,...,0.000000,0.000023,0.000000,0.000001,0.0,0.0,0.0,0.000001,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,250,Myeloid cells,Vandetanib,0.000015,0.0,0.000000,0.0,0.000176,0.000195,0.000000,...,0.000000,0.000021,0.000000,0.000002,0.0,0.0,0.0,0.000002,0.0,0.0
251,251,Myeloid cells,Vanoxerine,0.000012,0.0,0.000001,0.0,0.000168,0.000201,0.000004,...,0.000005,0.000009,0.000000,0.000000,0.0,0.0,0.0,0.000005,0.0,0.0
252,252,Myeloid cells,Vardenafil,0.000004,0.0,0.000000,0.0,0.000160,0.000198,0.000000,...,0.000000,0.000012,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0
253,253,Myeloid cells,Vorinostat,0.000014,0.0,0.000000,0.0,0.000165,0.000194,0.000001,...,0.000000,0.000012,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0


In [55]:
class DDDTestDataset(Dataset):
    def __init__(self, dfr):
        self.data = []
        cols = dfr.columns.tolist()
        n2i = {name: i for i, name in enumerate(cols)}
        for d in dfr.to_records(index=False):
            try:
                ci = cell_types.index(d[n2i["cell_type"]])
                si = sm_names.index(d[n2i["sm_name"]])
                
                for gi, g in enumerate(genes):
                    self.data.append((ci, si, gi, d[n2i[g]]))
            except Exception as error:
                print(error)
                
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ci, si, gi, exp = self.data[idx]
        sample = {'cell_type': ci, 'sm_name': si, 'gene': gi, 'exp': exp}
        return sample


In [56]:
test_dataset = DDDTestDataset(ddd_test)

'CGP 60474' is not in list


In [58]:
batch_size = 128
shuffle = False
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=8)

In [69]:
net.eval()
with torch.no_grad(), open("de_test.tsv", "w") as f:
    val_loss = 0.0
    for batch in test_loader:
        cell_type = batch['cell_type'].cuda()
        sm_name = batch['sm_name'].cuda()
        gene = batch['gene'].cuda()
        exp = batch['exp'].to(torch.float32).cuda()
        
        outputs = net(cell_type, sm_name, gene, exp)

        for ci, si, gi, exp, de in zip(cell_type.cpu().tolist(), sm_name.cpu().tolist(), gene.cpu().tolist(), exp.cpu().tolist(), outputs.squeeze().cpu().tolist()):
            f.write("%s\n" % "\t".join([cell_types[ci], sm_names[si], genes[gi], str(exp[0]), str(exp[1]), str(exp[2]), str(de)]))
        

In [89]:
id_map = pd.read_csv('data/id_map.csv', delimiter=',')

In [90]:
sample_submission = pd.read_csv('data/sample_submission.csv', delimiter=',')

In [80]:
# sample_submission.columns.tolist()

In [105]:
de_test = pd.read_csv('de_test.tsv', delimiter='\t', names=["cell_type", "sm_name", "gene", "d0", "d1", "d2", "de"])

In [106]:
de_test.drop("d0", axis=1, inplace=True)
de_test.drop("d1", axis=1, inplace=True)
de_test.drop("d2", axis=1, inplace=True)

In [107]:
de_test = pd.merge(de_test, id_map, on=["cell_type", "sm_name"], how="inner")

In [108]:
de_test.drop("cell_type", axis=1, inplace=True)
de_test.drop("sm_name", axis=1, inplace=True)

In [109]:
de_test.reset_index(inplace=True)

In [110]:
de_test

Unnamed: 0,index,gene,de,id
0,0,A1BG,0.148877,0
1,1,A1BG-AS1,0.249989,0
2,2,A2M,0.098068,0
3,3,A2M-AS1,0.158804,0
4,4,A2MP1,0.007440,0
...,...,...,...,...
4625589,4625589,ZXDB,0.091786,254
4625590,4625590,ZXDC,0.113523,254
4625591,4625591,ZYG11B,-0.079239,254
4625592,4625592,ZYX,-0.188479,254


In [111]:
# Pivot the DataFrame
pivot_df = de_test.pivot(index=["id"], columns='gene', values='de')

In [116]:
pivot_df.reset_index(inplace=True)

In [120]:
pivot_df[sample_submission.columns.tolist()].to_csv('submission.csv', index=False)

In [121]:
!zip s.zip submission.csv

  adding: submission.csv (deflated 55%)
