In [27]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [28]:
# Read the precomputed training table (SMILES, source label and fp_* bit columns)
# from the parquet file kept relative to this notebook's directory.
training_data = pd.read_parquet(
    "../data/train/baseline_train_ecfp4.parquet"
)

In [29]:
# Bare last expression so the notebook renders the frame; pandas elides the
# middle rows/columns in the rendering, it does not print every cell.
training_data

Unnamed: 0,smiles,source,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,fp_7,...,fp_2038,fp_2039,fp_2040,fp_2041,fp_2042,fp_2043,fp_2044,fp_2045,fp_2046,fp_2047
0,CCCCCCC1OC(=O)C1C,PKS,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCC(=O)C(C)C(=O)O,PKS,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCC=C(C)C(=O)O,PKS,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCCCCCCC(C)C(=O)O,PKS,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CCCCC1OC(=O)CCC1C,PKS,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3236342,O=C1CC=CC(=O)OC(c2c[nH]c(Cl)c2Cl)(S(=O)(=O)Br)C1,chem,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3236343,CCC(O)(OC)C(C)C1OC(=O)C(C)CC1C,chem,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3236344,CC(=CC(C)CCCCCON)C=C(C)C(=O)O,chem,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3236345,CCC1C(=O)NC(=O)C(C)C(C(=O)O[N+](=O)[O-])CCC(C)...,chem,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
class fp_CNN_Encoder(nn.Module):
    """1-D CNN encoder for fixed-length fingerprint vectors.

    Pipeline (shapes for a batch of size B):
        input                     (B, 1, fp_dim)   or (B, fp_dim)
        conv stack                (B, c2, fp_dim)  two Conv1d(k=5, p=2) + ReLU
        global max pool over bits (B, c2)
        embedding layer           (B, embed_dim)   returned by ``encode``
        projection head           (B, proj_dim)    returned by ``forward``

    NOTE(review): the original class only created an empty ``nn.Sequential``
    and had no ``forward``. The conv stack mirrors the layers explored in the
    cells that follow; the global max pooling and the ReLU + Linear projection
    head are design choices made here to give ``embed_dim`` / ``proj_dim`` a
    meaning -- confirm they match the intended architecture.

    Args:
        fp_dim: Length of each fingerprint vector; inputs are validated against it.
        hidden_channels: ``(c1, c2)`` output channels of the two conv layers.
        embed_dim: Size of the embedding returned by ``encode``.
        proj_dim: Size of the projection returned by ``forward``.
    """

    def __init__(self, fp_dim = 2048, hidden_channels = (64, 128), embed_dim = 256, proj_dim = 120):
        super().__init__()
        c1, c2 = hidden_channels
        self.fp_dim = fp_dim

        # kernel_size=5 with padding=2 keeps the length axis at fp_dim.
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels=1, out_channels=c1, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(in_channels=c1, out_channels=c2, kernel_size=5, padding=2),
            nn.ReLU(),
        )
        # Collapse the length axis to one value per channel, so the linear
        # layers below do not depend on fp_dim.
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.embed = nn.Linear(c2, embed_dim)
        self.proj = nn.Sequential(
            nn.ReLU(),
            nn.Linear(embed_dim, proj_dim),
        )

    def _as_channel_batch(self, x):
        """Return ``x`` as (B, 1, fp_dim), adding the channel axis if it is missing."""
        if x.dim() == 2:
            x = x.unsqueeze(1)
        if x.dim() != 3 or x.shape[1] != 1 or x.shape[-1] != self.fp_dim:
            raise ValueError(
                f"expected input of shape (batch, {self.fp_dim}) or "
                f"(batch, 1, {self.fp_dim}), got {tuple(x.shape)}"
            )
        return x

    def encode(self, x):
        """Map fingerprints to embeddings of shape (B, embed_dim).

        Raises:
            ValueError: If ``x`` is not (B, fp_dim) or (B, 1, fp_dim).
        """
        features = self.conv(self._as_channel_batch(x))
        pooled = self.pool(features).squeeze(-1)  # (B, c2, 1) -> (B, c2)
        return self.embed(pooled)

    def forward(self, x):
        """Map fingerprints to projections of shape (B, proj_dim).

        Raises:
            ValueError: If ``x`` is not (B, fp_dim) or (B, 1, fp_dim).
        """
        return self.proj(self.encode(x))

In [None]:
# Gather the fingerprint bit columns (named "fp_<i>") and check the expected width.
fp_cols = [col for col in training_data.columns if col.startswith("fp_")]
assert len(fp_cols) == 2048

# Row labelled 0 as a float32 vector: shape (2048,)
sample_fp = (
    training_data.loc[0, fp_cols]
    .to_numpy()
    .astype(dtype=np.float32)
)

# Prepend batch and channel axes for Conv1d: shape (1, 1, 2048)
x = torch.from_numpy(sample_fp)[None, None, :]
x.shape  # (batch_size, in_channels, fp_dim)

torch.Size([1, 1, 2048])

In [None]:
# First convolution: kernel_size=5 with padding=2 leaves the length axis unchanged,
# so [batch_size, 1, fp_dim] becomes [batch_size, 64, fp_dim].
conv1 = nn.Conv1d(1, 64, kernel_size=5, padding=2)
y1 = conv1(x)
y1.shape

torch.Size([1, 64, 2048])

In [33]:
# Add non-linearity after the first conv layer; ReLU is element-wise,
# so the shape stays the same.
y1 = torch.relu(y1)
y1.shape

torch.Size([1, 64, 2048])

In [34]:
# Second convolution extracts further features from the 64 activated channels:
# [batch_size, 64, fp_dim] becomes [batch_size, 128, fp_dim].
conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
y2 = conv2(y1)
y2.shape

torch.Size([1, 128, 2048])