In [53]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [54]:
# Load the baseline training table. The path is relative, so this cell depends on the
# notebook's working directory. The displayed frame below shows a SMILES column, a
# `source` label, and fingerprint bit columns fp_0 ... fp_2047.
training_data = pd.read_parquet("../data/train/baseline_train_ecfp4.parquet")

In [55]:
# Display the frame; the recorded output is an abbreviated view (first/last rows, elided columns).
training_data 

Unnamed: 0,smiles,source,fp_0,fp_1,fp_2,fp_3,fp_4,fp_5,fp_6,fp_7,...,fp_2038,fp_2039,fp_2040,fp_2041,fp_2042,fp_2043,fp_2044,fp_2045,fp_2046,fp_2047
0,CCCCCCC1OC(=O)C1C,PKS,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCCC(=O)C(C)C(=O)O,PKS,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,CCCCCCC=C(C)C(=O)O,PKS,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CCCCCCCC(C)C(=O)O,PKS,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CCCCC1OC(=O)CCC1C,PKS,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3236342,O=C1CC=CC(=O)OC(c2c[nH]c(Cl)c2Cl)(S(=O)(=O)Br)C1,chem,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3236343,CCC(O)(OC)C(C)C1OC(=O)C(C)CC1C,chem,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3236344,CC(=CC(C)CCCCCON)C=C(C)C(=O)O,chem,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3236345,CCC1C(=O)NC(=O)C(C)C(C(=O)O[N+](=O)[O-])CCC(C)...,chem,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
class fp_CNN_Encoder(nn.Module):
    """Early skeleton of the fingerprint CNN encoder.

    Only an empty layer container is created here; no ``forward`` is defined.
    A complete class with the same name is defined in a later cell of this notebook.
    """

    def __init__(self, fp_dim = 2048, hidden_channels = (64, 128), embed_dim = 256, proj_dim = 120):
        """Unpack the two hidden channel widths and create an empty ``nn.Sequential``.

        ``fp_dim``, ``embed_dim`` and ``proj_dim`` are accepted but not used by this skeleton.
        """
        super().__init__()
        # Unpacking enforces that exactly two channel widths are supplied.
        first_width, second_width = hidden_channels
        # Placeholder container: no layers are registered yet.
        self.conv = nn.Sequential()

In [57]:
# Collect the fingerprint bit columns and turn the row labelled 0 into a Conv1d-ready tensor.
fp_cols = [col for col in training_data.columns if col.startswith("fp_")]
assert len(fp_cols) == 2048
sample_fp = training_data.loc[0, fp_cols].to_numpy().astype(np.float32)  # shape (2048,)
x = torch.from_numpy(sample_fp)[None, None, :]  # add batch and channel axes -> shape (1, 1, 2048)
x.shape  # (batch_size, in_channels, fp_dim)

torch.Size([1, 1, 2048])

In [58]:
# First convolution: kernel_size=5 with padding=2 leaves the length axis unchanged, so only
# the channel axis changes: [batch_size, in_channels, fp_dim] -> [batch_size, out_channels, fp_dim].
conv1 = nn.Conv1d(1, 64, kernel_size=5, padding=2)
y1 = conv1(x)
y1.shape

torch.Size([1, 64, 2048])

In [59]:
# Apply a ReLU non-linearity to the first convolution's output (shape is unchanged).
y1 = torch.relu(y1)
y1.shape

torch.Size([1, 64, 2048])

In [60]:
# Second convolution: widens 64 -> 128 channels to extract further features; the same
# kernel_size/padding pair again preserves the length axis.
conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
y2 = conv2(y1)
y2.shape

torch.Size([1, 128, 2048])

In [61]:
# Global max pooling: keep the largest activation per channel, collapsing the length axis to 1.
global_pool = nn.AdaptiveMaxPool1d(output_size=1)
y3 = global_pool(y2)  # shape (batch_size, out_channels, 1)
y3.shape

torch.Size([1, 128, 1])

In [62]:
# Drop the trailing length-1 axis left by the pooling step.
y3.squeeze(dim=-1).shape  # shape (batch_size, out_channels)

torch.Size([1, 128])

In [47]:
class fp_CNN_Encoder(nn.Module):
    """1-D CNN encoder that maps a fingerprint vector to a unit-norm embedding.

    Pipeline: two ``Conv1d -> BatchNorm1d -> ReLU`` stages, global max pooling over the
    length axis, a linear layer to ``embed_dim`` and L2 normalisation. An optional
    projection head (``Linear -> ReLU -> norm -> Linear``) turns the embedding into a
    second unit-norm vector of size ``proj_dim``.

    Args:
        fp_dim: Nominal fingerprint length. It is stored as ``self.fp_dim`` for reference
            only; no layer depends on it because the adaptive pooling collapses any
            input length to 1.
        hidden_channels: Two-element sequence ``(c1, c2)`` with the output channel counts
            of the first and second convolution.
        embed_dim: Size of the embedding ``g``.
        proj_dim: Size of the projection ``z`` (only used when ``use_projection`` is true).
        use_projection: If true, build the projection head and return ``(g, z)`` from
            ``forward``; otherwise ``forward`` returns ``g`` alone.
        batchnorm_safe: Selects the normalisation layer inside the projection head only.
            ``True`` uses ``LayerNorm``, which accepts a batch of one in training mode;
            ``False`` uses ``BatchNorm1d``, which raises for a single-sample batch while
            training. The convolution stack uses ``BatchNorm1d`` in both cases.
    """

    def __init__(self, fp_dim = 2048, hidden_channels = (64, 128), embed_dim = 256, proj_dim = 120, use_projection = True, batchnorm_safe = True):
        super().__init__()
        c1, c2 = hidden_channels

        # Kept for introspection; the layers below are independent of the input length.
        self.fp_dim = fp_dim

        # convolution stack: kernel_size=5 with padding=2 preserves the length axis
        self.conv = nn.Sequential(
            nn.Conv1d(in_channels = 1, out_channels = c1, kernel_size = 5, padding = 2),
            nn.BatchNorm1d(num_features = c1),
            nn.ReLU(inplace = True),
            nn.Conv1d(in_channels = c1, out_channels = c2, kernel_size = 5, padding = 2),
            nn.BatchNorm1d(num_features = c2),
            nn.ReLU(inplace = True),
            nn.AdaptiveMaxPool1d(1), # collapse length to 1
            )

        # encoder head
        self.fc = nn.Linear(in_features = c2, out_features = embed_dim)

        # projection head (only built when requested, so `self.proj` may not exist)
        self.use_projection = use_projection
        self.batchnorm_safe = batchnorm_safe
        if self.use_projection:
            if self.batchnorm_safe:
                # LayerNorm works with batch_size=1
                norm_layer = nn.LayerNorm(embed_dim)
            else:
                # BatchNorm1d is better if you always train with batch_size > 1
                norm_layer = nn.BatchNorm1d(embed_dim)

            self.proj = nn.Sequential(
                nn.Linear(embed_dim, embed_dim),
                nn.ReLU(inplace=True),
                norm_layer,
                nn.Linear(embed_dim, proj_dim),
            )

    def forward(self, x):
        """Encode a batch of fingerprints.

        Args:
            x: Tensor of shape ``[B, fp_dim]`` or ``[B, 1, fp_dim]``. Integer, boolean and
                other non-matching dtypes are accepted; the input is cast to the dtype of
                the model weights before the first convolution.

        Returns:
            ``(g, z)`` when ``use_projection`` is true, otherwise ``g``. ``g`` has shape
            ``[B, embed_dim]`` and ``z`` has shape ``[B, proj_dim]``; both are
            L2-normalised along the last axis.
        """
        # x: [B, fp_dim] or [B, 1, fp_dim]
        if x.dim() == 2:
            x = x.unsqueeze(1) # add channel dim, [B, 1, fp_dim]

        # Fingerprint bits are often stored as ints/bools; Conv1d requires the weight dtype.
        # `.to` returns the same tensor when the dtype already matches.
        x = x.to(dtype = self.fc.weight.dtype)

        h = self.conv(x).squeeze(-1) # [B, c2, 1] -> [B, c2]
        g = F.normalize(self.fc(h), dim = -1) # [B, embed_dim], normalized embedding

        if not self.use_projection:
            return g

        z = F.normalize(self.proj(g), dim = -1) # [B, proj_dim], normalized projection
        return g, z


In [48]:
# Smoke test with a batch of one: a default encoder should return a 256-d embedding
# and a 120-d projection.
sample_fp = torch.randn((1, 2048))  # random stand-in for a fingerprint
encoder = fp_CNN_Encoder()
g, z = encoder(sample_fp)
print(g.shape, z.shape)  # expected: torch.Size([1, 256]) torch.Size([1, 120])


torch.Size([1, 256]) torch.Size([1, 120])


In [49]:
import torch

# Build two encoders that differ only in the projection head's normalisation layer.
encoder_bn  = fp_CNN_Encoder(batchnorm_safe=False)  # uses BatchNorm1d
encoder_ln  = fp_CNN_Encoder(batchnorm_safe=True)   # uses LayerNorm

# Random stand-ins with the fingerprint shape (length 2048); these are Gaussian values,
# not 0/1 bit vectors, which is sufficient for checking shapes and batch-size handling.
batch1 = torch.randn(1, 2048)   # batch size = 1
batch2 = torch.randn(4, 2048)   # batch size = 4

# run through LayerNorm-safe encoder (both modules are in training mode by default)
print("LayerNorm (safe):")
g1, z1 = encoder_ln(batch1)
print(" batch1 (B=1): g:", g1.shape, " z:", z1.shape)
g2, z2 = encoder_ln(batch2)
print(" batch2 (B=4): g:", g2.shape, " z:", z2.shape)

# run through BatchNorm encoder
print("\nBatchNorm1d:")
try:
    # Expected to fail: in training mode BatchNorm1d cannot compute batch statistics from
    # a single [1, embed_dim] sample and raises ValueError.
    g1_bn, z1_bn = encoder_bn(batch1)
    print(" batch1 (B=1): g:", g1_bn.shape, " z:", z1_bn.shape)
except ValueError as e:
    # Catch only the anticipated error so any unrelated failure still surfaces.
    print(" batch1 (B=1) raised error:", e)

g2_bn, z2_bn = encoder_bn(batch2)
print(" batch2 (B=4): g:", g2_bn.shape, " z:", z2_bn.shape)


LayerNorm (safe):
 batch1 (B=1): g: torch.Size([1, 256])  z: torch.Size([1, 120])
 batch2 (B=4): g: torch.Size([4, 256])  z: torch.Size([4, 120])

BatchNorm1d:
 batch1 (B=1) raised error: Expected more than 1 value per channel when training, got input size torch.Size([1, 256])
 batch2 (B=4): g: torch.Size([4, 256])  z: torch.Size([4, 120])


In [50]:
import pandas as pd