In [499]:
import pandas as pd
import numpy as np
import torch
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import Draw
from zipfile import ZipFile
import seaborn as sns
import matplotlib as plt
from torch import nn
from tqdm import tqdm
import torchvision
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error as mse
print('libraries are imported')

libraries are imported


In [500]:
class Autoencoder(nn.Module):
    def __init__(self, input_size, bottleneck, train_encoder):
        super().__init__()
        self.train_encoder = train_encoder
        self.input_size = input_size
        self.bottleneck = bottleneck

        self.encoder = nn.Sequential(
            nn.Linear(self.input_size, 512),
            torch.nn.Dropout(p=0.5),
            nn.LeakyReLU(),
            nn.Linear(512, 256),
            torch.nn.Dropout(p=0.5),
            nn.LeakyReLU(),
            nn.Linear(256, 128),
            torch.nn.Dropout(p=0.5),
            nn.LeakyReLU(),
            nn.Linear(128, 64),
            torch.nn.Dropout(p=0.5),
            nn.LeakyReLU(),
            nn.Linear(64, self.bottleneck)
        )

        if self.train_encoder:
            self.decoder = nn.Sequential(
                nn.Linear(self.bottleneck, 64),
                nn.LeakyReLU(),
                nn.Linear(64, 128),
                nn.LeakyReLU(),
                nn.Linear(128, 256),
                nn.LeakyReLU(),
                nn.Linear(256, 512),
                nn.LeakyReLU(),
                nn.Linear(512, self.input_size),
                nn.Sigmoid()
            )
        else:
            self.encoder.add_module("ReLU_last", torch.nn.ReLU())
            self.encoder.add_module("final_fc", torch.nn.Linear(self.output_size, 1))

    def forward(self,x):
        x = self.encoder(x)
        if self.train_encoder:
            x = self.decoder(x)
        return x

In [501]:
class HomoLumo(nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.input_size = n_features

        self.layers = nn.Sequential(
            ae_model.encoder,
            nn.Linear(self.input_size, 1),
        )

    def forward(self, x):
        x = self.layers(x)
        return x

In [502]:
class trainData(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]

    def __len__(self):
        return len(self.X_data)

class testData(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data

    def __getitem__(self, index):
        return self.X_data[index]

    def __len__(self):
        return len(self.X_data)

In [503]:
def df_to_tensor(df):
    return torch.from_numpy(df.values).float()

In [504]:
# Hyperparameters
bottleneck = 50
lr_rate = 0.001
batch_size_1 = 128
batch_size_2 = 2
num_epochs_1 = 20
num_epochs_2 = 20
num_epochs_3 = 100
train_percentage = 0.8

In [505]:
# First, we read in the pretrain features and corresponding labels and convert them into a tensor

df_pretrain = pd.read_csv('data/pretrain_features.csv').set_index('Id')
X_pretrain = df_to_tensor(df_pretrain[df_pretrain.columns[1:]])
labels_pretrain = pd.read_csv('data/pretrain_labels.csv').set_index('Id')
y_pretrain = df_to_tensor(labels_pretrain)

In [506]:
# the pretrain data is loaded into a dataloader to use

data_pretrain = trainData(X_pretrain, y_pretrain)
pretrain_loader = DataLoader(data_pretrain, batch_size=batch_size_1, shuffle=True)

In [507]:
input_size = X_pretrain.shape[1]

ae_model = Autoencoder(input_size, bottleneck, train_encoder=True)

ae_loss_fct = nn.MSELoss()
ae_optimizer = Adam(ae_model.parameters(), lr=lr_rate)

In [508]:
#data, target = next(iter(pretrain_loader))

for epoch in range(num_epochs_1):
    for (data, target) in tqdm(pretrain_loader):

        ae_optimizer.zero_grad()
        recon_data = ae_model(data)

        ae_loss = ae_loss_fct(recon_data, data)
        ae_loss.backward()
        ae_optimizer.step()

    print('epoch', epoch + 1, '/', num_epochs_1, ", loss:", ae_loss)

100%|██████████| 391/391 [00:15<00:00, 25.14it/s]


epoch 1 / 20 , loss: tensor(0.0425, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.40it/s]


epoch 2 / 20 , loss: tensor(0.0426, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.79it/s]


epoch 3 / 20 , loss: tensor(0.0427, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:16<00:00, 24.26it/s]


epoch 4 / 20 , loss: tensor(0.0432, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.74it/s]


epoch 5 / 20 , loss: tensor(0.0426, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:22<00:00, 17.29it/s]


epoch 6 / 20 , loss: tensor(0.0438, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:21<00:00, 18.32it/s]


epoch 7 / 20 , loss: tensor(0.0456, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:24<00:00, 15.90it/s]


epoch 8 / 20 , loss: tensor(0.0437, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:20<00:00, 19.47it/s]


epoch 9 / 20 , loss: tensor(0.0454, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:19<00:00, 19.84it/s]


epoch 10 / 20 , loss: tensor(0.0443, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:21<00:00, 17.93it/s]


epoch 11 / 20 , loss: tensor(0.0447, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:18<00:00, 20.63it/s]


epoch 12 / 20 , loss: tensor(0.0442, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:16<00:00, 24.29it/s]


epoch 13 / 20 , loss: tensor(0.0450, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.98it/s]


epoch 14 / 20 , loss: tensor(0.0447, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:16<00:00, 23.74it/s]


epoch 15 / 20 , loss: tensor(0.0441, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.42it/s]


epoch 16 / 20 , loss: tensor(0.0449, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:16<00:00, 23.56it/s]


epoch 17 / 20 , loss: tensor(0.0439, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.67it/s]


epoch 18 / 20 , loss: tensor(0.0446, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:15<00:00, 25.96it/s]


epoch 19 / 20 , loss: tensor(0.0440, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:14<00:00, 26.24it/s]

epoch 20 / 20 , loss: tensor(0.0454, grad_fn=<MseLossBackward0>)





In [509]:
le_model = HomoLumo(bottleneck)

le_loss_fct = nn.MSELoss()
le_optimizer = Adam(le_model.parameters(), lr=lr_rate)

for epoch in range(num_epochs_2):
    for (X, y) in tqdm(pretrain_loader):

        le_optimizer.zero_grad()
        y_pred = le_model(X)

        le_loss = le_loss_fct(y_pred, y)
        le_loss.backward()
        le_optimizer.step()

    print('epoch', epoch + 1, '/', num_epochs_2, ", loss:", le_loss)

100%|██████████| 391/391 [00:07<00:00, 49.93it/s]


epoch 1 / 20 , loss: tensor(0.7287, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.48it/s]


epoch 2 / 20 , loss: tensor(0.6928, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.30it/s]


epoch 3 / 20 , loss: tensor(0.3828, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.83it/s]


epoch 4 / 20 , loss: tensor(0.3309, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 48.99it/s]


epoch 5 / 20 , loss: tensor(0.2628, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.10it/s]


epoch 6 / 20 , loss: tensor(0.1968, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.75it/s]


epoch 7 / 20 , loss: tensor(0.1837, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.57it/s]


epoch 8 / 20 , loss: tensor(0.1343, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.05it/s]


epoch 9 / 20 , loss: tensor(0.1234, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.56it/s]


epoch 10 / 20 , loss: tensor(0.1035, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.74it/s]


epoch 11 / 20 , loss: tensor(0.0684, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 48.89it/s]


epoch 12 / 20 , loss: tensor(0.0559, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:08<00:00, 48.75it/s]


epoch 13 / 20 , loss: tensor(0.0374, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.99it/s]


epoch 14 / 20 , loss: tensor(0.0294, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.00it/s]


epoch 15 / 20 , loss: tensor(0.0225, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.68it/s]


epoch 16 / 20 , loss: tensor(0.0159, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 51.07it/s]


epoch 17 / 20 , loss: tensor(0.0178, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.82it/s]


epoch 18 / 20 , loss: tensor(0.0133, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 50.02it/s]


epoch 19 / 20 , loss: tensor(0.0158, grad_fn=<MseLossBackward0>)


100%|██████████| 391/391 [00:07<00:00, 49.78it/s]

epoch 20 / 20 , loss: tensor(0.0122, grad_fn=<MseLossBackward0>)





In [510]:
df_train_features = pd.read_csv('data/train_features.csv').set_index('Id')
X_train_features = df_to_tensor(df_train_features[df_train_features.columns[1:]])
#X_train_encoded = ae_model.encoder(X_train_features)

data_size = len(X_train_features)
test_cut = int(train_percentage*data_size)

X_train = X_train_features[:test_cut, :]
X_test =  X_train_features[test_cut:, :]

labels_train = pd.read_csv('data/train_labels.csv').set_index('Id')
y_train_labels = df_to_tensor(labels_train)

y_train = y_train_labels[:test_cut, :]
y_test =  y_train_labels[test_cut:, :]

data_train = trainData(X_train, y_train)
data_test = trainData(X_test, y_test)

train_loader = DataLoader(data_train, batch_size=batch_size_2, shuffle=True)
test_loader = DataLoader(data_test, batch_size=1)

In [511]:
le_model.train()

for name, param in le_model.named_parameters():
    if '0' in name:
        param.requires_grad = False

hl_loss_fct = nn.MSELoss()
hl_optimizer = Adam(le_model.parameters(), lr=0.001)

for epoch in range(num_epochs_3):
    for X_input, y_label in train_loader:

        hl_optimizer.zero_grad()
        X_pred = le_model(X_input)

        hl_loss = hl_loss_fct(X_pred, y_label)
        hl_loss.backward()
        hl_optimizer.step()

    print('epoch', epoch + 1, '/', num_epochs_3, ", loss:", hl_loss)

epoch 1 / 100 , loss: tensor(15.4575, grad_fn=<MseLossBackward0>)
epoch 2 / 100 , loss: tensor(8.9857, grad_fn=<MseLossBackward0>)
epoch 3 / 100 , loss: tensor(2.4555, grad_fn=<MseLossBackward0>)
epoch 4 / 100 , loss: tensor(0.4656, grad_fn=<MseLossBackward0>)
epoch 5 / 100 , loss: tensor(1.4390, grad_fn=<MseLossBackward0>)
epoch 6 / 100 , loss: tensor(0.3817, grad_fn=<MseLossBackward0>)
epoch 7 / 100 , loss: tensor(1.8562, grad_fn=<MseLossBackward0>)
epoch 8 / 100 , loss: tensor(0.1537, grad_fn=<MseLossBackward0>)
epoch 9 / 100 , loss: tensor(0.3093, grad_fn=<MseLossBackward0>)
epoch 10 / 100 , loss: tensor(0.2648, grad_fn=<MseLossBackward0>)
epoch 11 / 100 , loss: tensor(0.1075, grad_fn=<MseLossBackward0>)
epoch 12 / 100 , loss: tensor(0.1290, grad_fn=<MseLossBackward0>)
epoch 13 / 100 , loss: tensor(0.0742, grad_fn=<MseLossBackward0>)
epoch 14 / 100 , loss: tensor(0.3685, grad_fn=<MseLossBackward0>)
epoch 15 / 100 , loss: tensor(0.2923, grad_fn=<MseLossBackward0>)
epoch 16 / 100 , l

In [512]:
le_model.eval()

mse_eval = 0
n_test = len(y_test)
with torch.no_grad()
    for (X, y_true) in test_loader:
    
        y_eval = le_model(X)
        mse_eval += (y_true - y_eval)**2
    
    rmse_eval = (mse_eval.detach().numpy()/n_test)**0.5

print(rmse_eval[0][0])


0.2010087


In [513]:
df_test = pd.read_csv('data/test_features.csv').set_index('Id')

features_test = df_to_tensor(df_test[df_test.columns[1:]])

pred_data = testData(features_test)

predloader = DataLoader(pred_data, batch_size=1)
print("Dataloader created")
prediction = []

with torch.no_grad():
    for X_batch in tqdm(predloader):

        y_predict = le_model(X_batch).squeeze(-1).numpy()[0]

        prediction.append(y_predict)

Dataloader created


100%|██████████| 10000/10000 [00:05<00:00, 1840.09it/s]


In [514]:
IDs = np.arange(50100, 60100)

dict = {'Id': IDs, 'y': prediction}

submissions = pd.DataFrame(dict)
submissions.to_csv('submissions.csv')