In [3]:
!pip install rdkit
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.9 (from deepchem)
  Downloading scipy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, deepchem
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.2
    Uninstalling scipy-1.11.2:
      Successfully uninstalled scipy-1.11.2
Successfully installed deepchem-2.7.1 scipy-1.8.1


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rdkit import Chem
from rdkit.Chem import AllChem
import deepchem as dc
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load data
train_data = pd.read_csv('train_linear.csv')

# Define constants
BATCH_SIZE = 32
NUM_EPOCHS = 200
LEARNING_RATE = 0.001

# Extract SMILES strings and convert to molecular graphs
def smiles_to_graph(smiles_list):
    graph_representations = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024) for smiles in smiles_list]
    return np.array(graph_representations)

train_graph_representations = smiles_to_graph(train_data["SMILES"])
feature_columns = train_data.columns.difference(["id", "SMILES", "MLM", "HLM"])

# Normalize other features from the dataset
scaler = StandardScaler().fit(train_data[feature_columns])
normalized_features = scaler.transform(train_data[feature_columns])

# Combine molecular representations and normalized features
combined_train_features = np.hstack([train_graph_representations, normalized_features])

# Split data into training and validation sets
train_features, val_features, train_labels, val_labels = train_test_split(combined_train_features, train_data[['MLM', 'HLM']].values, test_size=0.1, random_state=42)

# Define PyTorch dataset and dataloader
class CustomDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is None:
            return torch.tensor(self.features[idx], dtype=torch.float)
        else:
            return torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.labels[idx], dtype=torch.float)

train_dataset = CustomDataset(train_features, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = CustomDataset(val_features, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the neural network model
class MultiInputNN(nn.Module):
    def __init__(self, input_dim):
        super(MultiInputNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.dropout2(x)
        x = self.fc4(x)
        return x

model = MultiInputNN(train_features.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Training loop
for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for i, (features, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validate on the validation set
    model.eval()
    val_predictions = []
    with torch.no_grad():
        val_loss = 0
        for features, labels in val_dataloader:
            outputs = model(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_predictions.append(outputs.numpy())

    val_predictions = np.vstack(val_predictions)
    rmse = np.sqrt(mean_squared_error(val_labels, val_predictions))

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Training Loss: {total_loss / len(train_dataloader):.4f}, Validation RMSE: {rmse:.4f}")

# Load test data, featurize, and make predictions
test_data = pd.read_csv('test_linear.csv')
test_graph_representations = smiles_to_graph(test_data["SMILES"])
normalized_test_features = scaler.transform(test_data[feature_columns])
combined_test_features = np.hstack([test_graph_representations, normalized_test_features])
test_dataset = CustomDataset(combined_test_features)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_dataloader:
        outputs = model(features)
        test_predictions.append(outputs.numpy())

test_predictions = np.vstack(test_predictions)
submission = pd.DataFrame({
    "id": test_data["id"],
    "MLM": test_predictions[:, 0],
    "HLM": test_predictions[:, 1]
})
submission.to_csv("submission.csv", index=False)
print("Submission file created!")


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load data
train_data = pd.read_csv('train_linear.csv')

# Constants
BATCH_SIZE = 32
NUM_EPOCHS = 200
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5
PATIENCE = 10

def smiles_to_graph(smiles_list):
    graph_representations = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024) for smiles in smiles_list]
    return np.array(graph_representations)

train_graph_representations = smiles_to_graph(train_data["SMILES"])
feature_columns = train_data.columns.difference(["id", "SMILES", "MLM", "HLM"])
scaler = StandardScaler().fit(train_data[feature_columns])
normalized_features = scaler.transform(train_data[feature_columns])
combined_train_features = np.hstack([train_graph_representations, normalized_features])
train_features, val_features, train_labels, val_labels = train_test_split(combined_train_features, train_data[['MLM', 'HLM']].values, test_size=0.1, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is None:
            return torch.tensor(self.features[idx], dtype=torch.float)
        else:
            return torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.labels[idx], dtype=torch.float)

train_dataset = CustomDataset(train_features, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = CustomDataset(val_features, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

class ResidualBlock(nn.Module):
    def __init__(self, in_features, out_features):
        super(ResidualBlock, self).__init__()
        self.fc1 = nn.Linear(in_features, out_features)
        self.fc2 = nn.Linear(out_features, out_features)
        self.bn1 = nn.BatchNorm1d(out_features)
        self.bn2 = nn.BatchNorm1d(out_features)
        self.activation = nn.LeakyReLU(negative_slope=0.01)
        self.shortcut = nn.Linear(in_features, out_features) if in_features != out_features else None

    def forward(self, x):
        identity = x
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.activation(out)
        out = self.fc2(out)
        out = self.bn2(out)
        if self.shortcut is not None:
            identity = self.shortcut(x)
        out += identity
        out = self.activation(out)
        return out

class ModifiedMultiInputNN(nn.Module):
    def __init__(self, input_dim):
        super(ModifiedMultiInputNN, self).__init__()
        self.block1 = ResidualBlock(input_dim, 512)
        self.block2 = ResidualBlock(512, 256)
        self.block3 = ResidualBlock(256, 128)
        self.fc_out = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.dropout(x)
        x = self.fc_out(x)
        return x

model = ModifiedMultiInputNN(train_features.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=PATIENCE, verbose=True)

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for i, (features, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_predictions = []
    val_loss = 0
    with torch.no_grad():
        for features, labels in val_dataloader:
            outputs = model(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_predictions.append(outputs.numpy())

    val_predictions = np.vstack(val_predictions)
    rmse = np.sqrt(mean_squared_error(val_labels, val_predictions))
    scheduler.step(val_loss)

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Training Loss: {total_loss / len(train_dataloader):.4f}, Validation RMSE: {rmse:.4f}")

test_data = pd.read_csv('test_linear.csv')
test_graph_representations = smiles_to_graph(test_data["SMILES"])
normalized_test_features = scaler.transform(test_data[feature_columns])
combined_test_features = np.hstack([test_graph_representations, normalized_test_features])
test_dataset = CustomDataset(combined_test_features)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_dataloader:
        outputs = model(features)
        test_predictions.append(outputs.numpy())

test_predictions = np.vstack(test_predictions)
submission = pd.DataFrame({
    "id": test_data["id"],
    "MLM": test_predictions[:, 0],
    "HLM": test_predictions[:, 1]
})
submission.to_csv("submission.csv", index=False)
print("Submission file created!")


Epoch 1/200, Training Loss: 1478.5110, Validation RMSE: 32.9110
Epoch 2/200, Training Loss: 825.3048, Validation RMSE: 33.9291
Epoch 3/200, Training Loss: 608.4586, Validation RMSE: 34.0272
Epoch 4/200, Training Loss: 448.9705, Validation RMSE: 34.3051
Epoch 5/200, Training Loss: 348.7811, Validation RMSE: 35.3628
Epoch 6/200, Training Loss: 290.9373, Validation RMSE: 35.5880
Epoch 7/200, Training Loss: 243.2873, Validation RMSE: 36.2125
Epoch 8/200, Training Loss: 210.2083, Validation RMSE: 36.8698
Epoch 9/200, Training Loss: 198.6118, Validation RMSE: 35.9928
Epoch 10/200, Training Loss: 172.5647, Validation RMSE: 35.9928
Epoch 11/200, Training Loss: 157.4021, Validation RMSE: 37.6858
Epoch 00012: reducing learning rate of group 0 to 5.0000e-04.
Epoch 12/200, Training Loss: 143.3893, Validation RMSE: 35.5661
Epoch 13/200, Training Loss: 116.3217, Validation RMSE: 35.8469
Epoch 14/200, Training Loss: 94.2126, Validation RMSE: 35.5777
Epoch 15/200, Training Loss: 93.8154, Validation RM

KeyboardInterrupt: ignored

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Load data
train_data = pd.read_csv('train_linear.csv')

# Constants
BATCH_SIZE = 32
NUM_EPOCHS = 200
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5
PATIENCE = 10
EARLY_STOPPING_PATIENCE = 20

def smiles_to_graph(smiles_list):
    graph_representations = [AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 2, nBits=1024) for smiles in smiles_list]
    return np.array(graph_representations)

train_graph_representations = smiles_to_graph(train_data["SMILES"])
feature_columns = train_data.columns.difference(["id", "SMILES", "MLM", "HLM"])
scaler = StandardScaler().fit(train_data[feature_columns])
normalized_features = scaler.transform(train_data[feature_columns])
combined_train_features = np.hstack([train_graph_representations, normalized_features])
train_features, val_features, train_labels, val_labels = train_test_split(combined_train_features, train_data[['MLM', 'HLM']].values, test_size=0.1, random_state=42)

class CustomDataset(Dataset):
    def __init__(self, features, labels=None):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        if self.labels is None:
            return torch.tensor(self.features[idx], dtype=torch.float)
        else:
            return torch.tensor(self.features[idx], dtype=torch.float), torch.tensor(self.labels[idx], dtype=torch.float)

train_dataset = CustomDataset(train_features, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataset = CustomDataset(val_features, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define a simpler neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 2)
        self.dropout = nn.Dropout(0.4)
        self.activation = nn.LeakyReLU(negative_slope=0.01)

    def forward(self, x):
        x = self.activation(self.fc1(x))
        x = self.dropout(x)
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.dropout(x)
        x = self.fc4(x)
        return x

model = SimpleNN(train_features.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5, patience=PATIENCE, verbose=True)

# Early stopping parameters
epochs_without_improvement = 0
best_val_rmse = float('inf')

for epoch in range(NUM_EPOCHS):
    model.train()
    total_loss = 0
    for i, (features, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    val_predictions = []
    val_loss = 0
    with torch.no_grad():
        for features, labels in val_dataloader:
            outputs = model(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_predictions.append(outputs.numpy())

    val_predictions = np.vstack(val_predictions)
    rmse = np.sqrt(mean_squared_error(val_labels, val_predictions))
    scheduler.step(val_loss)

    # Check for early stopping
    if rmse < best_val_rmse:
        best_val_rmse = rmse
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= EARLY_STOPPING_PATIENCE:
        print("Early stopping due to no improvement in validation RMSE.")
        break

    print(f"Epoch {epoch + 1}/{NUM_EPOCHS}, Training Loss: {total_loss / len(train_dataloader):.4f}, Validation RMSE: {rmse:.4f}")

test_data = pd.read_csv('test_linear.csv')
test_graph_representations = smiles_to_graph(test_data["SMILES"])
normalized_test_features = scaler.transform(test_data[feature_columns])
combined_test_features = np.hstack([test_graph_representations, normalized_test_features])
test_dataset = CustomDataset(combined_test_features)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model.eval()
test_predictions = []
with torch.no_grad():
    for features in test_dataloader:
        outputs = model(features)
        test_predictions.append(outputs.numpy())

test_predictions = np.vstack(test_predictions)
submission = pd.DataFrame({
    "id": test_data["id"],
    "MLM": test_predictions[:, 0],
    "HLM": test_predictions[:, 1]
})
submission.to_csv("submission.csv", index=False)
print("Submission file created!")


Epoch 1/200, Training Loss: 1531.3347, Validation RMSE: 32.9456
Epoch 2/200, Training Loss: 1042.0308, Validation RMSE: 33.0084
Epoch 3/200, Training Loss: 941.4246, Validation RMSE: 33.3867
Epoch 4/200, Training Loss: 855.7310, Validation RMSE: 33.5276
Epoch 5/200, Training Loss: 759.3003, Validation RMSE: 34.7648
Epoch 6/200, Training Loss: 675.6714, Validation RMSE: 34.6236
Epoch 7/200, Training Loss: 587.5877, Validation RMSE: 34.7195
Epoch 8/200, Training Loss: 482.4557, Validation RMSE: 36.0319
Epoch 9/200, Training Loss: 430.7852, Validation RMSE: 35.8163
Epoch 10/200, Training Loss: 378.6470, Validation RMSE: 35.6597
Epoch 11/200, Training Loss: 339.6004, Validation RMSE: 35.9257
Epoch 00012: reducing learning rate of group 0 to 5.0000e-04.
Epoch 12/200, Training Loss: 308.1426, Validation RMSE: 35.6516
Epoch 13/200, Training Loss: 274.2221, Validation RMSE: 36.0118
Epoch 14/200, Training Loss: 241.1260, Validation RMSE: 36.1608
Epoch 15/200, Training Loss: 223.8295, Validation