# Task 1: Fine-tune Chemical Language Model

The goal is to fine-tune a pre-trained chemical language model on a regression task using the Lipophilicity dataset. The task involves predicting the lipophilicity value for a given molecule representation (SMILES string). You will learn how to load and tokenize a dataset from HuggingFace, how to load a pre-trained language model, and finally, how to run a model in inference mode.

Your task is to complete the missing code blocks below.

In [None]:
# import dependencies
import torch
from datasets import load_dataset
import torch.nn as nn
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, PreTrainedModel
from torch.utils.data import DataLoader, Dataset, Subset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from tqdm.notebook import tqdm
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# 1.Fine-tune a Chemical Language Model on Lipophilicity


## --- Step 1: Load Dataset ---

The dataset we are going to use is the [Lipophilicity](https://huggingface.co/datasets/scikit-fingerprints/MoleculeNet_Lipophilicity) dataset, part of [MoleculeNet](https://pubs.rsc.org/en/content/articlelanding/2018/sc/c7sc02664a) benchmark.

Lipophilicity, also known as hydrophobicity, is a measure of how readily a substance dissolves in nonpolar solvents (such as oil) compared to polar solvents (such as water).

In [None]:
# Dataset name and model name (Hugging Face Hub identifiers used throughout the notebook).
# DATASET_PATH is passed to load_dataset(); MODEL_NAME is the single checkpoint from which the
# tokenizer, the encoder (AutoModel) and the MLM model (AutoModelForMaskedLM) are loaded.
DATASET_PATH = "scikit-fingerprints/MoleculeNet_Lipophilicity"
MODEL_NAME = "ibm/MoLFormer-XL-both-10pct"  # MoLFormer checkpoint

In [None]:
# load the dataset from HuggingFace
dataset = load_dataset(DATASET_PATH)

In [None]:
# Convert the 'train' split to a DataFrame for exploration
lipo_df = pd.DataFrame(dataset['train'])

# Display column names and the first 5 rows
print("Column Names:", lipo_df.columns.to_list())
print("\nFirst few rows:")
print(lipo_df.head())

In [None]:
lipo_df.tail()

In [None]:
# Visualize the distribution of the logP values and the length of the SMILES strings
# (two histograms side by side in one figure).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Distribution of logP values (read from the 'label' column, i.e. the regression target)
sns.histplot(lipo_df['label'], bins=30, kde=True, ax=ax1, color='skyblue')
ax1.set_title('Distribution of logP Values')
ax1.set_xlabel('logP')

# Distribution of SMILES lengths (number of characters per string)
# NOTE: this adds a 'smiles_length' column to lipo_df in place, so the column is also
# present in the DataFrames produced by any later split of lipo_df.
lipo_df['smiles_length'] = lipo_df['SMILES'].apply(len)
sns.histplot(lipo_df['smiles_length'], bins=30, kde=True, ax=ax2, color='salmon')
ax2.set_title('Distribution of SMILES Lengths')
ax2.set_xlabel('Number of Characters in SMILES')

plt.tight_layout()
plt.show()

The logP values are roughly normally distributed, with most values concentrated around 1 to 4. There are fewer molecules with extreme logP values (e.g., below 0 or above 4). The distribution of SMILES lengths is right-skewed, with most molecules having around 50 characters. The majority of molecules are of moderate size, which is typical for drug-like molecules.

In [None]:
#Custom PyTorch Dataset for handling SMILES strings and corresponding target labels.
class SMILESDataset(Dataset):
    """Map-style dataset pairing tokenized SMILES strings with regression targets.

    Tokenization happens lazily in ``__getitem__``. Every sample is padded or
    truncated to exactly ``max_length`` tokens, so all samples share one shape
    and can be stacked by the default DataLoader collation.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        """
        Args:
            dataframe: Pandas DataFrame containing 'SMILES' and 'label' columns
            tokenizer: Pre-trained tokenizer from Hugging Face (any callable with
                the same keyword interface works)
            max_length: Fixed token length every sample is padded/truncated to.
                Defaults to 512, the value previously hard-coded here.
        """
        # Extract the SMILES strings and target labels from the DataFrame
        self.smiles = dataframe['SMILES'].tolist()
        self.targets = dataframe['label'].tolist()

        # Pre-trained tokenizer for text processing
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the total number of samples in the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it with its target.

        Returns:
            dict with 'input_ids' and 'attention_mask' (1-D tensors of length
            ``max_length``) and 'labels' (scalar float tensor).
        """
        encoding = self.tokenizer(
            self.smiles[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a leading batch dimension of size 1. Drop only
            # that dimension: a bare squeeze() would also collapse a length-1
            # sequence into a scalar.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.targets[idx], dtype=torch.float),
        }

## --- Step 2: Split Dataset ---

The original dataset provides only a single split (the train split), so we need to split the data into training and testing sets ourselves.

In [None]:
# tokenize the data
# load a pre-trained tokenizer from HuggingFace
# (loaded from the same MODEL_NAME checkpoint as the models used later in the notebook)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

In [None]:
# split the data into training and test datasets
# (20% of the rows are held out for testing; random_state=9 fixes the shuffle so the
# same split is produced on every run)
train_df, test_df = train_test_split(lipo_df, test_size=0.2, random_state=9)

# Create SMILESDataset instances for both train and test datasets
# (SMILES strings are tokenized lazily, one sample at a time, when the dataset is indexed)
train_data = SMILESDataset(dataframe=train_df, tokenizer=tokenizer)
test_data = SMILESDataset(dataframe=test_df, tokenizer=tokenizer)

In [None]:
# Construct Pytorch data loaders for both train and test datasets
BATCH_SIZE = 16  # number of molecules per batch, shared by both loaders

# Create DataLoader for the training dataset and testing dataset (shuffling for training)
# The test loader keeps the dataset order (shuffle=False).
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

## --- Step 3: Load Model ---

In [None]:
# load pre-trained model from HuggingFace
# (the regression head is added separately by a wrapper class defined below)
# NOTE(review): `deterministic_eval=True` is an extra keyword handed to from_pretrained;
# presumably it is consumed by the checkpoint's own config/model code
# (trust_remote_code=True) — confirm against the MoLFormer model card.
model = AutoModel.from_pretrained(MODEL_NAME, deterministic_eval=True, trust_remote_code=True)

In [None]:
# MoLFormer model with a regression head
class MoLFormerWithRegressionHead(PreTrainedModel):
    """Pre-trained MoLFormer encoder followed by a linear layer that predicts one scalar per input."""

    def __init__(self, pretrained_model, config=None):
        """
        Args:
            pretrained_model: A pre-trained MoLFormer model that will be used as the backbone.
            config: A configuration object. If None, uses the config from the pretrained model.
        """
        # If no config is provided, use the pretrained model's config
        if config is None:
            config = pretrained_model.config

        super().__init__(config)
        self.backbone = pretrained_model  # Use the pre-trained MoLFormer model as the backbone

        # The head's input width is always taken from the backbone's own config,
        # even when an explicit `config` was passed in.
        hidden_size = self.backbone.config.hidden_size
        # Regression head: fully connected layer that outputs a single value
        self.regression_head = nn.Linear(hidden_size, 1)
        self.config = config

    def forward(self, input_ids, attention_mask=None):
        """Predict one regression value per sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence) by the code below.
            attention_mask: Optional mask forwarded unchanged to the backbone.
                Defaults to None so the signature matches the sibling
                ``MoLFormerMLMWithRegressionHead.forward``.

        Returns:
            Tensor of predictions with the trailing size-1 dimension removed,
            i.e. shape (batch_size,).
        """
        # Forward pass through the pre-trained model
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)

        # Shape: (batch_size, sequence_length, hidden_size)
        last_hidden_state = outputs.last_hidden_state

        # Use the hidden state at position 0 as the sequence representation.
        # NOTE(review): this is presumably the tokenizer's start-of-sequence
        # ([CLS]-like) token — confirm against the MoLFormer tokenizer.
        first_token_state = last_hidden_state[:, 0, :]

        prediction = self.regression_head(first_token_state)
        return prediction.squeeze(-1)  # (batch_size, 1) -> (batch_size,)


# Class for the MLM-finetuned model
class MoLFormerMLMWithRegressionHead(PreTrainedModel):
    """Regression wrapper around a masked-language-model MoLFormer.

    The wrapped model is called with ``output_hidden_states=True`` and the last
    entry of the returned ``hidden_states`` is fed, at token position 0, into a
    single linear layer that yields one value per sequence.
    """

    def __init__(self, pretrained_model, config=None):
        """
        Args:
            pretrained_model: MLM model used as the backbone.
            config: Optional configuration; falls back to the backbone's config.
        """
        resolved_config = pretrained_model.config if config is None else config
        super().__init__(resolved_config)
        self.backbone = pretrained_model
        # Head input width comes from the backbone's own config.
        self.regression_head = nn.Linear(self.backbone.config.hidden_size, 1)
        self.config = resolved_config

    def forward(self, input_ids, attention_mask=None):
        """Return one regression value per input sequence, shape (batch_size,)."""
        # Hidden states must be requested explicitly from the MLM model.
        backbone_out = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        # Last layer's hidden states, restricted to the first token position.
        first_token_state = backbone_out.hidden_states[-1][:, 0, :]
        return self.regression_head(first_token_state).squeeze(-1)

In [None]:
# initialize the regression model
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is a notebook-level name: later cells also use it to place the MLM model
# and pass it to unsupervised_finetuning().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the pre-trained encoder with the regression head and move all weights to `device`.
regression_model = MoLFormerWithRegressionHead(model).to(device)

## --- Steps 4 & 5: Training and Evaluation ---

In [None]:
def _run_training_epoch(model, dataloader, optimizer, criterion, device, description):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss."""
    model.train()  # Set the model to training mode
    total_loss = 0.0
    steps = 0

    iterator = tqdm(dataloader, desc=description)
    for batch in iterator:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()  # Zero the gradients before backpropagation
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        steps += 1
        iterator.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    if steps == 0:
        raise ValueError("train_dataloader yielded no batches")
    return total_loss / steps


def _evaluate(model, dataloader, criterion, device):
    """Evaluate without gradients; return (mean batch loss, predictions, labels)."""
    model.eval()
    total_loss = 0.0
    steps = 0
    predictions = []
    targets = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            total_loss += criterion(outputs, labels).item()
            steps += 1

            # Keep per-sample values on the CPU for the sklearn metrics
            predictions.extend(outputs.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    if steps == 0:
        raise ValueError("test_dataloader yielded no batches")
    return total_loss / steps, predictions, targets


def train_and_evaluate(model, train_dataloader, test_dataloader, num_epochs=50, learning_rate=2e-5):
    """
    Trains and evaluates a model over multiple epochs.

    After every training epoch the model is evaluated on ``test_dataloader`` and
    the average losses, R² score and RMSE are printed.

    Args:
        model: model to train and evaluate. It must already be on the device it
            should train on; batches are moved to the device of its parameters.
        train_dataloader: DataLoader for the training data. Batches are dicts with
            'input_ids', 'attention_mask' and 'labels'.
        test_dataloader: DataLoader for the testing data (same batch format).
        num_epochs (int, optional): The number of epochs to train the model.
        learning_rate (float, optional): The learning rate for the optimizer.

    Returns:
        dict: 'train_losses', 'test_losses' and 'r2_scores' (one entry per epoch),
        plus 'final_r2' and 'final_rmse' from the last epoch. The two final values
        are None when ``num_epochs`` is 0 or negative (no epoch was run).

    Raises:
        ValueError: If either dataloader yields no batches.
    """
    # Derive the device from the model instead of relying on a notebook-level global.
    device = next(model.parameters()).device

    # Initialize the optimizer (AdamW) and loss function (MSELoss for regression tasks)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    criterion = nn.MSELoss()

    # Per-epoch history
    train_losses = []
    test_losses = []
    r2_scores = []

    # Defined up front so the return statement is valid even if no epoch runs.
    r2 = None
    rmse = None

    for epoch in range(num_epochs):
        avg_train_loss = _run_training_epoch(
            model, train_dataloader, optimizer, criterion, device,
            description=f"Epoch {epoch + 1}/{num_epochs}",
        )
        train_losses.append(avg_train_loss)

        avg_test_loss, all_predictions, all_labels = _evaluate(
            model, test_dataloader, criterion, device
        )
        test_losses.append(avg_test_loss)

        # Regression metrics over the whole test set
        r2 = r2_score(all_labels, all_predictions)
        r2_scores.append(r2)
        rmse = np.sqrt(mean_squared_error(all_labels, all_predictions))

        print(f"\nEpoch {epoch + 1}:")
        print(f"Average training loss: {avg_train_loss:.3f}")
        print(f"Average test loss: {avg_test_loss:.3f}")
        print(f"R² Score: {r2:.3f}")
        print(f"RMSE: {rmse:.3f}")
        print("-" * 50)

    return {
        'train_losses': train_losses,
        'test_losses': test_losses,
        'r2_scores': r2_scores,
        'final_r2': r2,
        'final_rmse': rmse
    }

In [None]:
# Fine-tune the original (not MLM-adapted) model on the regression task.
# num_epochs=100 overrides the function default of 50. The returned stats dict is kept
# so it can be plotted and later compared against the MLM fine-tuned model.
print("Starting training...")
training_stats_origModel = train_and_evaluate(
    model=regression_model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    num_epochs=100,
    learning_rate=2e-5
)

In [None]:
def plot_training_metrics(stats, save_path="Origregression_training_plot.png"):
    """Plot per-epoch losses and R² scores side by side, save the figure, and show it.

    Args:
        stats: Dict with 'train_losses', 'test_losses' and 'r2_scores' sequences
            (one value per epoch), as returned by train_and_evaluate.
        save_path: File the figure is written to. Defaults to the filename that
            was previously hard-coded; pass None or '' to skip saving.
    """
    fig, (loss_ax, r2_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Plot losses. The curve labelled 'Validation Loss' is drawn from
    # stats['test_losses'].
    loss_ax.plot(stats['train_losses'], label='Training Loss')
    loss_ax.plot(stats['test_losses'], label='Validation Loss')
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    # Plot R² scores
    r2_ax.plot(stats['r2_scores'], label='R² Score')
    r2_ax.set_title('R² Score Evolution')
    r2_ax.set_xlabel('Epoch')
    r2_ax.set_ylabel('R² Score')
    r2_ax.legend()

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

plot_training_metrics(training_stats_origModel)

# 2.Add Unsupervised Finetuning
In this step, you will perform unsupervised fine-tuning on the training dataset. This means the model will leverage only the SMILES strings without any corresponding labels to adapt its understanding of the data distribution. By familiarizing the model with the patterns and structure of the SMILES strings, you can potentially enhance its performance on downstream supervised tasks.

For this fine-tuning, you will use the Masked Language Modeling (MLM) objective, where the model learns to predict randomly masked tokens within the input sequence. Remember to save the fine-tuned model for later use.


In [None]:
# Load the pre-trained model for Masked Language Modeling (MLM)
# This is a second, separate load of the MODEL_NAME checkpoint; it does not share the
# `model` object that was wrapped by the regression model above.
mlm_model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
mlm_model.to(device)

In [None]:
# Custom Dataset class for Masked Language Modeling (MLM)
class MLMDataset(Dataset):
    """Map-style dataset of tokenized SMILES strings (no labels) for MLM training.

    Every sample is padded or truncated to exactly ``max_length`` tokens.
    """

    def __init__(self, smiles_list, tokenizer, max_length=512):
        """
        Args:
            smiles_list: Sequence of SMILES strings.
            tokenizer: Pre-trained tokenizer from Hugging Face (any callable with
                the same keyword interface works).
            max_length: Fixed token length every sample is padded/truncated to.
                Defaults to 512, the value previously hard-coded here.
        """
        self.tokenizer = tokenizer
        self.smiles = smiles_list
        self.max_length = max_length

    def __len__(self):
        """Return the number of SMILES strings."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Tokenize the SMILES string at ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask', each a 1-D tensor of
            length ``max_length``.
        """
        encoding = self.tokenizer(
            self.smiles[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # Drop only the leading batch dimension added by return_tensors='pt'.
        # A bare squeeze() would also collapse a length-1 sequence into a scalar.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

In [None]:
def unsupervised_finetuning(model, train_df, tokenizer, device,
                            num_epochs=5, batch_size=16, learning_rate=1e-5, patience=5,
                            save_path="./molformer_pretrainedMLM"):
    """
    Perform unsupervised fine-tuning using MLM objective with early stopping and best model saving.

    Only the 'SMILES' column of ``train_df`` is used. Tokens are masked at random
    (15% probability) by DataCollatorForLanguageModeling and the model is trained
    to reconstruct them.

    Early stopping and "best model" selection are both based on the average
    *training* MLM loss of each epoch; no held-out data is used.

    Args:
        model: Masked-language model to fine-tune (updated in place). It must
            accept ``labels`` and return an output with a ``loss`` attribute.
        train_df: DataFrame with a 'SMILES' column.
        tokenizer: Tokenizer matching ``model``; also saved next to the model.
        device: Device the batches are moved to (the model must already be there).
        num_epochs: Maximum number of epochs.
        batch_size: Number of SMILES strings per batch.
        learning_rate: Learning rate for AdamW.
        patience: Stop after this many consecutive epochs without improvement.
        save_path: Directory the best checkpoint (model + tokenizer) is written
            to. Defaults to the directory that was previously hard-coded.

    Returns:
        The same ``model`` object with the weights of the LAST epoch that ran,
        still in training mode. The best-loss weights are only available from
        ``save_path``.

    Raises:
        ValueError: If the training dataloader yields no batches.
    """
    print("Starting unsupervised fine-tuning...")

    # Data collator for MLM: applies the random masking and builds the `labels` tensor
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    # Hand the collator to the DataLoader directly, so each batch arrives already
    # masked instead of being stacked by the default collate and un-stacked again.
    mlm_dataset = MLMDataset(train_df['SMILES'].tolist(), tokenizer)
    mlm_dataloader = DataLoader(
        mlm_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=data_collator,
    )

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Tracking best model
    best_loss = np.inf  # Initialize best loss as infinity
    os.makedirs(save_path, exist_ok=True)

    # Early stopping variables
    patience_counter = 0

    # Training loop
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0.0
        steps = 0
        train_iterator = tqdm(mlm_dataloader, desc=f"MLM Epoch {epoch + 1}/{num_epochs}")

        for mlm_inputs in train_iterator:
            # Move tensors to device
            input_ids = mlm_inputs['input_ids'].to(device)
            attention_mask = mlm_inputs['attention_mask'].to(device)
            labels = mlm_inputs['labels'].to(device)

            optimizer.zero_grad()

            # Forward pass; the model computes the MLM loss itself when given labels
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            total_loss += batch_loss
            steps += 1

            train_iterator.set_postfix({'mlm_loss': '{:.3f}'.format(batch_loss)})

        if steps == 0:
            raise ValueError("MLM dataloader yielded no batches; train_df has no SMILES rows")

        # Compute average loss for the epoch
        avg_loss = total_loss / steps
        print(f"\nEpoch {epoch + 1} - Average MLM Loss: {avg_loss:.3f}")

        # Check if this is the best model so far
        if avg_loss < best_loss:
            print(f"Saving to {save_path}...")
            best_loss = avg_loss
            patience_counter = 0  # Reset patience
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
        else:
            patience_counter += 1
            print(f"No improvement for {patience_counter}/{patience} epochs.")

        # Early stopping condition
        if patience_counter >= patience:
            print("Training stopped")
            break

    print(f"Best model saved with MLM Loss: {best_loss:.3f}")
    return model

In [None]:
# Perform MLM fine-tuning
# Runs for up to 100 epochs, stopping early once the average MLM training loss has not
# improved for `patience` consecutive epochs. The lowest-loss checkpoint is written to
# disk by the function; the object returned here is the in-memory model after its last
# training step.
mlm_model = unsupervised_finetuning(
    model=mlm_model,
    train_df=train_df,
    tokenizer=tokenizer,
    device=device,
    num_epochs=100,
    patience=5  # Stops if no improvement for 5 epochs
)

# 3.Fine-Tune for Comparison
After performing unsupervised fine-tuning on the training data, we now fine-tune the model on the regression task with the regression head. By comparing the performance of the model before and after unsupervised fine-tuning, you can evaluate how the unsupervised fine-tuning impacts the model's performance on our target task.


In [None]:
def compare_models(origModel, mlm_model, train_dataloader, test_dataloader,
                   num_epochs=100, learning_rate=2e-5,
                   save_path="finetuneComparison_training_plot.png"):
    """
    Compare performance of original and MLM fine-tuned models

    The original model is NOT retrained here: its previously recorded statistics
    are reused. Only the MLM fine-tuned model gets a regression head and is
    trained on the regression task.

    Args:
        origModel: Despite the name, the statistics dict produced by an earlier
            train_and_evaluate run of the original model (keys 'r2_scores',
            'test_losses', 'final_r2', 'final_rmse' are read).
        mlm_model: MLM fine-tuned model. It is deep-copied, so the caller's
            instance is not modified by the regression training.
        train_dataloader: DataLoader for the regression training data.
        test_dataloader: DataLoader for the regression test data.
        num_epochs: Epochs of regression training for the MLM model.
        learning_rate: Learning rate for the regression training.
        save_path: File the comparison figure is written to; pass None or ''
            to skip saving.

    Returns:
        tuple: (original_stats, mlm_stats), both in the train_and_evaluate format.
    """
    # stdlib; imported locally so the function does not depend on a notebook-level import
    import copy

    # Wrap a private copy of the MLM model with a regression head and place it
    # on the notebook-level `device`.
    mlm_regression = MoLFormerMLMWithRegressionHead(copy.deepcopy(mlm_model)).to(device)

    print("\nRetrieving original model stats...")
    original_stats = origModel

    # Train and evaluate MLM fine-tuned model
    print("\nTraining MLM fine-tuned model...")
    mlm_stats = train_and_evaluate(
        model=mlm_regression,
        train_dataloader=train_dataloader,
        test_dataloader=test_dataloader,
        num_epochs=num_epochs,
        learning_rate=learning_rate
    )

    # Compare results
    print("\nFinal Results Comparison:")
    print("-" * 50)
    print("Original Model:")
    print(f"Final R² Score: {original_stats['final_r2']:.3f}")
    print(f"Final RMSE: {original_stats['final_rmse']:.3f}")
    print("\nMLM Fine-tuned Model:")
    print(f"Final R² Score: {mlm_stats['final_r2']:.3f}")
    print(f"Final RMSE: {mlm_stats['final_rmse']:.3f}")

    # Plot comparison
    plt.figure(figsize=(15, 5))

    # Plot R² scores
    plt.subplot(1, 2, 1)
    plt.plot(original_stats['r2_scores'], label='Original Model')
    plt.plot(mlm_stats['r2_scores'], label='MLM Fine-tuned Model')
    plt.title('R² Score Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('R² Score')
    plt.legend()

    # Plot test losses
    plt.subplot(1, 2, 2)
    plt.plot(original_stats['test_losses'], label='Original Model')
    plt.plot(mlm_stats['test_losses'], label='MLM Fine-tuned Model')
    plt.title('Test Loss Comparison')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()

    return original_stats, mlm_stats

In [None]:
# Compare the models
# NOTE: despite its name, `origModel` receives the statistics dict returned by the earlier
# train_and_evaluate() run, not a model; only the MLM fine-tuned model is trained here.
original_stats, mlm_stats = compare_models(
    origModel=training_stats_origModel,
    mlm_model=mlm_model,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader
)