In [None]:
import pandas as pd
import kagglehub
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, BertConfig, BertModel

# ------------------------------------------------------
# 1. Toy corpus with urgency scores
# ------------------------------------------------------

# Download latest version
path = kagglehub.dataset_download("shivanshuman/quantum-machine-8-aka-qm8")

df = pd.read_csv(path, nrows=100)

# Extract the first column as a list of strings
toy_sentences = df.iloc[:, 0].astype(str).tolist()

# Extract the second column as a PyTorch float tensor
toy_urgency = torch.tensor(df.iloc[:, 1].values, dtype=torch.float32)

# ------------------------------------------------------
# 2. Dataset class
# ------------------------------------------------------

class UrgencyDataset(Dataset):
    def __init__(self, sentences, scores, tokenizer, max_len=32):
        self.sentences = sentences
        self.scores = scores
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        text = self.sentences[idx]
        score = self.scores[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            padding="max_length",
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": score
        }

# -------------------------
# 3) MLP head (configurable)
# -------------------------
class MLPHead(nn.Module):
    def __init__(self, input_dim, hidden_sizes, dropout=0.1, activation=nn.GELU):
        """
        input_dim: dimensionality of encoder pooled vector (e.g., 768)
        hidden_sizes: list of ints for successive hidden layers; final output -> 1
        dropout: dropout prob between layers
        activation: activation class (not instance), default GELU
        """
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_sizes:
            layers.append(nn.Linear(prev, h))
            layers.append(activation())
            layers.append(nn.Dropout(dropout))
            prev = h
        # final linear to single logit
        layers.append(nn.Linear(prev, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # x: (B, input_dim)
        logits = self.net(x)          # (B,1)
        return logits.squeeze(-1)     # (B,)

# -------------------------
# 4) Full model with BERT + custom MLP head
# -------------------------
class TransformerUrgencyRegressor(nn.Module):
    def __init__(self, model_name="bert-large-uncased", mlp_hidden_sizes=None, mlp_dropout=0.1):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        H = self.encoder.config.hidden_size
        # default MLP: 8x expansion then a smaller layer
        if mlp_hidden_sizes is None:
            mlp_hidden_sizes = [H * 8, 512]
        self.head = MLPHead(input_dim=H, hidden_sizes=mlp_hidden_sizes, dropout=mlp_dropout)

        # Optional: freeze encoder for fast experiments (uncomment to use)
        # for p in self.encoder.parameters():
        #     p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # Use CLS embedding (position 0). Alternatively use pooled_output if available:
        cls_rep = outputs.last_hidden_state[:, 0, :]  # (B, H)
        logit = self.head(cls_rep)                   # (B,)
        return logit

# -------------------------
# 5) Training setup
# -------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")

dataset = UrgencyDataset(toy_sentences, toy_urgency, tokenizer)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

# instantiate model with a custom MLP: change the list to whatever you want
model = TransformerUrgencyRegressor(model_name="bert-large-uncased",
                                    mlp_hidden_sizes=[768 * 8, 512],
                                    mlp_dropout=0.25).to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.01)
loss_fn = nn.BCEWithLogitsLoss()

# -------------------------
# 6) Training loop
# -------------------------
epochs = 200
print("Training...\n")

for epoch in range(epochs):
    total_loss = 0.0
    num_examples = 0
    model.train()

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        logits = model(input_ids=input_ids, attention_mask=attention_mask)  # (B,)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        batch_size = input_ids.size(0)
        total_loss += loss.item() * batch_size
        num_examples += batch_size

    avg_loss = total_loss / max(1, num_examples)
    print(f"Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.6f}")

# ------------------y------------------------------------
# 6. Test predictions
# ------------------------------------------------------

print("\nPredictions:")
model.eval()
with torch.no_grad():
    for i, sentence in enumerate(toy_sentences):
        encoding = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True).to(device)
        logit = model(encoding["input_ids"], encoding["attention_mask"])
        score = torch.sigmoid(logit).item()  # convert logit â†’ 0-1
        print(f"Sentence: '{sentence}'")
        print(f"  True urgency: {toy_urgency[i].item():.2f}")
        print(f"  Predicted urgency: {score:.2f}\n")


Training...

Epoch 1/200 - Avg Loss: 0.532889
Epoch 2/200 - Avg Loss: 0.531076
Epoch 3/200 - Avg Loss: 0.531081
Epoch 4/200 - Avg Loss: 0.530748
Epoch 5/200 - Avg Loss: 0.530831
Epoch 6/200 - Avg Loss: 0.530950
Epoch 7/200 - Avg Loss: 0.530921
Epoch 8/200 - Avg Loss: 0.531181
Epoch 9/200 - Avg Loss: 0.530891
Epoch 10/200 - Avg Loss: 0.530822


In [12]:
with torch.no_grad():
    for i, sentence in enumerate(toy_sentences):
        encoding = tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        ).to(device)

        # Correct way:
        output = model(**encoding)
        logit = output.logits.squeeze(-1)          # shape: (1,) â†’ scalar
        score = torch.sigmoid(logit).item()        # â†’ float between 0 and 1

        print(f"Sentence: '{sentence}'")
        print(f"  True urgency: {toy_urgency[i].item():.4f}")
        print(f"  Predicted urgency: {score:.4f}\n")

Sentence: '[H]C([H])([H])[H]'
  True urgency: 0.4330
  Predicted urgency: 0.5559

Sentence: '[H]N([H])[H]'
  True urgency: 0.2652
  Predicted urgency: 0.5559

Sentence: '[H]O[H]'
  True urgency: 0.2865
  Predicted urgency: 0.5559

Sentence: '[H]C#C[H]'
  True urgency: 0.3586
  Predicted urgency: 0.5559

Sentence: '[H]C#N'
  True urgency: 0.3200
  Predicted urgency: 0.5559

Sentence: '[H]C([H])=O'
  True urgency: 0.1539
  Predicted urgency: 0.5559

Sentence: '[H]C([H])([H])C([H])([H])[H]'
  True urgency: 0.3761
  Predicted urgency: 0.5559

Sentence: '[H]OC([H])([H])[H]'
  True urgency: 0.2667
  Predicted urgency: 0.5559

Sentence: '[H]C#CC([H])([H])[H]'
  True urgency: 0.2734
  Predicted urgency: 0.5559

Sentence: '[H]C([H])([H])C#N'
  True urgency: 0.3197
  Predicted urgency: 0.5559

Sentence: '[H]C(=O)C([H])([H])[H]'
  True urgency: 0.1657
  Predicted urgency: 0.5559

Sentence: '[H]C(=O)N([H])[H]'
  True urgency: 0.2163
  Predicted urgency: 0.5559

Sentence: '[H]C([H])([H])C([H])([H])

In [13]:
import numpy as np
print("Label statistics:")
print(f"  Mean: {toy_urgency.mean():.4f}")
print(f"  Std : {toy_urgency.std():.4f}")
print(f"  Min/Max: {toy_urgency.min():.4f} / {toy_urgency.max():.4f}")

Label statistics:
  Mean: 0.2299
  Std : 0.0468
  Min/Max: 0.0873 / 0.5138


# Testing block

In [None]:
import math
import numpy as np
import pandas as pd
import torch

# -------------------------
# Diagnostics: show df shape & columns
# -------------------------
print("Original df shape:", getattr(df, "shape", None))
print("Columns:", list(df.columns))

# If column names are weird, show first few rows to inspect
print("\nFirst 10 rows of df (for inspection):")
display(df.head(10)) if "display" in globals() else print(df.head(10).to_string(index=False))

# -------------------------
# Choose a safe slice for rows 101..121 (inclusive)
# -------------------------
START = 20001
END_EXCLUSIVE = 20101

n = len(df)
if START >= n:
    # If df has fewer rows than START, fall back to last 21 rows (or all rows if <21)
    fallback_count = min(21, n)
    start_idx = max(0, n - fallback_count)
    end_idx = n
    print(f"\nWARNING: df has only {n} rows, so rows {START}-{END_EXCLUSIVE-1} don't exist.")
    print(f"Falling back to rows {start_idx}..{end_idx-1} (last {fallback_count} rows).")
else:
    start_idx = START
    end_idx = min(END_EXCLUSIVE, n)
    print(f"\nSelecting rows {start_idx}..{end_idx-1} from df (if available).")

df_test = pd.read_csv(r"/Users/Ed/Downloads/qm8.csv", nrows=21000)
df_test = df_test.iloc[20001:20101].reset_index(drop=True)
df_test = df_test[['smiles', 'E1-CC2']]
print("df_test shape:", df_test.shape)

# -------------------------
# Identify text and label columns (robustly)
# -------------------------
if "smiles" in df_test.columns:
    text_col = "smiles"
else:
    text_col = df_test.columns[0]

# your label column seems to be 'E1-CC2' in this run â€” if present, use it; else second column
if "E1-CC2" in df_test.columns:
    label_col = "E1-CC2"
elif "urgency" in df_test.columns:
    label_col = "urgency"
else:
    # fallback to second column if present
    label_col = df_test.columns[1] if df_test.shape[1] > 1 else None

print(f"Using text column: '{text_col}', label column: '{label_col}'")

# -------------------------
# Drop rows missing text or labels
# -------------------------
if label_col is None:
    raise RuntimeError("No suitable label column found in df_test. Please ensure a numeric label column exists.")

# Coerce label to numeric and drop NaNs
df_test[label_col] = pd.to_numeric(df_test[label_col], errors="coerce")
before_drop = len(df_test)
df_test = df_test.dropna(subset=[text_col, label_col]).reset_index(drop=True)
after_drop = len(df_test)
print(f"Dropped {before_drop - after_drop} rows with missing text or non-numeric labels. Remaining: {after_drop}")

if len(df_test) == 0:
    raise RuntimeError("No rows left in df_test after cleaning. Please check your DataFrame and indices.")

# -------------------------
# Prediction helper (batched)
# -------------------------
model.eval()

def predict_batch(texts, batch_size=16):
    preds = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        # tokenizer can accept a list for batching
        enc = tokenizer(batch_texts,
                        truncation=True,
                        padding=True,
                        max_length=32,
                        return_tensors="pt")
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc["attention_mask"].to(device)
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask)  # shape (B,)
            probs = torch.sigmoid(logits).cpu().numpy()
        preds.extend(probs.tolist())
    return np.array(preds)

# Run predictions
texts = df_test[text_col].astype(str).tolist()
preds = predict_batch(texts, batch_size=16)
df_test["predicted_urgency"] = preds

# Clamp labels to [0,1] just in case
df_test[label_col] = df_test[label_col].clip(0.0, 1.0)

# -------------------------
# Metrics: MAE, Pearson r
# -------------------------
df_test["abs_error"] = (df_test[label_col] - df_test["predicted_urgency"]).abs()
mae = df_test["abs_error"].mean()

def safe_pearson(a, b):
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if len(a) < 2:
        return float("nan")
    am = a - a.mean()
    bm = b - b.mean()
    denom = math.sqrt((am**2).sum() * (bm**2).sum())
    if denom == 0:
        return float("nan")
    return float((am * bm).sum() / denom)

pearson = safe_pearson(df_test[label_col].values, df_test["predicted_urgency"].values)

# -------------------------
# Print results (sorted by abs error)
# -------------------------
print(f"\nMAE on selected slice: {mae:.4f}")
print(f"Pearson r: {pearson:.4f}\n")

print("Results (sorted by abs_error desc):")
print(df_test[[text_col, label_col, "predicted_urgency", "abs_error"]]
      .sort_values("abs_error", ascending=False)
      .to_string(index=False, formatters={
          label_col: "{:.4f}".format,
          "predicted_urgency": "{:.4f}".format,
          "abs_error": "{:.4f}".format
      }))

# -------------------------
# Optional: show the raw df_test if you want
# -------------------------
print("\nFinal df_test shape:", df_test.shape)


Original df shape: (1000, 17)
Columns: ['smiles', 'E1-CC2', 'E2-CC2', 'f1-CC2', 'f2-CC2', 'E1-PBE0', 'E2-PBE0', 'f1-PBE0', 'f2-PBE0', 'E1-PBE0.1', 'E2-PBE0.1', 'f1-PBE0.1', 'f2-PBE0.1', 'E1-CAM', 'E2-CAM', 'f1-CAM', 'f2-CAM']

First 10 rows of df (for inspection):
                      smiles   E1-CC2   E2-CC2       f1-CC2   f2-CC2  E1-PBE0  E2-PBE0  f1-PBE0      f2-PBE0  E1-PBE0.1  E2-PBE0.1  f1-PBE0.1    f2-PBE0.1   E1-CAM   E2-CAM  f1-CAM  f2-CAM
           [H]C([H])([H])[H] 0.432952 0.432960 2.497283e-01 0.249736 0.430218 0.430236 0.181436 1.815015e-01   0.430218   0.430236   0.181436 1.815015e-01 0.409931 0.409939  0.1832  0.1832
                [H]N([H])[H] 0.265220 0.350081 6.701544e-02 0.030049 0.268386 0.349106 0.040761 3.164115e-02   0.268386   0.349106   0.040761 3.164115e-02 0.253853 0.334481  0.0575  0.0238
                     [H]O[H] 0.286537 0.363579 3.775532e-02 0.000000 0.291377 0.362091 0.019503 1.000000e-08   0.291377   0.362091   0.019503 1.000000e-08 0.278519 0.35

In [44]:
pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[K     |â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 250 kB 7.1 MB/s eta 0:00:01
[?25hCollecting et-xmlfile
  Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
# 1. Compute the MAE (most common way)
mae = df_test["abs_error"].mean()
print(mae)

import openpyxl
df_test.to_excel("~/Desktop/my_results.xlsx", index=False, engine="openpyxl")


0.1630652495559357


In [None]:
# quick sanity checks
assert 'df_test' in globals(), "df_test not found in globals() â€” please ensure df_test exists"
assert model is not None, "model not found"
assert tokenizer is not None, "tokenizer not found"
assert device is not None, "device not defined"

df_test = pd.read_csv(r"/Users/edwardclayson/Desktop/qm8.csv", nrows=150)
df_test = df_test.iloc[101:122].reset_index(drop=True)
df_test = df_test[['smiles', 'E1-CC2']]
print(df_test)
print(df_test.columns)

                                           smiles    E1-CC2
0   [H]C1([H])C2([H])C([H])([H])C1([H])C2([H])[H]  0.342918
1             [H]C1([H])C2([H])OC1([H])C2([H])[H]  0.265776
2                         [H]C#CC([H])([H])C#C[H]  0.269839
3                            [H]C#CC([H])([H])C#N  0.271908
4                               [H]C([H])(C#N)C#N  0.312861
5                       [H]C#CC([H])([H])C([H])=O  0.164412
6                          [H]C(=O)C([H])([H])C#N  0.164618
7                       [H]N=C([H])N([H])C([H])=O  0.183942
8                            [H]N=C([H])OC([H])=O  0.214191
9                          [H]C(=O)N([H])C([H])=O  0.172651
10                        [H]C#CC#CC([H])([H])[H]  0.204037
11                           [H]C([H])([H])C#CC#N  0.221354
12                      [H]C(=O)C#CC([H])([H])[H]  0.148978
13               [H]OC([H])([H])C#CC([H])([H])[H]  0.248975
14     [H]C([H])([H])C#CC([H])([H])C([H])([H])[H]  0.269998
15          [H]C(=NC([H])([H])[H])OC([H]