In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from matplotlib import pyplot as plt

In [2]:
# load experimental data into pandas dataframe

df_exp = pd.read_csv("team-a.csv")
df_exp = df_exp.drop(['formula'],axis=1)

In [28]:
# split experimental data

X_exp = df_exp[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values


y_exp = df_exp['gap expt'].values
y_exp = y_exp.reshape(-1,1)
X_train_exp,X_test_exp,y_train_exp,y_test_exp = train_test_split(X_exp,y_exp,test_size=0.2,random_state=42)

In [4]:
# load dft data into pandas dataframe

from matminer.datasets import load_dataset, get_all_dataset_info
import warnings

warnings.filterwarnings("ignore")  # ignore warnings during featurisation

print(get_all_dataset_info("matbench_mp_gap"))

df_dft = load_dataset("matbench_mp_gap")
df_dft

Dataset: matbench_mp_gap
Description: Matbench v0.1 test dataset for predicting DFT PBE band gap from structure. Adapted from Materials Project database. Removed entries having a formation energy (or energy above the convex hull) more than 150meV and those containing noble gases. Retrieved April 2, 2019. For benchmarking w/ nested cross validation, the order of the dataset must be identical to the retrieved data; refer to the Automatminer/Matbench publication for more details.
Columns:
	gap pbe: Target variable. The band gap as calculated by PBE DFT from the Materials Project, in eV.
	structure: Pymatgen Structure of the material.
Num Entries: 106113
Reference: A. Jain*, S.P. Ong*, G. Hautier, W. Chen, W.D. Richards, S. Dacek, S. Cholia, D. Gunter, D. Skinner, G. Ceder, K.A. Persson (*=equal contributions)
The Materials Project: A materials genome approach to accelerating materials innovation
APL Materials, 2013, 1(1), 011002.
doi:10.1063/1.4812323
Bibtex citations: ["@Article{Dunn2020

Unnamed: 0,structure,gap pbe
0,"[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322
1,"[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0000
2,"[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0000
3,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113
4,"[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514
...,...,...
106108,"[[ 2.91058377 3.61215869 -0.19100541] Ca, [-0...",1.1354
106109,"[[0.07215014 3.75835129 1.91249744] Ta, [2.014...",2.7274
106110,"[[0.99954964 0.70129827 4.70919163] Mg, [ 0.87...",2.8860
106111,"[[0.99298226 0.71146045 4.70710628] Zn, [ 0.86...",2.2330


In [5]:
# generate magpie features for data 

from pymatgen.core import Composition
from matminer.featurizers.composition import ElementProperty

df_dft["composition"] = df_dft["structure"].apply(lambda s: s.composition)

ep = ElementProperty.from_preset("magpie")
ep.set_n_jobs(1)
ep.featurize_dataframe(df_dft, col_id="composition", inplace=True)

ElementProperty:   0%|          | 0/106113 [00:00<?, ?it/s]

In [11]:
# extract important features based on importance in xgboost model and load data

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

X_dft = df_dft[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values
y_dft = df_dft['gap pbe'].values
y_dft = y_dft.reshape(-1,1)

scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X_dft)
y_scaled = scaler_y.fit_transform(y_dft)

X_train_dft,X_val_dft,y_train_dft,y_val_dft = train_test_split(X_scaled,y_scaled,test_size=0.2,random_state=42)

X_train_tensor = torch.FloatTensor(X_train_dft)
y_train_tensor = torch.FloatTensor(y_train_dft)

X_val_tensor = torch.FloatTensor(X_val_dft)
y_val_tensor = torch.FloatTensor(y_val_dft)

In [13]:
# load dataloader

from torch.utils.data import DataLoader

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=34, shuffle=True)

In [23]:
# define some model parameters

input_size = X_train_dft.shape[1]

model = nn.Sequential(
    nn.Linear(input_size, 128),   
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1)  
)

# and optimiser

import torch.optim as optim

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [24]:
# train model
# Number of complete passes through the dataset

num_epochs = 100

# keep track of the loss for each epoch

train_losses = []
val_losses = []

# Start the training loop

for epoch in range(num_epochs): 
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Iterate over the validation data and compute the loss

    model.eval()
    val_loss = 0.0

    # turn off gradients since we are in the evaluation mode

    with torch.no_grad():
        for X_val, y_val in val_loader:
            val_predictions = model(X_val)
            loss = criterion(val_predictions, y_val)

            # Add the loss for this batch to the validation loss

            val_loss += loss.item()

    train_losses.append(train_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))

    print(f"Epoch [{epoch+1}/{num_epochs}], Train loss: {train_losses[-1]:.4f}, Val loss: {val_losses[-1]:.4f}")

Epoch [1/100], Train loss: 0.4336, Val loss: 0.3771
Epoch [2/100], Train loss: 0.3647, Val loss: 0.3602
Epoch [3/100], Train loss: 0.3452, Val loss: 0.3408
Epoch [4/100], Train loss: 0.3304, Val loss: 0.3364
Epoch [5/100], Train loss: 0.3193, Val loss: 0.3365
Epoch [6/100], Train loss: 0.3110, Val loss: 0.3171
Epoch [7/100], Train loss: 0.3018, Val loss: 0.3199
Epoch [8/100], Train loss: 0.2955, Val loss: 0.2984
Epoch [9/100], Train loss: 0.2899, Val loss: 0.3031
Epoch [10/100], Train loss: 0.2841, Val loss: 0.2918
Epoch [11/100], Train loss: 0.2787, Val loss: 0.3038
Epoch [12/100], Train loss: 0.2757, Val loss: 0.3048
Epoch [13/100], Train loss: 0.2692, Val loss: 0.2838
Epoch [14/100], Train loss: 0.2648, Val loss: 0.2776
Epoch [15/100], Train loss: 0.2634, Val loss: 0.2783
Epoch [16/100], Train loss: 0.2593, Val loss: 0.2798
Epoch [17/100], Train loss: 0.2555, Val loss: 0.2727
Epoch [18/100], Train loss: 0.2534, Val loss: 0.2675
Epoch [19/100], Train loss: 0.2505, Val loss: 0.2743
Ep

In [None]:
# calculate mae, dont fully understand this code -check

from sklearn.metrics import mean_absolute_error

# Set model to evaluation mode
model.eval()

# Get all predictions
all_predictions = []
all_targets = []

with torch.no_grad():
    for X_batch, y_batch in val_loader: 
        predictions = model(X_batch)
        all_predictions.append(predictions)
        all_targets.append(y_batch)

# Concatenate all batches
all_predictions = torch.cat(all_predictions)
all_targets = torch.cat(all_targets)

# Convert to numpy and unscale (IMPORTANT!)
predictions_scaled = all_predictions.numpy()
targets_scaled = all_targets.numpy()

# Inverse transform to get original scale
predictions_original = scaler_y.inverse_transform(predictions_scaled)
targets_original = scaler_y.inverse_transform(targets_scaled)

# Calculate MAE
mae = mean_absolute_error(targets_original, predictions_original)
print(f"Validation MAE: {mae:.4f}")

Validation MAE: 0.4535


In [29]:
X_train_scaled = scaler_X.transform(X_train_exp)
X_test_scaled  = scaler_X.transform(X_test_exp)

# Scale y using ORIGINAL target scaler so the output space matches the pretrained head
y_train_scaled = scaler_y.transform(y_train_exp)
y_test_scaled  = scaler_y.transform(y_test_exp)

# Tensors with consistent shapes
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32)  # shape (n,1)
X_test_tensor  = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_tensor  = torch.tensor(y_test_scaled, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader_expt = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [30]:
# train pretrained model on new data

# Freeze all except the last layer
for i, p in enumerate(model.parameters()):
    p.requires_grad = True  # start by enabling all
# If your last layer is model[-1], freeze earlier layers:
for p in list(model.parameters())[:-2]:  # heuristic: freeze everything except last Linear weights+bias
    p.requires_grad = False

optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4, weight_decay=1e-4)
criterion = nn.SmoothL1Loss(beta=1.0)  # Huber loss for robustness
num_epochs = 50

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader_expt:
        optimizer.zero_grad()
        preds = model(X_batch)
        if preds.ndim == 1:
            preds = preds.view(-1, 1)
        loss = criterion(preds, y_batch)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()

    if epoch % 10 == 0:
        avg_loss = train_loss / len(train_loader_expt)
        print(f"Epoch [{epoch}/{num_epochs}], Loss (scaled): {avg_loss:.6f}")

Epoch [0/50], Loss (scaled): 0.153631
Epoch [10/50], Loss (scaled): 0.139267
Epoch [20/50], Loss (scaled): 0.134614
Epoch [30/50], Loss (scaled): 0.132693
Epoch [40/50], Loss (scaled): 0.131463


In [31]:
model.eval()
with torch.no_grad():
    preds_test = model(X_test_tensor)
    if preds_test.ndim == 1:
        preds_test = preds_test.view(-1, 1)

# Debug: MAE in scaled space
from sklearn.metrics import mean_absolute_error
mae_scaled = mean_absolute_error(y_test_tensor.numpy(), preds_test.numpy())
print(f"MAE (scaled space): {mae_scaled:.6f}")

# Inverse-transform with ORIGINAL target scaler to get eV
preds_original   = scaler_y.inverse_transform(preds_test.numpy())
targets_original = y_test_exp  # already in eV, no inverse needed if we didn't re-scale y with a new scaler

# If you followed section C, y_test_exp was scaled with scaler_y; in that case:
# targets_original = scaler_y.inverse_transform(y_test_tensor.numpy())

mae_finetuned = mean_absolute_error(targets_original, preds_original)
print(f"Fine-tuned MAE: {mae_finetuned:.4f}")
print(f"XGBoost baseline: 0.42")

MAE (scaled space): 0.339961
Fine-tuned MAE: 0.5436
XGBoost baseline: 0.42


In [None]:
# now approaching mae for xgboost baseline using transfer learning!!! next steps:
# - try different fine-tuning parameters, can i integrate it to try a load of different parameters and return the best?
# - try improving the pre-trained NN, again can i build a better optimiser?
# - is there a better dataset available than dft bandgap? check email thing 