In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

In [5]:
# load experimental data into pandas dataframe

df_exp = pd.read_csv("team-a.csv")
df_exp = df_exp.drop(['formula'],axis=1)

In [6]:
# split experimental data

X_exp = df_exp[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values


y_exp = df_exp['gap expt'].values
y_exp = y_exp.reshape(-1,1)


In [7]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X_exp)
y_scaled = scaler_y.fit_transform(y_exp)

X_train,X_val,y_train,y_val = train_test_split(X_scaled,y_scaled,test_size=0.2,random_state=42)

X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train)

X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val)

In [8]:
# load dataloader

from torch.utils.data import DataLoader

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=34, shuffle=True)

In [9]:
# define some model parameters

input_size = X_train.shape[1]

model = nn.Sequential(
    nn.Linear(input_size, 128),   
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 32),
    nn.ReLU(),
    nn.Linear(32, 1)  
)

# and optimiser

import torch.optim as optim

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [10]:
# train model
# Number of complete passes through the dataset

num_epochs = 100

# keep track of the loss for each epoch

train_losses = []
val_losses = []

# Start the training loop

for epoch in range(num_epochs): 
    model.train()
    train_loss = 0.0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch)
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    
    # Iterate over the validation data and compute the loss

    model.eval()
    val_loss = 0.0

    # turn off gradients since we are in the evaluation mode

    with torch.no_grad():
        for X_val, y_val in val_loader:
            val_predictions = model(X_val)
            loss = criterion(val_predictions, y_val)

            # Add the loss for this batch to the validation loss

            val_loss += loss.item()

    train_losses.append(train_loss/len(train_loader))
    val_losses.append(val_loss/len(val_loader))

    print(f"Epoch [{epoch+1}/{num_epochs}], Train loss: {train_losses[-1]:.4f}, Val loss: {val_losses[-1]:.4f}")

Epoch [1/100], Train loss: 0.8221, Val loss: 0.7594
Epoch [2/100], Train loss: 0.6317, Val loss: 0.6152
Epoch [3/100], Train loss: 0.5515, Val loss: 0.5772
Epoch [4/100], Train loss: 0.5005, Val loss: 0.5182
Epoch [5/100], Train loss: 0.4670, Val loss: 0.5414
Epoch [6/100], Train loss: 0.4304, Val loss: 0.5086
Epoch [7/100], Train loss: 0.4064, Val loss: 0.5155
Epoch [8/100], Train loss: 0.3963, Val loss: 0.4824
Epoch [9/100], Train loss: 0.3849, Val loss: 0.4681
Epoch [10/100], Train loss: 0.3539, Val loss: 0.4462
Epoch [11/100], Train loss: 0.3417, Val loss: 0.4588
Epoch [12/100], Train loss: 0.3348, Val loss: 0.4644
Epoch [13/100], Train loss: 0.3237, Val loss: 0.4489
Epoch [14/100], Train loss: 0.3030, Val loss: 0.4703
Epoch [15/100], Train loss: 0.3098, Val loss: 0.4478
Epoch [16/100], Train loss: 0.3012, Val loss: 0.4291
Epoch [17/100], Train loss: 0.2825, Val loss: 0.4574
Epoch [18/100], Train loss: 0.2813, Val loss: 0.4362
Epoch [19/100], Train loss: 0.2720, Val loss: 0.4627
Ep

In [11]:
# calculate mae, dont fully understand this code -check

from sklearn.metrics import mean_absolute_error

# Set model to evaluation mode
model.eval()

# Get all predictions
all_predictions = []
all_targets = []

with torch.no_grad():
    for X_batch, y_batch in val_loader: 
        predictions = model(X_batch)
        all_predictions.append(predictions)
        all_targets.append(y_batch)

# Concatenate all batches
all_predictions = torch.cat(all_predictions)
all_targets = torch.cat(all_targets)

# Convert to numpy and unscale (IMPORTANT!)
predictions_scaled = all_predictions.numpy()
targets_scaled = all_targets.numpy()

# Inverse transform to get original scale
predictions_original = scaler_y.inverse_transform(predictions_scaled)
targets_original = scaler_y.inverse_transform(targets_scaled)

# Calculate MAE
mae = mean_absolute_error(targets_original, predictions_original)
print(f"Validation MAE: {mae:.4f}")

Validation MAE: 0.4725


In [None]:
# feature engineering 