In [None]:
# In this notebook I train a neural network to predict experimental bandgap. 
# I have modified the approach taught in Dr Alex Ganose's Data Analytics module so that the nn parameters are assigned in a class
# I then train the model using cross-validation to assess it's performance
# MAE (0.47) is worse than XGBoost benchmark (0.43), but that it expected for supervised learning 

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
from sklearn.model_selection import KFold
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler

In [16]:
# load experimental data into pandas dataframe

df = pd.read_csv("team-a.csv")
df = df.drop(['formula'],axis=1)

In [50]:
# split experimental data

X = df[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values


y = df['gap expt'].values
y = y.reshape(-1,1)

# some dodgy variable names here - but simpler for implementing cross val later, just want to save x_val and y_val for later

X,X_val_holdout,y,y_val_holdout = train_test_split(X,y,test_size=0.2,random_state=42)


In [51]:
# try and define nn as a class to make it easier to work with
# i think hidden_size = 128 and num_classes = 1

class SimpleNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        out = self.relu(out)
        out = self.fc4(out)

        return out

In [40]:
# set up k-fold cross validation

k = 5
kf = KFold(n_splits=k,shuffle=True)

In [52]:
# initialise and train model
# key point here - need to split data, scale, covert to tensors, and load for each fold for cross validation, then train model. 

input_size = X.shape[1]
hidden_size = 128
num_classes = 1

val_losses = []

for train_index, val_index in kf.split(X):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]

    # scale data. be careful, need seperate x scaler and y scaler because they are different shapes
    # also watch out for fit_transform for the train data and .transform for test. fit_transform calculates mean/std from data as well as tranformation 
    scaler_X = StandardScaler()
    scaler_y = StandardScaler()

    X_train_scaled = scaler_X.fit_transform(X_train)
    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1,1))
    X_val_holdout_scaled = scaler_X.transform(X_val_holdout)
    y_val_holdout_scaled = scaler_y.transform(y_val_holdout.reshape(-1,1))

    # convert to tensors
    X_train_tensor = torch.FloatTensor(X_train_scaled)
    y_train_tensor = torch.FloatTensor(y_train_scaled)
    X_val_tensor = torch.FloatTensor(X_val_holdout_scaled)
    y_val_tensor = torch.FloatTensor(y_val_holdout_scaled)

    # create dataloader - this feeds data in batches during training 
    train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor),batch_size=32,shuffle=True)

    # create model
    model = SimpleNet(input_size, hidden_size, num_classes)

    # use different criterion for classification problems. this optimiser is standard, can change lr if results are poor
    criterion = nn.L1Loss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    # training loop
    num_epochs = 100
    for epoch in range(num_epochs):
        # proper implementation of dataloader:
        for batch_X, batch_y in train_loader:
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # evaluation - calculates accuracies and adds to list initialised at start
    with torch.no_grad():
        outputs = model(X_val_tensor)
        outputs_original = scaler_y.inverse_transform(outputs.numpy())
        y_val_original = scaler_y.inverse_transform(y_val_tensor.numpy())

        val_loss = np.mean(np.abs(outputs_original -y_val_original))
        val_losses.append(val_loss)
        print(f"fold validation MAE:{val_loss.item()}")
            
# calculates accuracy
avg_val_loss = np.mean(val_losses)
print(f"MAE:{avg_val_loss}")
print(f"mae standard deviation:{np.std(val_losses)}")

fold validation MAE:0.4925520122051239
fold validation MAE:0.47664642333984375
fold validation MAE:0.4583447575569153
fold validation MAE:0.45019763708114624
fold validation MAE:0.4806918799877167
MAE:0.47168654203414917
mae standard deviation:0.015367762185633183
