In [31]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from matplotlib import pyplot as plt

In [32]:
df = pd.read_csv("team-a.csv")
df = df.drop(['formula'],axis=1)

In [33]:
X_exp = df.drop(['gap expt'],axis=1).values
y_exp = df['gap expt'].values
y_exp = y_exp.reshape(-1,1)
X_train_exp,X_test_exp,y_train_exp,y_test_exp = train_test_split(X_exp,y_exp,test_size=0.2,random_state=42)

In [None]:
#XGBoost without tuning - no real improvement on random forest

xgb_model = XGBRegressor(random_state=42)

scores = cross_val_score(xgb_model, X_train_exp, y_train_exp, cv=5, scoring="neg_mean_absolute_error")
mae_scores = -scores
mae_scores
print(f"Mean absolute error: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")

Mean absolute error: 0.46 (+/- 0.03)


In [6]:
# tune hyperparameters
# this approach uses randomsearch which randomly goes through combinations of parameters
# for different datasets different tuning methods may be better
# BO - best for slow training (large dataset) and want to minimise the number of times you train
# grid search - best for small hyperparameter space

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0]
}

random_search = RandomizedSearchCV(
    xgb_model, param_dist, n_iter=10, cv=5, random_state=42
)

random_search.fit(X_train_exp, y_train_exp)

print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best mean absolute error: {-random_search.best_score_:.2f}")

Best hyperparameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1}
Best mean absolute error: -0.69


In [7]:
#XGBoost with tuned hyperparameters - an improvement in MAE

xgb_model = XGBRegressor(n_estimators=200,subsample=0.8,max_depth=7,learning_rate=0.1,random_state=42)

scores = cross_val_score(xgb_model, X_train_exp, y_train_exp, cv=5, scoring="neg_mean_absolute_error")
mae_scores = -scores
mae_scores
print(f"Mean absolute error: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")

Mean absolute error: 0.43 (+/- 0.02)


In [49]:
# feature importance for xgboost. same MAE if you exlude 74% least important features 

xgb_model.fit(X_train_exp, y_train_exp)
importances = xgb_model.feature_importances_

threshold = np.percentile(importances, 74)
important_features = importances > threshold
X_train_reduced = X_train_exp[:, important_features]

In [None]:
# retraining with important features only

xgb_model = XGBRegressor(n_estimators=200,subsample=0.8,max_depth=7,learning_rate=0.1,random_state=42)

scores = cross_val_score(xgb_model, X_train_reduced, y_train_exp, cv=5, scoring="neg_mean_absolute_error")
mae_scores = -scores
mae_scores
print(f"Mean absolute error: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")

Mean absolute error: 0.43 (+/- 0.01)


In [None]:
# which features are important? 34 features in total 
# first add extra False into array to make it correct shape

important_features = np.insert(important_features,0,False)

# and extract important feature names
important_feature_names = df.columns[important_features]
print(important_feature_names)

Index(['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'M