In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from matplotlib import pyplot as plt

In [None]:
# A lot of bandgap values in the dataset are '0' corresponding to conductors 

# two-step approach: a classifier predicts whether the bandgap is 0 or > 0, and a regressor predicts the bandgap if > 0

# try on reduced features for now, if poor classifier performance reclassify feature importance. could be worth doing anyway incase differences in feature importance for tasks. 

# 1. encode dataset for classifier 
# 2. train classifier 
# 3. prep dataset for regressor (remove all '0' values)
# 4. train regressor 
# 5. somehow make them all work together 

# results: mae = 0.37 vs XGBoost regressor mae = 0.43, 14% improvement

# next steps:
# could fine-tune to try and improve this? 
# investigate features, maybe try on all 132 features?
# investigate improving functionality of object to make it more deployable  

In [32]:
# load experimental data into pandas dataframe

df_exp = pd.read_csv("team-a.csv")
df_exp = df_exp.drop(['formula'],axis=1)



In [33]:
# add zero-filled column in dataframe, change val to 1 if 'gap expt' > 0

df_exp['class binary'] = 0

for i in range(4603):
    if df_exp['gap expt'].loc[df_exp.index[i]] > 0:
        df_exp['class binary'].loc[df_exp.index[i]] = 1


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_exp['class binary'].loc[df_exp.index[i]] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_exp['class b

In [8]:
# access 34 important features 

X_exp = df_exp[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values


y_exp = df_exp['class binary'].values
y_exp = y_exp.reshape(-1,1)

X_train_exp,X_test_exp,y_train_exp,y_test_exp = train_test_split(X_exp,y_exp,test_size=0.2,random_state=42)

In [9]:
# train model

xgb_classifier = XGBClassifier(random_state=42)

# use accuracy instead of mae 

scores = cross_val_score(xgb_classifier, X_train_exp, y_train_exp, cv=5, scoring="accuracy")
print(f"Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")

Accuracy: 0.90 (+/- 0.02)


In [10]:
# tune hyperparameters, same as regressor but change mae for accuracy 

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0]
}

random_search = RandomizedSearchCV(
    xgb_classifier, param_dist, n_iter=10, cv=5, random_state=42
)

random_search.fit(X_train_exp, y_train_exp)

print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best accuracy: {random_search.best_score_:.2f}")

Best hyperparameters: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 7, 'learning_rate': 0.3}
Best accuracy: 0.91


In [11]:
# sightly better. good accuracy!

xgb_classifier = XGBClassifier(subsample=0.8, n_estimators=300, max_depth=7, learning_rate=0.3, random_state=42)

scores = cross_val_score(xgb_classifier, X_train_exp, y_train_exp, cv=5, scoring="accuracy")
print(f"Accuracy: {scores.mean():.2f} (+/- {scores.std() * 2:.2f})")

Accuracy: 0.91 (+/- 0.02)


In [13]:
# now to train a regressor 

# only use values where 'class encoded' != 0

df_exp = df_exp[df_exp['class binary'] != 0]

print(df_exp)

X_exp = df_exp[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values


y_exp = df_exp['gap expt'].values
y_exp = y_exp.reshape(-1,1)

X_train_exp,X_test_exp,y_train_exp,y_test_exp = train_test_split(X_exp,y_exp,test_size=0.2,random_state=42)

      gap expt  MagpieData minimum Number  MagpieData maximum Number  \
2         1.83                       16.0                       82.0   
3         1.51                       32.0                       82.0   
6         1.98                       16.0                       47.0   
7         0.90                       32.0                       47.0   
8         2.47                       47.0                       80.0   
...        ...                        ...                        ...   
4584      0.55                       28.0                       51.0   
4586      4.99                        8.0                       40.0   
4592      2.75                       16.0                       40.0   
4596      2.00                       34.0                       40.0   
4599      1.72                        7.0                       73.0   

      MagpieData range Number  MagpieData mean Number  \
2                        66.0               36.275862   
3                    

In [16]:
#XGBoost without tuning

xgb_model = XGBRegressor(random_state=42)

scores = cross_val_score(xgb_model, X_train_exp, y_train_exp, cv=5, scoring="neg_mean_absolute_error")
mae_scores = -scores
mae_scores
print(f"Mean absolute error: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")

Mean absolute error: 0.43 (+/- 0.11)


In [17]:
# tune hyperparameters
# this approach uses randomsearch which randomly goes through combinations of parameters
# for different datasets different tuning methods may be better
# BO - best for slow training (large dataset) and want to minimise the number of times you train
# grid search - best for small hyperparameter space

param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0]
}

random_search = RandomizedSearchCV(
    xgb_model, param_dist, n_iter=10, cv=5, random_state=42
)

random_search.fit(X_train_exp, y_train_exp)

print(f"Best hyperparameters: {random_search.best_params_}")
print(f"Best mean absolute error: {-random_search.best_score_:.2f}")

Best hyperparameters: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.1}
Best mean absolute error: -0.81


In [18]:
# tuning hyperparameters does not significantly improve mae - indictive of reaching predictive limit on smaller subset of data? 
# note: have tried on all features, gives same performance once hyperparameters are tuned. so reduced features are fine for this 

xgb_model = XGBRegressor(subsample=0.8, n_estimators=200, max_depth=7, learning_rate=0.1,random_state=42)

scores = cross_val_score(xgb_model, X_train_exp, y_train_exp, cv=5, scoring="neg_mean_absolute_error")
mae_scores = -scores
mae_scores
print(f"Mean absolute error: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")

Mean absolute error: 0.42 (+/- 0.10)


In [19]:
df_exp = pd.read_csv("team-a.csv")
df_exp = df_exp.drop(['formula'],axis=1)

X = df_exp[['MagpieData maximum MendeleevNumber', 'MagpieData mean AtomicWeight',
       'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT',
       'MagpieData mean MeltingT', 'MagpieData minimum Column',
       'MagpieData range Column', 'MagpieData avg_dev Column',
       'MagpieData mode Column', 'MagpieData range Row', 'MagpieData mean Row',
       'MagpieData range Electronegativity',
       'MagpieData avg_dev Electronegativity',
       'MagpieData mode Electronegativity', 'MagpieData mean NpValence',
       'MagpieData maximum NdValence', 'MagpieData range NdValence',
       'MagpieData mean NdValence', 'MagpieData maximum NfValence',
       'MagpieData mean NfValence', 'MagpieData mean NValence',
       'MagpieData mode NValence', 'MagpieData maximum NpUnfilled',
       'MagpieData range NpUnfilled', 'MagpieData mean NpUnfilled',
       'MagpieData range NUnfilled', 'MagpieData mean NUnfilled',
       'MagpieData mode NUnfilled', 'MagpieData minimum GSvolume_pa',
       'MagpieData mode GSvolume_pa', 'MagpieData maximum GSbandgap',
       'MagpieData range GSbandgap', 'MagpieData mode GSbandgap',
       'MagpieData mean GSmagmom', 'MagpieData mode SpaceGroupNumber']].values

y = df_exp['gap expt'].values

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [None]:
# we have a classifier and a regressor, now we need to combine them...
# bundle into class?

class TwoStepModel:
    """"class to combine classifier and regressor models into one model"""

    def __init__(self,classifier,regressor):
        self.classifier = classifier
        self.regressor = regressor

    # need this for cross validation so sklearn can call the model params for each fold 
    def get_params(self, deep=True):
        return {
            'classifier': self.classifier,
            'regressor': self.regressor}

    def fit(self, X_train, y_train):
        """pass in X_train and y_train as pandas dataframes and fit models"""
        y_binary = (y_train > 0).astype(int)
        self.classifier.fit(X_train, y_binary)

        # mask to filter all y values that are non zero
        mask = y_train > 0
        self.regressor.fit(X_train[mask],y_train[mask])
        
        return self 

    def predict(self,X_test):
        """pass in X_test to get y_pred"""
        # give array populated by '0' and '1':
        binary_predictions  = self.classifier.predict(X_test)

        # create correct shape array populated by zeros:
        y_pred = np.zeros(len(X_test))

        # which predictions are '1'?
        mask = binary_predictions == 1  
        
        # how can i match the values up? i have a numpy array of binary values 
        # make another array with all self.regressor.predict(X) values
        # use mask to fill in relevent values from regressor preds into final y_pred

        regressor_pred = self.regressor.predict(X_test)
        y_pred[mask] = regressor_pred[mask]

        return(y_pred)


In [None]:
# call the new class and perform cross-validation to assess performance: 

model = TwoStepModel(XGBClassifier(subsample=0.8, n_estimators=300, max_depth=7, learning_rate=0.3, random_state=42),
                     XGBRegressor(subsample=0.8, n_estimators=200, max_depth=7, learning_rate=0.1,random_state=42)
                    )

model.fit(X_train,y_train)

scores = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_absolute_error")
mae_scores = -scores
print(f"Mean absolute error: {mae_scores.mean():.2f} (+/- {mae_scores.std() * 2:.2f})")


Mean absolute error: 0.37 (+/- 0.03)
