In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# model imports
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from lightgbm import LGBMRegressor, LGBMClassifier 

from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

In [None]:
class RegressionTest:
    """Compare a fixed set of untuned tree-based regressors with 5-fold cross-validation.

    The constructor arguments are stored as given; nothing is copied or validated.
    """

    def __init__(self, X, y, rand_state):
        """Store the features, the target and the random seed.

        Parameters
        ----------
        X : feature data, handed unchanged to ``cross_val_score``.
        y : target data, handed unchanged to ``cross_val_score``.
        rand_state : value convertible with ``int()``; used as ``random_state``
            for every model that is benchmarked.
        """
        self.X = X
        self.y = y
        self.rand_state = rand_state

    def _build_models(self):
        """Return a fresh ``{display name: untuned estimator}`` mapping, all sharing one seed."""
        seed = int(self.rand_state)
        return {
            "DECISION TREE REGRESSOR": DecisionTreeRegressor(random_state=seed),
            "RANDOM FOREST REGRESSOR": RandomForestRegressor(random_state=seed, verbose=0),
            "XGB REGRESSOR": XGBRegressor(random_state=seed),
            "CATBOOST REGRESSOR": CatBoostRegressor(random_state=seed, verbose=0),
            "LIGHT GBM REGRESSOR": LGBMRegressor(random_state=seed, verbose=0),
        }

    def test_models(self, scoring):
        """Run the untuned models against one another and print their cross-validated scores.

        Parameters
        ----------
        scoring : ``'MAE'`` for mean absolute error or ``'r2'`` for the
            coefficient of determination. Any other value prints a usage hint
            and nothing is trained.

        Returns
        -------
        None. One progress line is printed per model, followed by one summary
        line per model of the form ``mean (+/- 2 * std)`` over the 5 folds.
        """
        # Resolve the metric first so an unsupported value is rejected before
        # any estimator is constructed or trained.
        if scoring == 'MAE':
            sk_scoring, label, negate = "neg_mean_absolute_error", "MAE", True
        elif scoring == 'r2':
            sk_scoring, label, negate = "r2", "r2", False
        else:
            print('input r2 or MAE as an argument')
            return

        score_dict = {}
        for name, model in self._build_models().items():
            print(f"---- TRAINING {name} ----")
            scores = cross_val_score(model, self.X, self.y, cv=5, scoring=sk_scoring)
            if negate:
                # The scorer returns negated MAE; flip the sign back so the
                # reported error is positive.
                scores = -scores
            score_dict[name] = f"{label}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})"

        print("---- TRAINING COMPLETE ----")
        for name, summary in score_dict.items():
            print(f'{name} {summary}')

    def tune_params(self):
        """Placeholder for hyper-parameter tuning; not implemented yet.

        Raises
        ------
        NotImplementedError
            Always, until a tuning strategy is written.
        """
        raise NotImplementedError("tune_params is not implemented yet")

In [21]:
# some test data
# NOTE(review): pd.read_pickle unpickles the file and can therefore execute
# arbitrary code; only load pickle files that come from a trusted source.
df = pd.read_pickle('mat_bandgap_morgan.pkl')
# Stack the per-row 'Morgan' entries into a single feature array
# (presumably one fingerprint vector per row -- TODO confirm).
X = np.vstack(df['Morgan'].values)
# Flat 1-D target vector; a single ravel() is enough, reshaping to a column
# first and flattening again gives the same result.
y = df['gap'].values.ravel()

In [None]:
# Benchmark every untuned regressor on the test data with a fixed seed and
# print the cross-validated r2 score of each one.
tester_object = RegressionTest(X, y, rand_state=42)
tester_object.test_models(scoring='r2')

---- TRAINING DECISION TREE REGRESSOR ----
---- TRAINING RANDOM FOREST REGRESSOR ----
---- TRAINING XGB REGRESSOR ----
---- TRAINING CATBOOST REGRESSOR ----
---- TRAINING LIGHT GBM REGRESSOR ----
---- TRAINING COMPLETE ----
DECISION TREE REGRESSOR r2: 0.1247 (+/- 0.2821)
RANDOM FOREST REGRESSOR r2: 0.5728 (+/- 0.0670)
XGB REGRESSOR r2: 0.6251 (+/- 0.1620)
CATBOOST REGRESSOR r2: 0.6616 (+/- 0.1449)
LIGHT GBM REGRESSOR r2: 0.6075 (+/- 0.1766)
