# Solar Radiation Prediction

Data analysis on a 4 months dataset collected at the HI-SEAS weather station (Hawaii). The observation sampling rate is 5 minutes and collected features/variables are:

- Solar radiation [W/m^2]
- Temperature [F]
- Atmospheric pressure [Hg]
- Humidity [%]
- Wind speed [miles/h]
- Wind direction [degrees]
- Time sun rise
- TIme sun set

The dataset is a time series with cyclicality(day) and seasonality(months). It is obvious that solar radiation is a function of duration of day light(among other features), thus we
will do time feature engineering in order to account for the relative time of day.

Goal is to train ML model to predict solar radiation as a function of the above features. Particulariy, we will establish a base line with a simple regression model and then we will use
assambles (Random Forest, Gradient Boosting) with Grid Search hyper-parameter tuning. R^2 and Mean-Squared-Error(MSE) used as model evaluation metrics. 

## Load dataset and Preperation

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from dateutil.tz import *
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit

In [2]:

class FeatureCreation:
    def __init__(self, featureCase):
        self.featureCase = featureCase
        self.df = None

    
    def loadDataset(self, fileName):
        try:
            self.df = pd.read_csv(fileName)
        except FileNotFoundError as notFound:
            raise notFound
            
    def sanityCheck(self):
        self.df.info()
        print('\nCheck to nan/null')
        self.df.isnull().sum()
        
    def fixTimeZone(self):
        '''Convert column UNIXTime to datetime object and set correct Time zone. Data collected at HST, 
            Hawaii Standard Zone
        '''
        self.df['Date'] = self.df['UNIXTime'].astype('datetime64[s]')
        #Set time zone
        self.df['Date'] = self.df['Date'].dt.tz_localize('UTC').dt.tz_convert('HST')
        
    
    def timeConversion(self):
        ''' Convert features TimeSunRise and TimeSunSet to include actual date of the observation '''
        
        try:
            # #Holds actual observation time
            self.df['Data'] = pd.to_datetime(self.df['Data']).dt.date

            #Convert TimeSunRise and TimeSunSet features datetime.time objects
            self.df['TimeSunRise'] = pd.to_datetime(df['TimeSunRise']).dt.time
            self.df['TimeSunSet'] = pd.to_datetime(df['TimeSunSet']).dt.time

            #Create new sunset/sunrise features with observation time
            self.df['sunrise_time'] = self.df.apply(lambda row: pd.datetime.combine(row['Data'], row['TimeSunRise']), axis = 1)
            self.df['sunset_time'] = self.df.apply(lambda row: pd.datetime.combine(row['Data'], row['TimeSunSet']), axis = 1)

            #Correct timezone(HST)
            self.df['sunrise_time'] = self.df['sunrise_time'].dt.tz_localize('HST')
            self.df['sunset_time'] = self.df['sunset_time'].dt.tz_localize('HST')

            #Set index
            self.df.set_index('Date', inplace = True)

            #Sort by index
            self.df.sort_index()
        except KeyError as ke:
            print(ke)

    
    def dropColumns(self, columns):
        try:
            self.df.drop(columns = columns, inplace = True)
        except KeyError as ke:
            print(ke)
            
            
    def createFeature(self):
        '''based on self.featureCases it transforms the data accordingly. Supported transformations are
            a) RelativeTime -> relative time of observation in respect to daylight
            b) Sun is up only
        '''
        
        if self.featureCase == 'rel_time':
            self.relativeTimeFeature()
        elif self.featureCase == 'sun_is_up':
            self.sunIsUp()
        
        
    def relativeTimeFeature(self):
        print('Creating feature:', self.featureCase)
        #Converting sunrise and sunset times into timestamp
        self.df['sunrise_timestamp'] = self.df.apply(lambda row: datetime.timestamp(row['sunrise_time']), axis = 1)
        self.df['sunset_timestamp'] = self.df.apply(lambda row: datetime.timestamp(row['sunset_time']), axis = 1)

        #Creating a column containing the number of daily light hours
        self.df['Daylight_duration'] = (self.df['sunset_timestamp'] - self.df['sunrise_timestamp'])/60/60

        #Creating column describing current time relative to sunrise/sunset
        self.df['Rel_time'] = (self.df['UNIXTime']- self.df['sunrise_timestamp'])/(self.df['sunset_timestamp']-self.df['sunrise_timestamp'])

        #Removing unnecessary features/columns
        self.dropColumns(['UNIXTime','sunrise_timestamp', 'sunset_timestamp', 'sunset_time', 'sunrise_time'])
        
        
        
    def daylight(self, current_time, rising_time, set_time):
        return (rising_time < current_time) and (current_time < set_time)

    def sunIsUp(self):
        print('Creating feature', self.featureCase)
        sun_is_up = [self.daylight(self.df.index[index], self.df["sunrise_time"][index], self.df["sunset_time"][index]) for index in range(self.df.shape[0])]
        sun_is_up = np.array(sun_is_up, dtype = int)
        self.df["sun_is_up"] = sun_is_up
        
        proportion = round(sum(self.df["sun_is_up"]/self.df.shape[0]*100), 2)
        print("Proportion of record with the sun up : {0}%".format(proportion))
            
    

In [3]:
''' Import dataset and inspect observations(nan values) '''
RelTimeCase = FeatureCreation('rel_time')
RelTimeCase.loadDataset("solar_radiation_dataset.csv")
RelTimeCase.sanityCheck()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32686 entries, 0 to 32685
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                32686 non-null  int64  
 1   Data                    32686 non-null  object 
 2   Time                    32686 non-null  object 
 3   Radiation               32686 non-null  float64
 4   Temperature             32686 non-null  int64  
 5   Pressure                32686 non-null  float64
 6   Humidity                32686 non-null  int64  
 7   WindDirection(Degrees)  32686 non-null  float64
 8   Speed                   32686 non-null  float64
 9   TimeSunRise             32686 non-null  object 
 10  TimeSunSet              32686 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 2.7+ MB

Check to nan/null


we need to set the correct timezone which is Hawaii Standard Zone(HSZ)

In [4]:
RelTimeCase.fixTimeZone()
RelTimeCase.df.head()


Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,Date
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00,2016-09-29 23:55:26-10:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00,2016-09-29 23:50:23-10:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00,2016-09-29 23:45:26-10:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00,2016-09-29 23:40:21-10:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00,2016-09-29 23:35:24-10:00


Feature 'Data' does not contain a valid Time of the day but only the correct Date of the observation. We will create two new columns, sunrise_time and sunset_time which will include the actual sun-set and sun-rise for specific day. Thus will combine parts of 'Data' with TimeSunRise and TimeSunSet respectevely.

Then we drop unnecessary columns (Data', 'Time', 'TimeSunRise', 'TimeSunSet)

In [5]:
RelTimeCase.timeConversion()
RelTimeCase.dropColumns(['Data', 'Time', 'TimeSunRise', 'TimeSunSet'])
RelTimeCase.df.head()


NameError: name 'df' is not defined

In this approach we assume(though reasonable assumption) that solar radiation is practically zero outsite of sun light period. Before Sunrise and After sunset. In order to validate this assumption, we will choose 5 random days from the dataset and plot the radiation as a function of time. 
We also add 2 vertical lines which denote the day light period.

In [None]:
'''It is reasonable to expect that the solar radiation, for any  day, would be approximately zero before the sunrise and after the sunset time. 
Quick visualization check '''
df_random_days = [RelTimeCase.df.loc['2016-09-25':'2016-09-25',:], 
                    RelTimeCase.df.loc['2016-10-25':'2016-10-25',:],
                    RelTimeCase.df.loc['2016-11-25':'2016-11-25',:]]

for day in df_random_days:
    plt.figure(figsize = (7,2))
    plt.plot(day.Radiation, 'o', markerfacecolor = 'w')

    #Plotting vertical line at sunrise
    plt.axvline(day.sunrise_time.iloc[0], label = 'Sunrise time', color = 'yellow')

    #Plotting vertical line at sunset
    plt.axvline(day.sunset_time.iloc[0], label = 'Sunset time', color = 'red') 

    #Adjusting timezone of x-axis
    plt.gca().xaxis_date('HST')

    plt.legend()
    plt.show()


Above plots confirm the assumption as well the the correct manipulation of dates.

## Preliminary Analysis

We check range of each feature to see if they make sense, clean dataset.

In [None]:
''' Check ranges for each feature, see if the make sense '''
RelTimeCase.df.describe()


All values seem to make sense, though in feature Humidity we see that the max value is 103% but we choose to ignore it.

### Distribution of features

Understand how the each feature is allocated between their lower and upper limits.

In [None]:
def featureDistPlots(df):
    ''' Check distribution of features '''
    fig, ax = plt.subplots(nrows =2, ncols = 6, figsize = (25, 10))
    features = ['Radiation', 'Temperature','Pressure', 'Humidity', 'Speed', 'WindDirection(Degrees)']
    index = 0

    #Distribution
    for feature in features:
        sns.histplot(x=df[feature], ax = ax[0,index])
        ax[0,index].set_xlabel(feature, fontsize = 14)
        index += 1

    #Box plots
    index = 0
    for feature in features:
        sns.boxplot(x=df[feature], ax = ax[1,index])
        ax[0,index].set_xlabel(feature, fontsize = 14)
        index += 1


    fig.suptitle('Distribution and box plots', fontsize = 22)
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)

    plt.show()

In [None]:
# Plots distributions
featureDistPlots(RelTimeCase.df)

Explanation: throw NotImplementedException

## Feature Engineering

We consider that all features included in the dataset, are usefull for prediction of target value (Radiation).
We also need to take into account the time of the day as an indication because radiation heavily depends on solar position
on the sky and the duration of day light of each day.

We define 2 new features.
- Relative time of day (Rel_time) = $\frac{\text{Current time - Sunrise Time}}{\text{Sunrise Time - Sunset Time}}$
    - < 0 before sunrise
    - = 0 at sunrise
    - '>' 0 but < 1 between sunrise and sunset
    - = 1 at sunset
    - '>' 1 after sunset
    
- Solar day duration (Daylight_duration) = $\text{Sunrise Time - Sunset Time}$

In [None]:
'''In this first appoach we would like to create a new feature which will indicate the relative time
of the day the observation was performed in relation to daylight duration.
We call the createFeature method from FeatureCreation class and pass the desired transformation argument'''
RelTimeCase.createFeature()
RelTimeCase.df.head()


## Features Correlation

In [None]:
def corrMatrix(data):
    #Plotting a heatmap of the various features in the dataset
    fig, ax = plt.subplots(figsize = (15,5))
    sns.heatmap(data.corr(), annot = True, cmap = 'YlGnBu')
    fig.suptitle('Correlation matrix', fontsize = 16)
    plt.show()

corrMatrix(RelTimeCase.df.corr())

Explanation: throw NotImplementedException

## Features scatter function of Radiation

In [None]:
def scatterPlots(df):
    fig, ax = plt.subplots(nrows =2, ncols = 4, figsize = (23,8))

    ax[0,0].plot(df.Temperature, df.Radiation,'o', markerfacecolor='w')
    ax[0,0].set_xlabel('Temperature [F]', fontsize = 14)
    ax[0,0].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    ax[0,1].plot(df.Pressure, df.Radiation,'o', markerfacecolor='w')
    ax[0,1].set_xlabel('Pressure [Hg]', fontsize = 14)
    ax[0,1].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    ax[0,2].plot(df.Humidity, df.Radiation,'o', markerfacecolor='w')
    ax[0,2].set_xlabel('Humidity [%]', fontsize = 14)
    ax[0,2].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    ax[0,3].plot(df.Daylight_duration, df.Radiation,'o', markerfacecolor='w')
    ax[0,3].set_xlabel('Hours of light [h]', fontsize = 14)
    ax[0,3].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    ax[1,0].plot(df.Rel_time, df.Radiation,'o', markerfacecolor='w')
    ax[1,0].set_xlabel('Rel_time', fontsize = 14)
    ax[1,0].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    ax[1,1].plot(df.Speed, df.Radiation,'o', markerfacecolor='w')
    ax[1,1].set_xlabel('Wind speed [miles/h]', fontsize = 14)
    ax[1,1].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    ax[1,2].plot(df['WindDirection(Degrees)'], df.Radiation,'o', markerfacecolor='w')
    ax[1,2].set_xlabel('Wind direction [degrees]', fontsize = 14)
    ax[1,2].set_ylabel('Radiation [W/m^2]', fontsize = 14)

    fig.delaxes(ax[1,3])

    fig.suptitle('Scatter plots of the solar radiation as a function of the various features', fontsize = 22)
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)

    plt.show()

Positive linear correlation with temperature 
Seems there is a correlation between radiation peak values with pressure peak values.

In [None]:
scatterPlots(RelTimeCase.df)

## Models

We are going to set a baseline with a simple linear model and will use RandomForest and GradientBoosting with GridSearch to get better results.

Evaluation metrics:
- R^2
- Mean Squared Error (MSE)

## Linear Regression

Split data to train(70%) and test(30%)

In [None]:
class LinearRegressionWrapper:
    def __init__(self, df, test_size):
        self.lm = LinearRegression()
        self.df = df
        self.test_size = test_size
        X = self.df.drop(columns = 'Radiation')
        y = self.df.Radiation
        #Splitting data into train and test sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X,y,
                                                           test_size = self.test_size,
                                                           random_state = 100)
        self.pred_train = None
        self.pred_test = None
        
    def fit(self):
        self.lm.fit(self.X_train, self.y_train)
    
    def predict(self):
        self.pred_train = self.lm.predict(self.X_train)
        self.pred_test = self.lm.predict(self.X_test)
    
    def predictPartial(self, X):
        return self.lm.predict(X)
    
    def scores(self):
        #Squared error
        print('Linear model, R^2 training set: {:.2f}'.format(r2_score(self.y_train, self.pred_train)))
        print('Linear model, R^2 test set: {:.2f}'.format(r2_score(self.y_test, self.pred_test)))

        #Mean squared error (MSE)
        print('Simple Linear model, MSE training set: {:.2f}'.format(MSE(self.y_train, self.pred_train)))
        print('Simple Linear model, MSE test set: {:.2f}'.format(MSE(self.y_test,self.pred_test)))
        

In [None]:
lmWrapper = LinearRegressionWrapper(df = RelTimeCase.df, test_size = 0.3)
#Fit model
lmWrapper.fit()
#Predictions
lmWrapper.predict()
#Print scores
lmWrapper.scores()


R^2 in both training and test is similar, indicating that there are no overfitting issues

In [None]:
def visualInspect(df, model):
    #Visual representation, predictions with linear model for the 5-day period
    df_5 = df.loc['2016-10-14':'2016-10-19',:]
    X_5 = df_5.drop(columns = 'Radiation')
    y_pred_5 = model.predictPartial(X_5)


    fig, ax = plt.subplots(figsize = (23,5))    

    ax.plot(df_5.Radiation,'o', markerfacecolor='w')
    ax.plot(df_5.index, y_pred_5, linewidth = 1.5, color = 'black', label = 'Linear model prediction')
    ax.set_ylabel('Radiation [W/m^2]', fontsize = 14)
    ax.legend(fontsize = 14)

    plt.show()
    
    return df_5, y_pred_5

df_visual, y_pred_visual = visualInspect(RelTimeCase.df, lmWrapper)

It seems that linear model is doing ok predicting radiation during day light as this was suggested by the relatively high R^2. Though, it is performing poorly during night hours.

### Residuals

Let's have a look on how the residuals of simple model look like. 

$Residual = Observed - Predicted$


In [None]:
def plot_residuals(observed , pred):
    res = observed - pred
    fig, ax = plt.subplots(figsize = (23,5))    
    ax.plot(res)
    plt.show()

plot_residuals(df_visual.Radiation, y_pred_visual)

From the residuals plot above it is clear that there some systematic patterns in the difference between the observed and predicted values. Ideally, residual plots should look fairly as a random cloud. This is another evidence that a simple linear model cannot describe the variance in our data particularly good.

## Tree-based models

We will use two tree-based models *Random Forest* and *Gradient Boosting* which are capable of capturing non-linear relationships and they do not required featuring scaling.

These models are ensamble models thus hyper-parameters tuning is required for better predictions.  Multiple models are trained as a way to identify the best performing set of hyper-parameters. We will use GridSearchCV.

We will create the TreeBasedWrapper class.

In [None]:
class TreeBasedWrapper:
    def __init__(self, model, params, df, test_size, series_split):
        self.model = model
        self.params = params
        self.df = df
        self.X = self.df.drop(columns = 'Radiation')
        self.y = self.df.Radiation
        self.series_split = series_split
        #Splitting data into train and test sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                           test_size = test_size,
                                                           random_state = 100)
        self.mse_cv = None
        self.tscv = TimeSeriesSplit(series_split)
        self.grid_search = RandomizedSearchCV(estimator = self.model,
                           param_distributions = self.params,
                           cv = tscv,
                           scoring = 'neg_mean_squared_error',
                           verbose = 1,
                           n_jobs = 4)
        
        self.best_model_params = None
        self.best_model_ = None
        
        self.y_pred_train = None
        self.y_pred_test = None
     
    def fit(self):
        self.grid_search.fit(self.X_train, y_train)
    
    def best_parameters(self):
        #Extracting best hyperparameters
        self.best_model_params = self.grid_search.best_params_
        print('Best hyperparameters: \n', self.best_model_params)
    
    def best_model(self):
        #Extracting best rf model
        self.best_model_ = self.grid_search.best_estimator_
    
    def cross_validation(self):
        self.mse_cv = -cross_val_score(self.best_model_, self.X_train, self.y_train,
                            cv = self.tscv, 
                            scoring = 'neg_mean_squared_error',
                            n_jobs = 4)
    
    def predict(self):
        self.y_pred_train = self.best_model_.predict(self.X_train)
        self.y_pred_test = self.best_model_.predict(self.X_test)
    
    def print_scores(self):
        print(type(self.model).__name__)

        #Computing the MSE in the traning set, test set, and cross-validation procedure

        print('\tCV MSE:{:.2f}'.format(self.mse_cv.mean()))
        print('\tTrain MSE:{:.2f}'.format(MSE(self.y_train, self.y_pred_train)))
        print('\tTest MSE:{:.2f}'.format(MSE(self.y_test, self.y_pred_test)))

        #Computing the R^2 in the traning set and test set 
        print('\tR^2 score training set:{:.2f}'.format(r2_score(self.y_train, self.y_pred_train)))
        print('\tR^2 score test set:{:.2f}'.format(r2_score(self.y_test, self.y_pred_test)))

### Random Forest

In [None]:
#Define the grid of hyperparameters
# 'n_estimators': [500, 600, 700],
params_rf = {
    'n_estimators': [500],
    'max_depth': [5, 6, 7],
    'min_samples_leaf': [0.075, 0.05, 0.025],
    'max_features': ['log2', 'sqrt']   
}
rfWrapper = TreeBasedWrapper(RandomForestRegressor(random_state = 100), 
                                params = params_rf,
                                df = RelTimeCase.df,
                                test_size = 0.3, 
                                series_split = 3)


In [None]:
#Fit best model
rfWrapper.fit()

In [None]:
#Find best parameters
rfWrapper.best_parameters()

In [None]:
#Find best model
rfWrapper.best_model()

In [None]:
#Check if there is overfitting through the use of Cross validation
rfWrapper.cross_validation()

In [None]:
#Compute Random Forest predictions in the traning and test sets
rfWrapper.predict()

In [None]:
rfWrapper.print_scores()

### Gradient Boosting

In [None]:
#Define the grid of hyperparameters
# 'n_estimators': [500, 600, 700],
params_gb = {
    'n_estimators': [500],
    'max_depth': [5, 6, 7],
    'min_samples_leaf': [0.075, 0.05, 0.025],
    'max_features': ['log2', 'sqrt']   
}
gbWrapper = TreeBasedWrapper(GradientBoostingRegressor(random_state = 100), 
                                params = params_gb,
                                df = RelTimeCase.df,
                                test_size = 0.3, 
                                series_split = 3)


In [None]:
#Fit best model
gbWrapper.fit()

In [None]:
#Find best parameters
gbWrapper.best_parameters()

In [None]:
#Find best model
gbWrapper.best_model()

In [None]:
#Check if there is overfitting through the use of Cross validation
gbWrapper.cross_validation()

In [None]:
#Compute Random Forest predictions in the traning and test sets
gbWrapper.predict()

In [None]:
gbWrapper.print_scores()

## Feature Importance

throw NotImplementedException

In [None]:
#Plotting feature importances for Random Forest and Gradient boosting

def feature_importance_plots(model_1, model_2):
    #Creating a pd.Series of feature importances
    importances_rf = pd.Series(model_1.best_model_.feature_importances_, index = rfWrapper.X.columns)
    importances_gb = pd.Series(model_2.best_model_.feature_importances_, index = gbWrapper.X.columns)

    #Sorting importances
    sorted_importances_rf = importances_rf.sort_values()
    sorted_importances_gb = importances_gb.sort_values()

    #Plotting sorted importances
    fig, ax = plt.subplots(ncols = 2, figsize = (27,7))
    sorted_importances_rf.plot(kind = 'barh', color = 'lightblue', ax = ax[0])
    sorted_importances_gb.plot(kind = 'barh', color = 'lightblue', ax = ax[1])
    ax[0].set_title('Random Forest Regressor')
    ax[1].set_title('Gradient Boosting Regressor')
    fig.suptitle('Feature importances in the two ML models', fontsize = 24)
    plt.show()

In [None]:
feature_importance_plots(rfWrapper, gbWrapper)

## Feature Engineering II

Consider to limit the evaluations to the only hours of the day when the solar radiation is present (this allows to build a model that does not have to account for the night hours); narrow our dataset only to daylight hours since solar radiation is negligible during the night.

In [None]:
SunUpCase = FeatureCreation('sun_is_up')
SunUpCase.loadDataset("solar_radiation_dataset.csv")
#Again fixing dataset and setting correct time zone
SunUpCase.fixTimeZone()
SunUpCase.timeConversion()


SunUpCase.createFeature()
SunUpCase.dropColumns(['Data', 'Time', 'TimeSunRise', 'TimeSunSet', 'UNIXTime', 
                               'sunrise_time', 'sunset_time'])

SunUpCase.df.head()


We see that almost half of the observations performed during daylight.
Correlation matrix of new dataset

In [None]:
corrMatrix(SunUpCase.df.corr())

### Linear Model II

We fit again a simple linear model to establish a basic performace point

In [None]:
lmWrapperII = LinearRegressionWrapper(df = SunUpCase.df, test_size = 0.3)
#Fit model
lmWrapperII.fit()
#Predictions
lmWrapperII.predict()
#Print scores
lmWrapperII.scores()

### Random Forest II

In [None]:
#Define the grid of hyperparameters
# 'n_estimators': [500, 600, 700],
params_rf = {
    'n_estimators': [500],
    'max_depth': [5, 6, 7],
    'min_samples_leaf': [0.075, 0.05, 0.025],
    'max_features': ['log2', 'sqrt']   
}
rfWrapper = TreeBasedWrapper(RandomForestRegressor(random_state = 100), 
                                params = params_rf,
                                df = SunUpCase.df,
                                test_size = 0.3, 
                                series_split = 3)
#Fit best model
rfWrapper.fit()
#Find best parameters
rfWrapper.best_parameters()

rfWrapper.best_model()

rfWrapper.cross_validation()

rfWrapper.predict()

rfWrapper.print_scores()

### Gradient Boosting II

In [None]:
#Define the grid of hyperparameters
# 'n_estimators': [500, 600, 700],
params_gb = {
    'n_estimators': [500],
    'max_depth': [5, 6, 7],
    'min_samples_leaf': [0.075, 0.05, 0.025],
    'max_features': ['log2', 'sqrt']   
}
gbWrapper = TreeBasedWrapper(GradientBoostingRegressor(random_state = 100), 
                                params = params_gb,
                                df = SunUpCase.df,
                                test_size = 0.3, 
                                series_split = 3)
#Fit best model
gbWrapper.fit()
#Find best parameters
gbWrapper.best_parameters()

gbWrapper.best_model()

gbWrapper.cross_validation()

gbWrapper.predict()

gbWrapper.print_scores()

In [None]:
feature_importance_plots(rfWrapper, gbWrapper)