# Solar Radiation Regression using XGBoost & Optuna

Dataset URL: https://www.kaggle.com/dronio/SolarEnergy

## IMPORTING THE LIBRARIES

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
import xgboost as xgb
import optuna
from sklearn.metrics import r2_score

## IMPORTING THE DATASET

In [3]:
data = pd.read_csv("SolarPrediction.csv")

In [4]:
data

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00
...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,12/1/2016 12:00:00 AM,00:20:04,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00
32682,1480587301,12/1/2016 12:00:00 AM,00:15:01,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00
32683,1480587001,12/1/2016 12:00:00 AM,00:10:01,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00
32684,1480586702,12/1/2016 12:00:00 AM,00:05:02,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32686 entries, 0 to 32685
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                32686 non-null  int64  
 1   Data                    32686 non-null  object 
 2   Time                    32686 non-null  object 
 3   Radiation               32686 non-null  float64
 4   Temperature             32686 non-null  int64  
 5   Pressure                32686 non-null  float64
 6   Humidity                32686 non-null  int64  
 7   WindDirection(Degrees)  32686 non-null  float64
 8   Speed                   32686 non-null  float64
 9   TimeSunRise             32686 non-null  object 
 10  TimeSunSet              32686 non-null  object 
dtypes: float64(4), int64(3), object(4)
memory usage: 2.7+ MB


In [6]:
print("Total missing values: ", data.isnull().sum().sum())

Total missing values:  0


## FEATURE ENGINEERING

In [7]:
data

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet
0,1475229326,9/29/2016 12:00:00 AM,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00
1,1475229023,9/29/2016 12:00:00 AM,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00
2,1475228726,9/29/2016 12:00:00 AM,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00
3,1475228421,9/29/2016 12:00:00 AM,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00
4,1475228124,9/29/2016 12:00:00 AM,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00
...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,12/1/2016 12:00:00 AM,00:20:04,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00
32682,1480587301,12/1/2016 12:00:00 AM,00:15:01,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00
32683,1480587001,12/1/2016 12:00:00 AM,00:10:01,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00
32684,1480586702,12/1/2016 12:00:00 AM,00:05:02,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00


In [8]:
import re

In [9]:
data['Month'] = data['Data'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(np.int)
data['Day'] = data['Data'].apply(lambda x: re.search(r'(?<=\/)\d+(?=\/)', x).group(0)).astype(np.int)
data['Year'] = data['Data'].apply(lambda x: re.search(r'(?<=\/)\d+(?=\s)', x).group(0)).astype(np.int)

data = data.drop('Data', axis=1)

In [10]:
data

Unnamed: 0,UNIXTime,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,Month,Day,Year
0,1475229326,23:55:26,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00,9,29,2016
1,1475229023,23:50:23,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00,9,29,2016
2,1475228726,23:45:26,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00,9,29,2016
3,1475228421,23:40:21,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00,9,29,2016
4,1475228124,23:35:24,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00,9,29,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,00:20:04,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00,12,1,2016
32682,1480587301,00:15:01,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00,12,1,2016
32683,1480587001,00:10:01,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00,12,1,2016
32684,1480586702,00:05:02,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00,12,1,2016


In [11]:
data['Hour'] = data['Time'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(np.int)
data['Minute'] = data['Time'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(np.int)
data['Second'] = data['Time'].apply(lambda x: re.search(r'\d+$', x).group(0)).astype(np.int)

data = data.drop('Time', axis=1)

In [12]:
data

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,Month,Day,Year,Hour,Minute,Second
0,1475229326,1.21,48,30.46,59,177.39,5.62,06:13:00,18:13:00,9,29,2016,23,55,26
1,1475229023,1.21,48,30.46,58,176.78,3.37,06:13:00,18:13:00,9,29,2016,23,50,23
2,1475228726,1.23,48,30.46,57,158.75,3.37,06:13:00,18:13:00,9,29,2016,23,45,26
3,1475228421,1.21,48,30.46,60,137.71,3.37,06:13:00,18:13:00,9,29,2016,23,40,21
4,1475228124,1.17,48,30.46,62,104.95,5.62,06:13:00,18:13:00,9,29,2016,23,35,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,1.22,44,30.43,102,145.42,6.75,06:41:00,17:42:00,12,1,2016,0,20,4
32682,1480587301,1.17,44,30.42,102,117.78,6.75,06:41:00,17:42:00,12,1,2016,0,15,1
32683,1480587001,1.20,44,30.42,102,145.19,9.00,06:41:00,17:42:00,12,1,2016,0,10,1
32684,1480586702,1.23,44,30.42,101,164.19,7.87,06:41:00,17:42:00,12,1,2016,0,5,2


In [13]:
data['SunriseHour'] = data['TimeSunRise'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(np.int)
data['SunriseMinute'] = data['TimeSunRise'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(np.int)

data['SunsetHour'] = data['TimeSunSet'].apply(lambda x: re.search(r'^\d+', x).group(0)).astype(np.int)
data['SunsetMinute'] = data['TimeSunSet'].apply(lambda x: re.search(r'(?<=:)\d+(?=:)', x).group(0)).astype(np.int)

data = data.drop(['TimeSunRise', 'TimeSunSet'], axis=1)

In [14]:
data

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,Month,Day,Year,Hour,Minute,Second,SunriseHour,SunriseMinute,SunsetHour,SunsetMinute
0,1475229326,1.21,48,30.46,59,177.39,5.62,9,29,2016,23,55,26,6,13,18,13
1,1475229023,1.21,48,30.46,58,176.78,3.37,9,29,2016,23,50,23,6,13,18,13
2,1475228726,1.23,48,30.46,57,158.75,3.37,9,29,2016,23,45,26,6,13,18,13
3,1475228421,1.21,48,30.46,60,137.71,3.37,9,29,2016,23,40,21,6,13,18,13
4,1475228124,1.17,48,30.46,62,104.95,5.62,9,29,2016,23,35,24,6,13,18,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32681,1480587604,1.22,44,30.43,102,145.42,6.75,12,1,2016,0,20,4,6,41,17,42
32682,1480587301,1.17,44,30.42,102,117.78,6.75,12,1,2016,0,15,1,6,41,17,42
32683,1480587001,1.20,44,30.42,102,145.19,9.00,12,1,2016,0,10,1,6,41,17,42
32684,1480586702,1.23,44,30.42,101,164.19,7.87,12,1,2016,0,5,2,6,41,17,42


In [15]:
data.dtypes

UNIXTime                    int64
Radiation                 float64
Temperature                 int64
Pressure                  float64
Humidity                    int64
WindDirection(Degrees)    float64
Speed                     float64
Month                       int32
Day                         int32
Year                        int32
Hour                        int32
Minute                      int32
Second                      int32
SunriseHour                 int32
SunriseMinute               int32
SunsetHour                  int32
SunsetMinute                int32
dtype: object

In [16]:
data['Year'].unique()

array([2016])

In [17]:
data['SunriseHour'].unique()

array([6])

In [18]:
data = data.drop(['Year', 'SunriseHour'], axis=1)

## SPLITTING/SCALING

In [19]:
y=data['Radiation'].copy()
X=data.drop('Radiation', axis=1).copy()

In [20]:
scaler = StandardScaler()

In [21]:
X=scaler.fit_transform(X)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=100)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=200)

In [23]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest = xgb.DMatrix(X_test, label=y_test)

## HYPERPARAMTER SEARCH

In [24]:
def get_model_rmse(params):
    model = xgb.train(params, dtrain, num_boost_round=100, evals=[(dval, 'eval')], early_stopping_rounds=10, verbose_eval=0)
    results = model.eval(dval)
    rmse = np.float(re.search(r'[\d.]+$', results).group(0))
    return rmse

In [25]:
def objective(trial):
    learning_rate = trial.suggest_loguniform('learning_rate', 0.00001, 10.0)
    max_depth = trial.suggest_int('max_depth', 4, 8)
    l1_reg = trial.suggest_loguniform('l1_reg', 0.00001, 10.0)
    l2_reg = trial.suggest_loguniform('l2_reg', 0.00001, 10.0)
    
    params = {'learning_rate': learning_rate, 'max_depth': max_depth, 'alpha': l1_reg, 'lambda': l2_reg}
    
    return get_model_rmse(params)

In [26]:
study = optuna.create_study()
study.optimize(objective, n_trials=100, show_progress_bar=True)

[32m[I 2020-11-11 21:06:24,682][0m A new study created in memory with name: no-name-e100ca84-e1a8-4543-a11c-84ef69f5b299[0m
  self._init_valid()


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

[32m[I 2020-11-11 21:06:26,464][0m Trial 0 finished with value: 311.132843 and parameters: {'learning_rate': 0.0024024675673078788, 'max_depth': 5, 'l1_reg': 0.003914966322516704, 'l2_reg': 0.009255150127877554}. Best is trial 0 with value: 311.132843.[0m
[32m[I 2020-11-11 21:06:26,616][0m Trial 1 finished with value: 133.421768 and parameters: {'learning_rate': 1.7417539298750457, 'max_depth': 4, 'l1_reg': 0.0017256867067133503, 'l2_reg': 0.012973702992708093}. Best is trial 1 with value: 133.421768.[0m
[32m[I 2020-11-11 21:06:27,550][0m Trial 2 finished with value: 372.061493 and parameters: {'learning_rate': 0.0002798470781183207, 'max_depth': 6, 'l1_reg': 4.392006153844188, 'l2_reg': 0.032470617299212814}. Best is trial 1 with value: 133.421768.[0m
[32m[I 2020-11-11 21:06:29,250][0m Trial 3 finished with value: 98.857826 and parameters: {'learning_rate': 0.025943382592004727, 'max_depth': 7, 'l1_reg': 1.0348016319280684e-05, 'l2_reg': 0.02834884830251827}. Best is trial 

[32m[I 2020-11-11 21:07:02,932][0m Trial 32 finished with value: 98.718872 and parameters: {'learning_rate': 0.02298391470788214, 'max_depth': 8, 'l1_reg': 2.7299063937259022e-05, 'l2_reg': 0.002723816248056584}. Best is trial 7 with value: 84.786888.[0m
[32m[I 2020-11-11 21:07:04,346][0m Trial 33 finished with value: 83.299904 and parameters: {'learning_rate': 0.11058195124847281, 'max_depth': 8, 'l1_reg': 1.056385504583485e-05, 'l2_reg': 0.016145187553476003}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:07:06,192][0m Trial 34 finished with value: 84.647018 and parameters: {'learning_rate': 0.10236334609133671, 'max_depth': 8, 'l1_reg': 6.0122457124174964e-05, 'l2_reg': 0.021555627488110324}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:07:07,919][0m Trial 35 finished with value: 117.522034 and parameters: {'learning_rate': 0.017849810959937356, 'max_depth': 7, 'l1_reg': 0.0020556148886526062, 'l2_reg': 0.022224693516662536}. Best is t

[32m[I 2020-11-11 21:07:42,529][0m Trial 64 finished with value: 148.704681 and parameters: {'learning_rate': 1.9305420197237544, 'max_depth': 8, 'l1_reg': 0.0051904610378809505, 'l2_reg': 8.399689978711407}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:07:43,842][0m Trial 65 finished with value: 85.296455 and parameters: {'learning_rate': 0.34691919342658517, 'max_depth': 8, 'l1_reg': 0.002268695906036293, 'l2_reg': 1.8489346296080043}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:07:45,456][0m Trial 66 finished with value: 84.516716 and parameters: {'learning_rate': 0.1414974998458148, 'max_depth': 8, 'l1_reg': 0.008399714218441419, 'l2_reg': 2.89529920365479}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:07:46,674][0m Trial 67 finished with value: 85.16935 and parameters: {'learning_rate': 0.13349172997204986, 'max_depth': 8, 'l1_reg': 0.03565987446882997, 'l2_reg': 0.33455798362644584}. Best is trial 33 with value:

[32m[I 2020-11-11 21:08:32,514][0m Trial 97 finished with value: 87.988434 and parameters: {'learning_rate': 0.03898382512356272, 'max_depth': 8, 'l1_reg': 0.00016377633685726867, 'l2_reg': 1.455111356996991}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:08:34,182][0m Trial 98 finished with value: 85.623604 and parameters: {'learning_rate': 0.44743128127648957, 'max_depth': 8, 'l1_reg': 0.00033874177070805807, 'l2_reg': 6.117179216601161}. Best is trial 33 with value: 83.299904.[0m
[32m[I 2020-11-11 21:08:35,793][0m Trial 99 finished with value: 84.572418 and parameters: {'learning_rate': 0.21838791505243602, 'max_depth': 8, 'l1_reg': 0.0005469934222281618, 'l2_reg': 2.449526863290382}. Best is trial 33 with value: 83.299904.[0m



In [27]:
best_params = study.best_params
best_params

{'learning_rate': 0.11058195124847281,
 'max_depth': 8,
 'l1_reg': 1.056385504583485e-05,
 'l2_reg': 0.016145187553476003}

In [28]:
model = xgb.train(best_params, dtrain, num_boost_round=10000, evals=[(dval, 'eval')], early_stopping_rounds=10)

Parameters: { l1_reg, l2_reg } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	eval-rmse:343.31421
Will train until eval-rmse hasn't improved in 10 rounds.
[1]	eval-rmse:309.60764
[2]	eval-rmse:279.95245
[3]	eval-rmse:253.93411
[4]	eval-rmse:231.15678
[5]	eval-rmse:211.01465
[6]	eval-rmse:193.47096
[7]	eval-rmse:178.10925
[8]	eval-rmse:164.97127
[9]	eval-rmse:153.45482
[10]	eval-rmse:143.83037
[11]	eval-rmse:135.18835
[12]	eval-rmse:127.81662
[13]	eval-rmse:121.52767
[14]	eval-rmse:116.27657
[15]	eval-rmse:111.88757
[16]	eval-rmse:107.98895
[17]	eval-rmse:104.73846
[18]	eval-rmse:102.20950
[19]	eval-rmse:99.81844
[20]	eval-rmse:97.89329
[21]	eval-rmse:96.28895
[22]	eval-rmse:94.91874
[23]	eval-rmse:93.62908
[24]	eval-rmse:92.52450
[25]	eval-rmse:91.52024
[26]	eval-rmse:90.77

## RESULTS

In [29]:
y_true = np.array(y_test, dtype=np.float)
y_pred = np.array(model.predict(dtest), dtype=np.float)

In [30]:
r2 = r2_score(y_true, y_pred)
print("R^2 Score: {:.4f}".format(r2))

R^2 Score: 0.9408
