In [22]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns 

import sklearn
from sklearn.linear_model import ElasticNet
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

np.random.seed(1338)
cmap_data = plt.cm.Paired
cmap_cv = plt.cm.coolwarm
n_splits = 5

import pandas as pd
from pandas_datareader import data as web
import warnings
warnings.filterwarnings('ignore')

In [23]:
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.5 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]

In [35]:
# df = pd.read_csv('solar_radiation_data_trun.csv', parse_dates=True, index_col='UNIXTime')
# df = df.sort_index().drop(['Data', 'TimeSunRise', 'TimeSunSet'], axis=1)

df = pd.read_csv('solar_radiation_dataset.csv')
df = df.sort_index().drop(['Data', 'Time', 'WindDirection(Degrees)', 'TimeSunRise', 'TimeSunSet'], axis=1)
df['UNIXTime'] = df['UNIXTime'].astype('datetime64[s]')
period = '5min'
Radiation = df.resample(period, on='UNIXTime').Radiation.mean()
Temperature = df.resample(period, on='UNIXTime').Temperature.mean()
Pressure = df.resample(period, on='UNIXTime').Pressure.mean()
Humidity = df.resample(period, on='UNIXTime').Humidity.mean()
Speed = df.resample(period, on='UNIXTime').Speed.mean()
# , 'Pressure': Pressure, 'Humidity':Humidity, 'Speed':Speed 
d = {'Radiation': Radiation, 'Pressure': Pressure, 'Humidity':Humidity, 'Speed':Speed}
df = pd.DataFrame(data=d)
df = df.dropna()
df.head()

Unnamed: 0_level_0,Radiation,Pressure,Humidity,Speed
UNIXTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-09-01 10:00:00,2.58,30.43,103.0,11.25
2016-09-01 10:05:00,2.83,30.43,103.0,9.0
2016-09-01 10:20:00,2.16,30.43,103.0,7.87
2016-09-01 10:25:00,2.21,30.43,103.0,18.0
2016-09-01 10:30:00,2.25,30.43,103.0,11.25


In [25]:
STEPS = 7
for i in np.arange(1 ,STEPS):
    col_name = '{}h_Fwd_Radiation'.format(i)
    df[col_name] = df['Radiation'].shift(-i)
    
df = df.dropna()


In [33]:
Features = 3

X = df.iloc[:, :Features]
y = df.iloc[:, Features:]

split = int(len(df) * 0.7)

X_train = X[:split]
y_train = y[:split]

X_test = X[split:]
y_test = y[split:]
y.head()

Unnamed: 0_level_0,Speed
UNIXTime,Unnamed: 1_level_1
2016-09-01 10:00:00,11.25
2016-09-01 10:05:00,9.0
2016-09-01 10:20:00,7.87
2016-09-01 10:25:00,18.0
2016-09-01 10:30:00,11.25


In [27]:
def build_model(_alpha, _l1_ratio):
    estimator = ElasticNet(
        alpha=_alpha,
        l1_ratio=_l1_ratio,
        fit_intercept=True,
        normalize=False,
        precompute=False,
        max_iter=16,
        copy_X=True,
        tol=0.1,
        warm_start=False,
        positive=False,
        random_state=None,
        selection='random'
    )

    return MultiOutputRegressor(estimator, n_jobs=4)

In [28]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [29]:
'''Time series splitter'''
model = build_model(_alpha=1.0, _l1_ratio=0.3)
tscv = TimeSeriesSplit(n_splits=4)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=tscv, scoring='r2')

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"\nR2: {R2.mean()} (+/- {R2.std()}")

RMSE: 114.56455159886283 (+/- 11.088218164616938

R2: 0.8857451102054504 (+/- 0.02113605016886874


In [None]:
''' Blocking time series splitter'''
btscv = BlockingTimeSeriesSplit(n_splits=4)
rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=btscv, scoring='neg_mean_squared_error'))
R2 = cross_val_score(model, X_train, y_train, cv=btscv, scoring='r2')

print(f"RMSE: {rmse.mean()} (+/- {rmse.std()}")
print(f"\nR2: {R2.mean()} (+/- {R2.std()}")

In [None]:
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2, best_params):
    
    # Get Test Scores Mean and std for each grid search
    scores_mean = cv_results['mean_test_score']
    scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))

    scores_sd = cv_results['std_test_score']
    scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))

    # Plot Grid search scores
    _, ax = plt.subplots(1,1)

    # Param1 is the X-axis, Param 2 is represented as a different curve (color line)
    for idx, val in enumerate(grid_param_2):
        ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))

    ax.set_title(f"Grid Search Best Params: {best_params}", fontsize=12, fontweight='medium')
    ax.set_xlabel(name_param_1, fontsize=12)
    ax.set_ylabel('CV Average Score', fontsize=12)
    ax.legend(loc="best", fontsize=15)
    ax.grid('on')
    ax.legend(bbox_to_anchor=(1.02, 1.02))

In [None]:
params = {
    'estimator__alpha':(0.1, 0.3, 0.5, 0.7, 0.9),
    'estimator__l1_ratio':(0.1, 0.3, 0.5, 0.7, 0.9)
}

In [None]:
scores = []
for i in range(30):
    model = build_model(_alpha=1.0, _l1_ratio=0.3)

    finder = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='r2',
        n_jobs=4,
        iid=False,
        refit=True,
        cv=tscv,  # change this to the splitter subject to test
        verbose=1,
        pre_dispatch=8,
        error_score=-999,
        return_train_score=True
        )

    finder.fit(X_train, y_train)

    best_params = finder.best_params_
    best_score = round(finder.best_score_,4)
    scores.append(best_score)

In [None]:
scores0 = []
for i in range(30):
    model = build_model(_alpha=1.0, _l1_ratio=0.3)
    
    finder0 = GridSearchCV(
        estimator=model,
        param_grid=params,
        scoring='r2',
        n_jobs=4,
        iid=False,
        refit=True,
        cv=btscv,  # change this to the splitter subject to test
        verbose=1,
        pre_dispatch=8,
        error_score=-999,
        return_train_score=True
        )

    finder0.fit(X_train, y_train)

    best_params0 = finder0.best_params_
    best_score0 = round(finder0.best_score_,4)
    scores0.append(best_score)

In [None]:
finder0.cv_results_.keys()

In [None]:
scores1 = pd.DataFrame(scores)
bs = round(float(scores1.mean()),4)
print(f'\nTime series splitter best score: {bs}')

plot_grid_search(finder.cv_results_, params['estimator__l1_ratio'], params['estimator__alpha'], 
                 'l1_ratio', 'alpha', best_params)


