# Load necessary packages

In [1]:
import pandas as pd 
import numpy as np

hex_salmon = '#F68F83'
hex_gold = '#BC9661'
hex_indigo = '#2D2E5F'
hex_maroon = '#8C4750'
hex_white = '#FAFAFA'
hex_blue = '#7EB5D2'

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.dates import DateFormatter
import matplotlib.dates as dates
mpl.rcParams['font.family'] = 'SF Compact Text'
mpl.rcParams['font.weight'] = 'medium'
mpl.rcParams['axes.titleweight'] = 'semibold'
mpl.rcParams['axes.labelweight'] = 'medium'
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=[hex_indigo, hex_salmon, hex_maroon])
mpl.rcParams["figure.titlesize"] = 'large'
mpl.rcParams["figure.titleweight"] = 'semibold'

from termcolor import colored

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# Load data

In [2]:
import import_ipynb
from data import create_features

years = [2018]

lags_ID = range(-4, -169, -1)
lags_DA = [i for i in range(5, -6, -1) if i not in [0]]
lags_VOL = [i for i in range(-4, -25, -1) if i not in [0]]

ID, DA, features = create_features(years, lags_ID, lags_DA, lags_VOL)

importing Jupyter notebook from data.ipynb


In [3]:
ID.head(5)

Unnamed: 0_level_0,ID3,Volume
Instrument,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01 00:00:00,14.586875,221.4
2018-01-01 01:00:00,12.990924,669.7
2018-01-01 02:00:00,22.150235,752.0
2018-01-01 03:00:00,21.917514,460.2
2018-01-01 04:00:00,21.621781,453.0


In [4]:
DA.head(5)

Unnamed: 0_level_0,MCP
Instrument,Unnamed: 1_level_1
2018-01-01 00:00:00,27.2
2018-01-01 01:00:00,27.3
2018-01-01 02:00:00,30.1
2018-01-01 03:00:00,20.87
2018-01-01 04:00:00,25.56


In [5]:
features.head(5)

Unnamed: 0,ID3,VOL,MCP,Load,Load forecast,LFE,ID3 (-4),ID3 (-5),ID3 (-6),ID3 (-7),...,HOD 14,HOD 15,HOD 16,HOD 17,HOD 18,HOD 19,HOD 20,HOD 21,HOD 22,HOD 23
2018-01-01 05:00:00+00:00,22.352647,490.3,25.58,9768.25,11085.25,1317.0,25.114449,24.868167,25.978476,26.555409,...,0,0,0,0,0,0,0,0,0,0
2018-01-01 06:00:00+00:00,23.071457,527.6,25.81,10069.75,12266.5,2196.75,24.868167,25.978476,26.555409,28.621735,...,0,0,0,0,0,0,0,0,0,0
2018-01-01 07:00:00+00:00,24.345686,507.7,29.9,10408.75,14147.5,3738.75,25.978476,26.555409,28.621735,29.089427,...,0,0,0,0,0,0,0,0,0,0
2018-01-01 08:00:00+00:00,25.257541,1032.5,26.33,10693.25,15932.5,5239.25,26.555409,28.621735,29.089427,35.10253,...,0,0,0,0,0,0,0,0,0,0
2018-01-01 09:00:00+00:00,25.114449,771.8,26.38,11050.25,16809.75,5759.5,28.621735,29.089427,35.10253,41.704858,...,0,0,0,0,0,0,0,0,0,0


# Separate train and test sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    features.drop(labels=['ID3', 'Load forecast'], axis=1),
    features['ID3'],
    test_size = 0.3,
    random_state = 0,
    shuffle = True)

X_train.shape, X_test.shape

((5468, 235), (2344, 235))

In [9]:
# we fit Random Forests and select features in 2 lines of code

# first I specify the Random Forest instance and its parameters

# Then I use the selectFromModel class from sklearn
# to automatically select the features

# SelectFrom model will select those features which importance
# is greater than the mean importance of all the features
# by default, but you can alter this threshold if you want to

# Encode
lab_enc = LabelEncoder()
y_train = lab_enc.fit_transform(y_train)

sel_ = SelectFromModel(RandomForestClassifier(n_estimators=10, random_state=10))

sel_.fit(X_train, y_train)

# this command let's me visualise those features that were selected.

# sklearn will select those features which importance values
# are greater than the mean of all the coefficients.

sel_.get_support()

# Summarize
print('MSE: %.3f' % sel_.best_score_)
print('Config: %s' % sel_.best_params_)

KeyboardInterrupt: 

In [None]:
X_train.head(5)

In [None]:
X_train_unscaled = X_train

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train_unscaled)

# Random forest

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

n_folds = 5

# Encode
lab_enc = LabelEncoder()
y_train = lab_enc.fit_transform(y_train)

# Define model
model = RandomForestClassifier()
model_type = f'{model}'

# Define model evaluation method
cv = RepeatedKFold(
    n_splits = n_folds,
    n_repeats = 1,
    random_state = 10)

# Define grid
grid = { 
    'n_estimators': [10],
    # 'max_features': ['auto'],
    # 'max_depth' : [5],
    # 'criterion' :['gini']
}

# Define search
search = GridSearchCV(
    model,
    grid,
    scoring = 'neg_mean_squared_error',
    cv = cv,
    n_jobs = -1)

# Perform the search
results = search.fit(X_train, y_train)

# Summarize
print('MSE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [None]:
# for feature in zip(feat_labels, results.feature_importances_):
#     print(feature)

COEFS = pd.DataFrame()

COEFS['Coefficients'] = results.best_estimator_.feature_importances_

COEFS.index = X_train_unscaled.columns.values

x = COEFS.index
y = COEFS['Coefficients']

fig, ax = plt.subplots(figsize = (len(x)/4, 5))
q = ax.bar(height = y, x = x)

plt.xticks(rotation = 90)
ax.set_title(rf'Coefficients from {model_type}');

In [None]:
from sklearn.feature_selection import SelectFromModel
from time import time

n_features = 10

threshold = np.sort(abs(COEFS['Coefficients']))[-n_features]

print(), print(f'Threshold: {threshold}'), print()

sel_ = SelectFromModel(results.best_estimator_, threshold = threshold, prefit = True)

for i in sel_.get_support(indices = True):
    print(COEFS.index[i])

In [None]:
COEFS_sel = COEFS[sel_.get_support()].reindex(COEFS[sel_.get_support()]['Coefficients'].abs().sort_values(ascending = False).index)

x = COEFS_sel.index
y = COEFS_sel['Coefficients']

fig, ax = plt.subplots(figsize = (len(x)/4, 5))
q = ax.bar(height = y, x = x, width = 0.8)

plt.xticks(rotation = 90)
ax.set_title(rf'Non-zero features (sorted by coefficient magnitude) from {model_type}');

In [None]:
COEFS_sort = COEFS.reindex(COEFS['Coefficients'].abs().sort_values(ascending = False).index)

x = COEFS_sort.index
y = COEFS_sort['Coefficients']

fig, ax = plt.subplots(figsize = (len(x)/4, 5))
q = ax.bar(height = y, x = x, width = 0.8)

plt.xticks(rotation = 90)
ax.set_title(rf'Coefficients from {model_type}')

ax.axhspan(threshold, -threshold, facecolor = hex_salmon, alpha = 0.1)

# for t in [threshold, -threshold]:
#     ax.axhline(t, linewidth = 1, linestyle = '-', color = hex_indigo, alpha = 0.5, label = rf'$\alpha$: {round(alpha, 4)}')

for item in q[0:n_features]:
     item.set_color(hex_salmon)

# Visualise

In [None]:
sel_.get_support()

In [None]:
X_train.columns[sel_.get_support()]

In [None]:
features_selected = X_train.columns[(sel_.get_support())]

len(features_selected)

In [None]:
features_selected

In [None]:
pd.Series(sel_.estimator_.feature_importances_.ravel()).hist(bins=20)
plt.xlabel('Feature importance')
plt.ylabel('Number of Features')
plt.show()

In [None]:
print(), print(colored('Summary:', 'blue')), print()

print(f'Total features: {X_train.shape[1]}')
print(f'Selected features: {len(X_train.columns[(sel_.get_support())])}')
print(f'features with importance greater than the mean importance of all features: {np.sum(sel_.estimator_.feature_importances_ > sel_.estimator_.feature_importances_.mean())}')

print(), print(colored('Selected columns:', 'blue')), print()
print(X_train.columns[sel_.get_support()])

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(
    features.drop(labels=['ID3', 'Load forecast'], axis=1),
    features['ID3'],
    test_size = 0.3,
    random_state = 0,
    shuffle = True)

X_train_unscaled = X_train

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train_unscaled)

# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)
    
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.15
sfm = SelectFromModel(clf, threshold=0.15)

# Train the selector
sfm.fit(X_train, y_train)

# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])
    
# Transform the data to create a new dataset containing only the most important features
# Note: We have to apply the transform to both the training X and test X data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)


# Apply The Full Featured Classifier To The Test Data
y_pred = clf.predict(X_test)

# View The Accuracy Of Our Full Feature (4 Features) Model
accuracy_score(y_test, y_pred)

#0.93333333333333335

# Apply The Full Featured Classifier To The Test Data
y_important_pred = clf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature (2 Features) Model
accuracy_score(y_test, y_important_pred)

#0.8833333333333333