# Load necessary packages

In [1]:
import pandas as pd 
import numpy as np

# Named hex colour codes for plot styling; indigo, salmon and maroon feed the
# default colour cycle set through mpl.rcParams['axes.prop_cycle'] in this cell.
hex_salmon = '#F68F83'
hex_gold = '#BC9661'
hex_indigo = '#2D2E5F'
hex_maroon = '#8C4750'
hex_white = '#FAFAFA'
hex_blue = '#7EB5D2'

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.dates import DateFormatter
import matplotlib.dates as dates
# Global matplotlib defaults: font family/weights, title and label weights,
# and the line colour cycle (indigo -> salmon -> maroon).
# NOTE(review): 'SF Compact Text' is presumably an Apple system font; on
# machines without it matplotlib is expected to fall back to its default font
# with a warning — confirm this is acceptable for shared runs.
mpl.rcParams['font.family'] = 'SF Compact Text'
mpl.rcParams['font.weight'] = 'medium'
mpl.rcParams['axes.titleweight'] = 'semibold'
mpl.rcParams['axes.labelweight'] = 'medium'
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=[hex_indigo, hex_salmon, hex_maroon])
mpl.rcParams["figure.titlesize"] = 'large'
mpl.rcParams["figure.titleweight"] = 'semibold'

from termcolor import colored

from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load data

In [2]:
import import_ipynb
from data import create_features

# Request the 2018 data together with lag offsets -1, -2, ..., -148.
# create_features hands back three objects; per the previews further down,
# ID carries the ID3 column, DA carries the MCP column, and features holds
# both plus one "ID3 (-k)" column per lag.
years = [2018]
n_lags = 148
lags = range(-1, -(n_lags + 1), -1)
ID, DA, features = create_features(years, lags)

importing Jupyter notebook from data.ipynb


In [3]:
# First five rows of ID (timestamp-indexed, single ID3 column in the output).
ID.head(n=5)

Unnamed: 0_level_0,ID3
Instrument,Unnamed: 1_level_1
2018-01-01 00:00:00,14.586875
2018-01-01 01:00:00,12.990924
2018-01-01 02:00:00,22.150235
2018-01-01 03:00:00,21.917514
2018-01-01 04:00:00,21.621781


In [4]:
# First five rows of DA (timestamp-indexed, single MCP column in the output).
DA.head(n=5)

Unnamed: 0_level_0,MCP
Instrument,Unnamed: 1_level_1
2018-01-01 00:00:00,27.2
2018-01-01 01:00:00,27.3
2018-01-01 02:00:00,30.1
2018-01-01 03:00:00,20.87
2018-01-01 04:00:00,25.56


# Create features

## Price

In [5]:
# First five rows of the feature table: ID3, MCP and the lagged ID3 columns.
features.head(n=5)

Unnamed: 0_level_0,ID3,MCP,ID3 (-1),ID3 (-2),ID3 (-3),ID3 (-4),ID3 (-5),ID3 (-6),ID3 (-7),ID3 (-8),...,ID3 (-139),ID3 (-140),ID3 (-141),ID3 (-142),ID3 (-143),ID3 (-144),ID3 (-145),ID3 (-146),ID3 (-147),ID3 (-148)
Instrument,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01 00:00:00,14.586875,27.2,12.990924,22.150235,21.917514,21.621781,22.352647,23.071457,24.345686,25.257541,...,28.133026,25.573566,21.338662,16.894157,13.99611,10.895444,4.704599,4.923619,5.496337,9.487713
2018-01-01 01:00:00,12.990924,27.3,22.150235,21.917514,21.621781,22.352647,23.071457,24.345686,25.257541,25.114449,...,25.573566,21.338662,16.894157,13.99611,10.895444,4.704599,4.923619,5.496337,9.487713,10.044227
2018-01-01 02:00:00,22.150235,30.1,21.917514,21.621781,22.352647,23.071457,24.345686,25.257541,25.114449,24.868167,...,21.338662,16.894157,13.99611,10.895444,4.704599,4.923619,5.496337,9.487713,10.044227,12.999145
2018-01-01 03:00:00,21.917514,20.87,21.621781,22.352647,23.071457,24.345686,25.257541,25.114449,24.868167,25.978476,...,16.894157,13.99611,10.895444,4.704599,4.923619,5.496337,9.487713,10.044227,12.999145,16.051924
2018-01-01 04:00:00,21.621781,25.56,22.352647,23.071457,24.345686,25.257541,25.114449,24.868167,25.978476,26.555409,...,13.99611,10.895444,4.704599,4.923619,5.496337,9.487713,10.044227,12.999145,16.051924,22.510375


# Separate train and test sets

In [6]:
# Hold out 30% of the rows for testing. ID3 is the target; every remaining
# column (MCP and the lagged ID3 values) is a predictor.
# NOTE(review): the rows are hourly observations and the predictors are lagged
# copies of the target, so a shuffled split mixes time periods between train
# and test — confirm that is acceptable for this feature-selection step.
X_train, X_test, y_train, y_test = train_test_split(
    features.drop(columns=['ID3']),
    features.loc[:, 'ID3'],
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

(X_train.shape, X_test.shape)

((5551, 149), (2380, 149))

In [7]:
X_train.head(5)

Unnamed: 0_level_0,MCP,ID3 (-1),ID3 (-2),ID3 (-3),ID3 (-4),ID3 (-5),ID3 (-6),ID3 (-7),ID3 (-8),ID3 (-9),...,ID3 (-139),ID3 (-140),ID3 (-141),ID3 (-142),ID3 (-143),ID3 (-144),ID3 (-145),ID3 (-146),ID3 (-147),ID3 (-148)
Instrument,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-05-03 13:00:00,38.61,42.084034,45.08074,51.122366,52.015268,51.006758,47.733694,44.376636,43.058198,41.689196,...,72.060866,50.615229,47.282028,43.7112,45.261673,47.598988,49.075267,49.15546,48.127858,50.171621
2018-01-26 16:00:00,48.25,48.289236,46.781604,43.841152,39.925535,42.101536,41.918745,40.06378,35.150921,33.951,...,63.226417,70.535615,63.499484,56.231114,52.28803,53.306764,52.696084,50.771934,48.730095,41.78378
2018-09-11 13:00:00,70.0,69.253482,80.048755,79.505328,77.548151,75.424293,73.58832,61.398192,56.647576,52.113629,...,53.872761,50.743267,56.051835,63.324357,69.422585,73.439432,74.115821,78.781368,79.446161,82.135151
2018-12-16 22:00:00,56.92,67.923885,68.229592,58.184333,58.666775,62.722978,70.149414,74.805337,74.785691,75.115117,...,69.180784,70.461262,67.409818,61.838669,64.747692,59.653026,54.260255,53.110401,42.290105,40.798089
2018-08-27 22:00:00,54.64,51.870694,50.17117,50.948705,54.784392,61.94054,74.031672,87.291319,93.491137,86.833391,...,68.325695,68.329344,69.055205,66.750018,61.526929,58.00079,57.201008,57.674137,64.186213,66.739429


# Feature scaling

In [8]:
# Learn the per-column scaling statistics from the training rows only.
scaler = StandardScaler().fit(X_train)
scaler

StandardScaler()

# LASSO

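Specify a logistic regression model via `LogisticRegression()` with a LASSO (L1) penalty and then select features via `SelectFromModel()`. (The logistic variant in the next cell is commented out; the cell that is actually executed fits a `Lasso()` regression instead.)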

It is necessary to encode the training labels.

In [9]:
# Disabled alternative (not executed): feature selection with an
# L1-penalised LogisticRegression instead of Lasso.
# sel_ = SelectFromModel(
#     LogisticRegression(C=0.5, penalty='l1', solver='liblinear', random_state=10))

# sel_.fit(scaler.transform(X_train), y_train)

In [10]:
# LabelEncoder replaces every distinct target value with its position among
# the sorted unique values, so the continuous ID3 target is turned into an
# integer rank. Store that encoding under its own name rather than overwriting
# y_train, so the original target values remain available to later cells.
# NOTE(review): Lasso is a regressor and could be fitted on the raw y_train;
# alpha=100 was presumably chosen against the rank scale and would need
# re-tuning if the encoding is dropped — confirm rank-encoding is intended.
lab_enc = LabelEncoder()
y_train_encoded = lab_enc.fit_transform(y_train)

# Fit the L1-penalised model on standardised predictors; SelectFromModel keeps
# the features whose coefficients survive the penalty.
sel_ = SelectFromModel(Lasso(alpha=100, random_state=10))
sel_.fit(scaler.transform(X_train), y_train_encoded)

SelectFromModel(estimator=Lasso(alpha=100, random_state=10))

# Visualise 

In [11]:
# Boolean mask over the predictor columns: True where the selector kept one.
sel_.get_support(indices=False)

array([ True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [12]:
# Names of the retained predictors, looked up by their integer positions.
X_train.columns[sel_.get_support(indices=True)]

Index(['MCP', 'ID3 (-1)', 'ID3 (-24)', 'ID3 (-48)', 'ID3 (-96)', 'ID3 (-144)',
       'ID3 (-145)'],
      dtype='object')

In [13]:
# Summarise the selection: how many predictors went in, how many were kept,
# and how many Lasso coefficients are exactly zero.
features_selected = X_train.columns[sel_.get_support()]

print(f'Total features: {len(X_train.columns)}')
print(f'Selected features: {features_selected.size}')
print(f'Features with coefficients shrank to zero: {np.count_nonzero(sel_.estimator_.coef_ == 0)}')

Total features: 149
Selected features: 7
Features with coefficients shrank to zero: 142
