In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from joblib import dump, load

In [2]:
# Read the modelling dataset from a CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./data/model_data.csv')

In [3]:
# Promote the CSV's 'Index' column to the row index.
# Guarded so the cell is idempotent: once 'Index' has become the index it is
# no longer a column, and an unguarded set_index would raise KeyError on re-run.
if 'Index' in df.columns:
    df = df.set_index('Index')

In [4]:
# Preview the first rows of the frame to check the load and the new index.
df.head()

Unnamed: 0_level_0,Datetime,Storage,Date,GasPrice,DayOfYear,MaxTemp,CaisoPrice,OxbowFlow
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2018-01-01 08:00:00+00:00,160707.4,2018-01-01,5.0,1.0,58.0,35.027855,272.556824
1,2018-01-02 08:00:00+00:00,160224.4,2018-01-02,6.24,2.0,59.0,33.401521,253.365768
2,2018-01-03 08:00:00+00:00,159105.3,2018-01-03,6.24,3.0,60.0,70.573712,672.7779
3,2018-01-04 08:00:00+00:00,157783.6,2018-01-04,4.65,4.0,49.0,39.374058,574.729
4,2018-01-05 08:00:00+00:00,157012.5,2018-01-05,3.77,5.0,46.0,50.158102,533.2065


In [5]:
# Target: the 'OxbowFlow' column.
y = df.loc[:, 'OxbowFlow']
# Features: everything except the two date columns and the target itself.
# Alternative feature sets noted by the author: every column except
# 'OxbowPower', and 'DayOfYear' on its own.
X = df.drop(['Date', 'Datetime', 'OxbowFlow'], axis=1)

In [6]:
# Summarise the feature frame: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1563 entries, 0 to 2057
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Storage     1563 non-null   float64
 1   GasPrice    1563 non-null   float64
 2   DayOfYear   1563 non-null   float64
 3   MaxTemp     1563 non-null   float64
 4   CaisoPrice  1563 non-null   float64
dtypes: float64(5)
memory usage: 73.3 KB


In [7]:
# Display the target series (df['OxbowFlow']).
y

Index
0        272.556824
1        253.365768
2        672.777900
3        574.729000
4        533.206500
           ...     
2053    1008.626280
2054    1012.407470
2055    1001.064000
2056    1004.845150
2057    1012.407470
Name: OxbowFlow, Length: 1563, dtype: float64

In [8]:
# Display the feature frame (df without 'Date', 'Datetime' and 'OxbowFlow').
X

Unnamed: 0_level_0,Storage,GasPrice,DayOfYear,MaxTemp,CaisoPrice
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,160707.4,5.00,1.0,58.0,35.027855
1,160224.4,6.24,2.0,59.0,33.401521
2,159105.3,6.24,3.0,60.0,70.573712
3,157783.6,4.65,4.0,49.0,39.374058
4,157012.5,3.77,5.0,46.0,50.158102
...,...,...,...,...,...
2053,270054.8,2.55,228.0,88.0,138.984797
2054,268329.2,2.56,229.0,84.0,59.803049
2055,266471.3,2.44,230.0,81.0,40.905011
2056,264633.1,2.44,231.0,80.0,37.559229


In [9]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
# NOTE(review): the preview shows one row per date, and a random split mixes
# dates across train and test -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=101,
)

In [10]:
# Standardising scaler with its default settings spelled out.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Fit the scaler on the training rows only, then apply the same transformation
# to both splits (the test split is never used for fitting).
# NOTE: X_train / X_test are overwritten here, so re-running this cell would
# re-fit the scaler on already-scaled data.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Base estimator handed to the grid search; only the iteration cap and the
# convergence tolerance are set here.
base_elastic_net_model = ElasticNet(tol=0.01, max_iter=32_000)

In [13]:
# Candidate hyper-parameters for the grid search: 7 alpha values x 11 l1_ratio
# values = 77 combinations.
param_grid = dict(
    alpha=[0.05, 0.1, 1, 5, 10, 50, 100],
    l1_ratio=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95, 0.99, 1],
)

In [14]:
# 5-fold cross-validated search over param_grid, scored by negative mean
# squared error; verbose=2 prints one line per individual fit.
grid_model = GridSearchCV(
    base_elastic_net_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
)

In [15]:
# Run the grid search on the scaled training split.
grid_model.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 77 candidates, totalling 385 fits
[CV] END ...........................alpha=0.05, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.1; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.2; total time=   0.0s
[CV] END ...........................alpha=0.05, l1_ratio=0.3; total time=   0.0s
[CV] END ...........................alpha=0.05,

In [16]:
# Show the estimator selected by the grid search.
grid_model.best_estimator_

In [17]:
# Show the winning hyper-parameter combination from param_grid.
grid_model.best_params_

{'alpha': 1, 'l1_ratio': 1}

In [18]:
# Predict the target for the (scaled) held-out test split.
y_pred = grid_model.predict(X=X_test)

In [19]:
# Mean absolute error of the test-split predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

254.19928599672508

In [20]:
# Root mean squared error of the test-split predictions.
np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

300.740293382115

In [21]:
# Mean of 'OxbowFlow' over the whole dataset, rounded to a whole number, as a
# yardstick for the error metrics.
np.round(df['OxbowFlow'].mean())

611.0

In [36]:
# Test-split RMSE expressed as a percentage of the mean 'OxbowFlow' over the
# whole dataset, rounded to two decimals.
np.round(
    np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)) / df['OxbowFlow'].mean() * 100,
    decimals=2,
)

49.19

### Make a prediction with some future (fake) data

In [35]:
# First row of the scaled test split, shown as an example of scaled features.
X_test[0]

array([-0.87926449, -0.84625636, -1.22263376, -0.71041883, -1.11596402])

In [43]:
# One hand-written observation, in the same column order as the feature frame.
X_dummy = [
    160707.4,   # Storage
    5.00,       # Gas Price
    1.0,        # Day of Year
    58.0,       # Max Temp
    35.027855,  # Caiso Price
]

In [32]:
# Scale the hand-written observation with the scaler fitted on the training data.
# The scaler was fitted on a DataFrame with named columns; passing a bare list
# makes scikit-learn warn about missing feature names and silently relies on
# positional order. Wrapping the row in a one-row DataFrame that carries the
# feature frame's column labels keeps the mapping explicit.
X_dummy_scaled = scaler.transform(pd.DataFrame([X_dummy], columns=X.columns))



In [33]:
# Show the scaled version of the hand-written observation.
X_dummy_scaled

array([[-0.57657754,  0.85207547, -1.68677078, -0.21888155, -0.18905305]])

In [37]:
# Predict the target for the scaled hand-written observation.
y_future_pred = grid_model.predict(X=X_dummy_scaled)

In [42]:
# Show the single prediction as an integer.
# NOTE: astype(int) truncates the fractional part rather than rounding.
y_future_pred[0].astype(int) # in cfs

448