In [1]:
!pip install fredapi

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import preprocessor as pre
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# External price series loaded through the project's preprocessor helpers.
# NOTE(review): the IDs look like FRED series codes and the two integers look like
# a start/end year - confirm against preprocessor.get_Fred_data.
gas_df = pre.get_Fred_data('PNGASEUUSDM', 2015, 2023)
ammonia_df = pre.get_Fred_data('WPU0652013A', 2015, 2023)
wheat_df = pre.get_Fred_data('PWHEAMTUSDM', 2015, 2023)

# NOTE(review): elec_df is not referenced again in the cells visible here.
elec_df = pre.clean_elec_csv('ELECTRICITY.csv', 2015, 2023)

# The price data starts one year later (2016) than the external series (2015).
# NOTE(review): presumably so that every lagged feature has history available - confirm.
price_evo_df = pre.clean_pred_price_evo_csv('Dataset_Predicting_Price_Evolutions.csv', 2016, 2023)

# Parentheses already allow the call to span several lines, so no backslashes are needed.
dummy_df = pre.get_dummies_and_average_price(
    price_evo_df, 'acid',
    'RM01/0001', 'RM01/0004', 'RM01/0006', 'RM01/0007',
)

# The resulting frame carries one column per series and suffix (_1, _2, ... _12);
# NOTE(review): presumably the arguments 1 and 12 are the first and last lag in months - confirm.
adid_df = pre.generate_features(
    1, 12, dummy_df,
    PWHEAMTUSDM=wheat_df,
    WPU0652013A=ammonia_df,
    PNGASEUUSDM=gas_df,
)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line to the output.
adid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2571 entries, 0 to 2570
Data columns (total 44 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Time               2571 non-null   datetime64[ns]
 1   Group Description  2571 non-null   object        
 2   Year               2571 non-null   int64         
 3   Month              2571 non-null   int64         
 4   RM01/0004          2571 non-null   uint8         
 5   RM01/0006          2571 non-null   uint8         
 6   RM01/0007          2571 non-null   uint8         
 7   Average_price      2571 non-null   float64       
 8   PWHEAMTUSDM_1      2571 non-null   float64       
 9   WPU0652013A_1      2571 non-null   float64       
 10  PNGASEUUSDM_1      2571 non-null   float64       
 11  PWHEAMTUSDM_2      2571 non-null   float64       
 12  WPU0652013A_2      2571 non-null   float64       
 13  PNGASEUUSDM_2      2571 non-null   float64       
 14  PWHEAMTU

In [4]:
## train_test_split()
## Log transformation

# Create X, y
# feature_list is a DataFrame (not a list): every column except the identifiers,
# the calendar columns and the target. Its column order defines the order of X.
feature_list = adid_df.drop(['Time', 'Group Description', 'Year','Month','Average_price'],axis=1)
X = feature_list.values
y = adid_df['Average_price'].values

# np.log silently yields -inf / NaN (with only a RuntimeWarning) for non-positive
# inputs, which would surface later as an unrelated error. Fail here with a clear message.
assert (y > 0).all(), "Average_price must be strictly positive before the log transformation"

# NOTE(review): this is a random row-wise split. If several rows share the same month
# (and therefore the same lagged feature values), near-identical rows can land in both
# train and test - consider a time-based split; confirm against the data layout.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # 30% of our data as the test set

# Log transformation and standardisation
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

# Scalers are fitted on the training split only and then reused on the test split,
# so no test-set statistics enter the transformation.
scaler_x = StandardScaler()
X_train_scaled = scaler_x.fit_transform(X_train)
X_test_scaled = scaler_x.transform(X_test)

# StandardScaler expects 2-D input, hence the reshape of the 1-D target to one column.
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train_log.reshape(-1,1))
y_test_scaled = scaler_y.transform(y_test_log.reshape(-1,1))

In [5]:
## Lasso regression - tune alpha on the training set
## 5-fold cross validation with a randomized hyperparameter search (RandomizedSearchCV)

# Candidate alphas: 3000 evenly spaced values from 1e-7 to 1.
# NOTE(review): the recorded run picked the smallest candidate (1e-07), i.e. the lower
# edge of this range - a log-spaced range may explore small alphas better.
param_grid = {'alpha': np.linspace(1e-7, 1, 3000)}

# Untuned estimator; the search clones it for every sampled alpha.
lasso = Lasso()

# Sample 300 of the candidates (seeded) and score each with 5-fold CV.
random_search = RandomizedSearchCV(
    lasso,
    param_grid,
    n_iter=300,
    cv=5,
    random_state=42,
)

# Run the randomized search on the scaled training data.
random_search.fit(X_train_scaled, y_train_scaled)

# Winning alpha and its mean cross-validated score (default scorer of the estimator).
print("Best alpha parameter:", random_search.best_params_['alpha'])
print("Best R-squared score:", round(random_search.best_score_, 3))

# Guard: the names zipped below must line up one-to-one with the fitted coefficients.
assert len(feature_list.columns) == random_search.n_features_in_

# One line per feature: name and coefficient of the refitted best model.
print("Coefficients of the selected features in the best Lasso model:")
for feature, coefficient in zip(
    feature_list.columns,
    random_search.best_estimator_.coef_,
):
    print(f"{feature}: {round(coefficient,3)}")

Best alpha parameter: 1e-07
Best R-squared score: 0.922
Coefficients of the selected features in the best Lasso model:
RM01/0004: -0.032
RM01/0006: 0.267
RM01/0007: -0.718
PWHEAMTUSDM_1: 0.066
WPU0652013A_1: 0.257
PNGASEUUSDM_1: 0.105
PWHEAMTUSDM_2: -0.032
WPU0652013A_2: -0.199
PNGASEUUSDM_2: 0.058
PWHEAMTUSDM_3: 0.31
WPU0652013A_3: 0.103
PNGASEUUSDM_3: 0.286
PWHEAMTUSDM_4: 0.02
WPU0652013A_4: -0.003
PNGASEUUSDM_4: 0.003
PWHEAMTUSDM_5: 0.037
WPU0652013A_5: 0.016
PNGASEUUSDM_5: 0.038
PWHEAMTUSDM_6: -0.081
WPU0652013A_6: -0.443
PNGASEUUSDM_6: -0.054
PWHEAMTUSDM_7: 0.181
WPU0652013A_7: -0.238
PNGASEUUSDM_7: 0.06
PWHEAMTUSDM_8: -0.116
WPU0652013A_8: -0.23
PNGASEUUSDM_8: 0.084
PWHEAMTUSDM_9: 0.141
WPU0652013A_9: 0.264
PNGASEUUSDM_9: 0.117
PWHEAMTUSDM_10: -0.122
WPU0652013A_10: 0.098
PNGASEUUSDM_10: 0.211
PWHEAMTUSDM_11: -0.05
WPU0652013A_11: -0.302
PNGASEUUSDM_11: 0.012
PWHEAMTUSDM_12: -0.006
WPU0652013A_12: 0.115
PNGASEUUSDM_12: 0.121


In [6]:
## Lasso regression - evaluate the tuned model on the held-out test set

# The search refits the winning configuration on the full training set;
# that refitted estimator is what gets evaluated here.
best_lasso_model = random_search.best_estimator_

# Score on the scaled test split (same scaling as used for training).
test_score = best_lasso_model.score(X_test_scaled, y_test_scaled)

# Test-set predictions, on the scaled log-price scale the model was trained on.
y_pred_test = best_lasso_model.predict(X_test_scaled)

# Report the chosen model and its test-set score.
print("Best Model:", best_lasso_model)
print("Test Set R-squared score:", round(test_score, 3))



Best Model: Lasso(alpha=1e-07)
Test Set R-squared score: 0.917
