## TO-DOs
```
[v] Import electrcity data, transform to daily price
[v] Import TTF_GAS data, transform to daily price
[v] Import price evaluatioin data, filtering with Alkalis_RM02_0001
[v] Create 21-days earlier features
[v] Combine features with target variables
!!! # Some issues exist when join historical factor prices with target variables
[] Data scaling
[] check multicollinearity
[] train_test_split()
[] Lasso regression
[] Cross validation
```

In [1]:
!pip install fredapi
!pip install pandasql

Defaulting to user installation because normal site-packages is not writeable
Collecting fredapi
  Downloading fredapi-0.5.1-py3-none-any.whl.metadata (5.0 kB)
Downloading fredapi-0.5.1-py3-none-any.whl (11 kB)
Installing collected packages: fredapi
Successfully installed fredapi-0.5.1
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
from fredapi import Fred
from pandasql import sqldf
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def monthly_mean_to_daily(df_monthly: pd.core.frame.DataFrame ) -> pd.core.frame.DataFrame:
    """
    Convert Monthly data into Daily data and impute with monthly mean prices
    """
    df_monthly['Date'] = pd.to_datetime(df_monthly[['Year', 'Month']].assign(DAY=1))
    df = df_monthly.explode('Date') # The explode() method converts each element of the specified column(s) into a row.

    # Generate a complete range of daily dates for the year for imputation
    start_date = df['Date'].min() # represents the starting point of your data
    end_date = df['Date'].max() + pd.offsets.MonthEnd(1)  # finds the maximum (or latest) date and include the last month fully
    full_date_range = pd.date_range(start=start_date, end=end_date, freq='D') # generates a fixed-frequency DatetimeIndex

    # Merge the full date range with the monthly averages to fill in all days
    df_full_date_range = pd.DataFrame(full_date_range, columns=['Date'])
    df = pd.merge(df_full_date_range, df_monthly, on='Date', how='left')
    df_daily = df.ffill(axis=0) # to fill the missing value based on last valid observation following index sequence
    return df_daily

## To import Electricity price from 2012 to 2023, and extend daily prices with monthly mean prices.

In [4]:
elec_df_monthly = pd.read_csv('ELECTRICITY.csv').iloc[:,1:]
elec_df_daily = monthly_mean_to_daily(elec_df_monthly)

elec_df_daily = elec_df_daily[elec_df_daily['Year']>=2012].reset_index().drop(['index'], axis=1)

print(elec_df_daily)
print(elec_df_daily.isna().sum().sort_values()) # checking missing values

           Date    Year  Month  Electricity
0    2012-01-01  2012.0    1.0        56.13
1    2012-01-02  2012.0    1.0        56.13
2    2012-01-03  2012.0    1.0        56.13
3    2012-01-04  2012.0    1.0        56.13
4    2012-01-05  2012.0    1.0        56.13
...         ...     ...    ...          ...
4378 2023-12-27  2023.0   12.0       112.54
4379 2023-12-28  2023.0   12.0       112.54
4380 2023-12-29  2023.0   12.0       112.54
4381 2023-12-30  2023.0   12.0       112.54
4382 2023-12-31  2023.0   12.0       112.54

[4383 rows x 4 columns]
Date           0
Year           0
Month          0
Electricity    0
dtype: int64


## To import EU Gas price from 2012 to 2023, and extend daily prices with monthly mean prices.

In [5]:
apiKey = '29219060bc68b2802af8584e0f328b52'
fred = Fred(api_key=apiKey)

# Natural Gas prices in Europe per month
TTF_GAS = pd.DataFrame(fred.get_series('PNGASEUUSDM'), 
                       columns=['PNGASEUUSDM']).reset_index() 
TTF_GAS['index'] = pd.to_datetime(TTF_GAS['index'], format='%Y-%m-%d')
TTF_GAS['Year'] = TTF_GAS['index'].dt.year
TTF_GAS['Month'] = TTF_GAS['index'].dt.month
TTF_GAS = TTF_GAS.drop(['index'], axis=1)
TTF_GAS_2012_23_monthly = TTF_GAS[TTF_GAS['Year']>=2012].reset_index().drop(['index'], axis=1)

TTF_GAS_daily = monthly_mean_to_daily(TTF_GAS_2012_23_monthly)
print(TTF_GAS_daily.info())
print(TTF_GAS_daily.isna().sum().sort_values()) # Check missing values




<class 'pandas.core.frame.DataFrame'>
Int64Index: 4383 entries, 0 to 4382
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Date         4383 non-null   datetime64[ns]
 1   PNGASEUUSDM  4383 non-null   float64       
 2   Year         4383 non-null   float64       
 3   Month        4383 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 171.2 KB
None
Date           0
PNGASEUUSDM    0
Year           0
Month          0
dtype: int64


In [6]:
price_evolutions_df = pd.read_csv('Dataset_Predicting_Price_Evolutions.csv').iloc[:,1:].sort_values(by=['POSTING DATE', 'Key RM code'])
price_evolutions_df = price_evolutions_df.drop(['SITE', 'SUPPLIER NUMBER', 'PURCHASE NUMBER', 'WEIGHT (kg)'], axis=1)
price_evolutions_df['POSTING DATE'] = pd.to_datetime(price_evolutions_df['POSTING DATE'], format='%Y-%m-%d')
price_evolutions_df['Year'] = price_evolutions_df['POSTING DATE'].dt.year
price_evolutions_df['Month'] = price_evolutions_df['POSTING DATE'].dt.month
price_evolutions_df.rename(columns={'POSTING DATE':'Date',
                                   'Group Description':'Group_Description',
                                   'Key RM code':'Key_RM_code',
                                   'PRICE (EUR/kg)':'PRICE'},
                                    inplace=True)

print(price_evolutions_df.info())
print(price_evolutions_df)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20570 entries, 20534 to 16
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               20570 non-null  datetime64[ns]
 1   Group_Description  20570 non-null  object        
 2   Key_RM_code        20570 non-null  object        
 3   PRICE              20570 non-null  float64       
 4   Year               20570 non-null  int64         
 5   Month              20570 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 1.1+ MB
None
            Date     Group_Description Key_RM_code     PRICE  Year  Month
20534 2012-01-31                  acid   RM01/0001  0.500000  2012      1
20547 2012-01-31                  acid   RM01/0001  0.492479  2012      1
20560 2012-01-31                  acid   RM01/0001  0.485000  2012      1
20569 2012-01-31                  acid   RM01/0001  0.500000  2012   

In [13]:
df = price_evolutions_df[price_evolutions_df['Group_Description']=='Alkalis']

# Function to use pandasql
# pysqldf = lambda q: sqldf(q, globals())
# query = """
# SELECT Key_RM_code, count(Key_RM_code) 
# FROM df
# GROUP BY Key_RM_code;
# """
# print(pysqldf(query))
#   Key_RM_code  count(Key_RM_code)
# 0   RM02/0001                5293
# 1   RM02/0002                 714

# print(df.info())

labeled_date = df['Date'] - pd.Timedelta(days=21)
# print(date_labels.head(10))

df_0 = pd.merge(TTF_GAS_daily, elec_df_daily,how='left', on = ['Date', 'Year', 'Month'])
# print(df_0.head(10))
twenty_one_days_earlier_feature_df = df_0[df_0['Date'].isin(labeled_date)]

twenty_one_days_earlier_feature_df.to_csv('twenty_one_days_earlier_feature_df.csv',index=False)
# twenty_one_days_earlier_feature_df.rename(columns={'Date':'History_Date'},inplace=True)
# df['History_Date']=df['Date'] - pd.Timedelta(days=21)
# df = pd.merge(df, twenty_one_days_earlier_feature_df, how='left', on = ['History_Date','Year', 'Month'])
# df = df.drop(['History_Date'], axis=1)

# Alkalis_RM02_0001 = df[df['Key_RM_code'] == 'RM02/0001']
# Alkalis_RM02_0001.rename(columns={'Electricity':'History_Electricity',
#                                    'PNGASEUUSDM':'History_PNGASEUUSDM'
#                                  },
#                                     inplace=True)
# print(Alkalis_RM02_0001.isna().sum().sort_values()) # Some issues exist when join historical factor prices with target variables


In [20]:
# Data scaling
# check multicollinearity
# train_test_split()

y = Alkalis_RM02_0001['PRICE'].values


# Looping through features
for feature in ["History_PNGASEUUSDM", "History_Electricity"]:
    X = df[[feature]].values

    # Split the data into training and test sets, setting test_size equal to 30% and using a random_state of 42.
    X_train, X_test, y_train, y_test = train_test_split(
        X, 
        y, 
        test_size=0.3, 
        random_state=42)
    print(np.isnan(X_train))
    print(np.isnan(X_test))
    print(np.isnan(y_train))
    print(np.isnan(y_test))

#     # Feature scaling
#     scaler = StandardScaler() 
#     X_train_scaled = scaler.fit_transform(X_train) 
#     X_test_scaled = scaler.transform(X_test)
    

#     scores = []
#     for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
#         lasso = Lasso(alpha=alpha)
#         lasso.fit(X_train_scaled, y_train)
#         lasso_pred = lasso.predict(X_test_scaled)
#         scores.append(lasso.score(X_test_scaled, y_test))
#         print(scores)


# # 4. Estimating feature correlation
# sns.heatmap(Alkalis_RM02_0001.drop(['Date','Group_Description','Key_RM_code','Year','Month','PRICE'], axis=1).corr(),annot=True)
# plt.show()

# # Select the final features for the model
# final_features = ["N", "K", "ph"] # 'P', 'K' have strong correlations, but the f1 score of 'K' is higher than than the one of 'P'.

# # 5. Producing a final model
# X = crops[final_features].values

# # Splitting the data with final_features
# X_train, X_test, y_train, y_test = train_test_split(
#     X, 
#     y, 
#     test_size=0.2, 
#     random_state=42)

# # Feature scaling
# scaler = StandardScaler() 
# X_train_scaled = scaler.fit_transform(X_train) 
# X_test_scaled = scaler.transform(X_test)

# # Train and evaluate the model
# log_reg = LogisticRegression(max_iter=2000, multi_class="multinomial")
# log_reg.fit(X_train_scaled, y_train)
# y_pred = log_reg.predict(X_test_scaled)
# model_performance=f1_score(y_test,y_pred,average="weighted")
# print(model_performance)


KeyError: "None of [Index(['History_PNGASEUUSDM'], dtype='object')] are in the [columns]"