# Next_month_price_prediction_Alkalis

## TO-DOs
```
[v] Import monthly electricity data
[v] Import monthly TTF_GAS data
[v] Import price evolution data
[v] Create rows and encoding Alkalis_RM02_0001, Alkalis_RM02_0002
[v] Calculate the monthly average prices of Alkalis
[v] Create 12*N features, external factor prices from one-month before to 12-month before
[v] Combine features with target variables
[] train_test_split() - do calculation and scaling only based on train data set to prevent data leakage
[x] Detect outliers - skip
[] Check data distribution
[] Data scaling
[] Check multicollinearity (run one regression per feature and compute the correlation between all features; keep the features with the best performance and the lowest mutual correlation for the final model)
[] Lasso regression - fit and transform train data set
[] Lasso regression - transform test data set
[] Cross validation
```

In [1]:
!pip install fredapi
!pip install pandasql

In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from fredapi import Fred
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
def monthly_mean_to_daily(df_monthly: pd.DataFrame) -> pd.DataFrame:
    """
    Convert monthly data into daily data, repeating each month's values on every day.

    Parameters
    ----------
    df_monthly : pd.DataFrame
        Monthly records with numeric 'Year' and 'Month' columns. The frame is
        not modified; the function works on a copy.

    Returns
    -------
    pd.DataFrame
        One row per calendar day, from the first day of the earliest month to
        the last day of the latest month. It has a 'Date' column followed by
        the columns of `df_monthly`, forward-filled from the first day of each
        month.

    Notes
    -----
    Forward-filling carries the last valid observation down the rows, so a
    month that is absent from `df_monthly` (or a NaN value inside it) inherits
    the values of the preceding month.
    """
    # Work on a copy so the caller's frame does not silently gain a 'Date' column.
    monthly = df_monthly.copy()
    # Each monthly record is anchored on the first day of its month.
    monthly['Date'] = pd.to_datetime(monthly[['Year', 'Month']].assign(DAY=1))

    # Daily calendar spanning the data; MonthEnd(1) extends the range so the
    # last month is covered in full.
    start_date = monthly['Date'].min()
    end_date = monthly['Date'].max() + pd.offsets.MonthEnd(1)
    full_date_range = pd.date_range(start=start_date, end=end_date, freq='D')

    # Left-join the monthly rows onto the calendar: only first-of-month dates
    # match, every other day is NaN until the forward fill below.
    df_full_date_range = pd.DataFrame(full_date_range, columns=['Date'])
    df_daily = pd.merge(df_full_date_range, monthly, on='Date', how='left').ffill(axis=0)
    return df_daily

In [4]:
## Import monthly electricity data
# The first CSV column is dropped by .iloc[:, 1:] — presumably a saved index
# column; verify against the file.
elec_df = pd.read_csv('ELECTRICITY.csv').iloc[:, 1:]

# Build the first-of-month timestamp from the numeric Year/Month components
# instead of concatenating unpadded strings ('2011' + '1' -> '20111'), which
# only works as long as the parser accepts one-digit months.
elec_df['Time'] = pd.to_datetime(elec_df[['Year', 'Month']].assign(DAY=1))

# Keep 2011-2023 only and renumber the rows from 0.
elec_df = elec_df[elec_df['Year'].between(2011, 2023)].reset_index(drop=True)

elec_df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(elec_df.groupby(['Year']).count())  # rows per year
print(elec_df.isna().sum().sort_values())  # checking missing values

In [5]:
## Import monthly TTF_GAS data
# The FRED API key is read from the environment; credentials must not be stored
# in the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
apiKey = os.environ.get('FRED_API_KEY')
if not apiKey:
    raise RuntimeError('Set the FRED_API_KEY environment variable to your FRED API key.')
fred = Fred(api_key=apiKey)

# Get Natural Gas prices in Europe per month
TTF_gas_df = pd.DataFrame(fred.get_series('PNGASEUUSDM'),
                          columns=['PNGASEUUSDM']).reset_index()
# After reset_index() the observation dates are in the 'index' column.
TTF_gas_df['index'] = pd.to_datetime(TTF_gas_df['index'], format='%Y-%m-%d')
TTF_gas_df['Year'] = TTF_gas_df['index'].dt.year
TTF_gas_df['Month'] = TTF_gas_df['index'].dt.month
TTF_gas_df = TTF_gas_df[TTF_gas_df['Year'].between(2011, 2023)]
# 'Time' is the shared date column name used when merging with the other frames.
TTF_gas_df = TTF_gas_df.rename(columns={'index': 'Time'})

TTF_gas_df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(TTF_gas_df.groupby(['Year']).count())  # rows per year
print(TTF_gas_df.isna().sum().sort_values())  # Check missing values




In [6]:
## Import price evolution data
# The first CSV column is dropped by .iloc[:, 1:] — presumably a saved index
# column; verify against the file.
price_evo_df = pd.read_csv('Dataset_Predicting_Price_Evolutions.csv').iloc[:, 1:]
price_evo_df['POSTING DATE'] = pd.to_datetime(price_evo_df['POSTING DATE'], format='%Y-%m-%d')
price_evo_df['Year'] = price_evo_df['POSTING DATE'].dt.year
price_evo_df['Month'] = price_evo_df['POSTING DATE'].dt.month

# Keep 2012-2023 only and renumber the rows from 0.
price_evo_df = price_evo_df[price_evo_df['Year'].between(2012, 2023)].reset_index(drop=True)

# 'Time' is the shared date column name used by the other frames.
price_evo_df = price_evo_df.rename(columns={'POSTING DATE': 'Time'})

# Drop unnecessary columns
price_evo_df = price_evo_df.drop(['SITE', 'SUPPLIER NUMBER', 'PURCHASE NUMBER', 'WEIGHT (kg)'], axis=1)

price_evo_df.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(price_evo_df.groupby(['Year']).count())  # rows per year
print(price_evo_df.isna().sum().sort_values())  # Check missing values



In [7]:
## Create rows and encoding Alkalis_RM02_0001, Alkalis_RM02_0002
## Calculate the monthly average prices of Alkalis
Alkalis_df = price_evo_df[price_evo_df['Group Description'] == "Alkalis"].sort_values(['Year', 'Month'], ascending=True)
Alkalis_df = Alkalis_df.reset_index(drop=True)

# Encode 'Key RM code' with n-1 dummy variables (drop_first=True drops the first level).
Alkalis_df_dummies = pd.get_dummies(Alkalis_df['Key RM code'], drop_first=True)
# Combine the dummy variables with Alkalis_df and remove the original code column.
Alkalis_df_dummies = pd.concat([Alkalis_df, Alkalis_df_dummies], axis=1)
Alkalis_df_dummies = Alkalis_df_dummies.drop('Key RM code', axis=1)

## Calculate the average raw material price
# groupby(['Year', 'Month']) returns a Series indexed by a (Year, Month)
# MultiIndex; reset_index() turns those levels back into ordinary columns so
# the result can be merged on ['Year', 'Month'] below.
# NOTE(review): the mean is taken over all Alkalis rows of a month, so every
# 'Key RM code' gets the same Average_price — confirm that a per-code average
# is not what is wanted.
average_price = Alkalis_df_dummies.groupby(['Year', 'Month'])['PRICE (EUR/kg)']\
                                    .mean()\
                                    .reset_index()

# Merge the average monthly price with the original dataframe; the clashing
# price column coming from average_price gets the '_avg' suffix.
Alkalis_df_dummies = pd.merge(Alkalis_df_dummies, average_price, on=['Year', 'Month'], suffixes=('', '_avg'))

# Rename the new column to 'Average_price' and drop the per-row price.
Alkalis_df_dummies = Alkalis_df_dummies.rename(columns={'PRICE (EUR/kg)_avg': 'Average_price'})
Alkalis_df_dummies = Alkalis_df_dummies.drop('PRICE (EUR/kg)', axis=1)

Alkalis_df_dummies.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(Alkalis_df_dummies.sort_values('Time'))
print(Alkalis_df_dummies.isna().sum().sort_values())  # Check missing values


In [8]:
## Create 12*N features, external factor prices from one-month before to 12-month before
## Combine features with target variables
# To prepare feature datasets
merged_df = pd.merge(elec_df, TTF_gas_df, how='left', on=['Year', 'Month', 'Time'])
# 'Time_label' ('YYYY-MM') is the join key for the lag merges below; Year/Month/Time
# are dropped to prevent duplicate columns when merging.
feature_df = (
    merged_df
    .assign(Time_label=merged_df['Time'].dt.strftime('%Y-%m'))
    .drop(['Year', 'Month', 'Time'], axis=1)
)

# Create time labels: for every target row, the 'YYYY-MM' label of the month
# i months before its 'Time', for i = 1..12.
# ref: 'https://pandas.pydata.org/docs/user_guide/merging.html'
# Built from the 'Time' Series directly, without an in-place rename on a column
# slice of Alkalis_df_dummies.
label_dfs = [
    (Alkalis_df_dummies['Time'] - pd.DateOffset(months=i))
    .dt.strftime('%Y-%m')
    .rename(f'Time_label{i}')
    for i in range(1, 13)  # 13 is not included
]

result = pd.concat(label_dfs, axis=1)

# To merge with features: attach the feature values of each lagged month and
# suffix them with the lag.
for i in range(1, 13):  # 13 is not included
    # validate='many_to_one' raises if a 'Time_label' occurs more than once in
    # feature_df. Without it, duplicate months would silently multiply the rows
    # of `result` and misalign the index-based concat below.
    result = result.merge(feature_df, how='left',
                          left_on=[f'Time_label{i}'],
                          right_on=['Time_label'],
                          validate='many_to_one')
    result = result.rename(columns={'Electricity': f'Electricity_{i}',
                                    'PNGASEUUSDM': f'PNGASEUUSDM_{i}'})
    result = result.drop(['Time_label', f'Time_label{i}'], axis=1)

# Column-wise concat aligns on the index, so both frames must have the same row index.
# NOTE(review): this rebinds Alkalis_df_dummies, so re-running this cell without
# re-running the cell that builds Alkalis_df_dummies appends the lag columns a second time.
Alkalis_df_dummies = pd.concat([Alkalis_df_dummies, result], axis=1)
Alkalis_df_dummies.info()  # info() prints its own report and returns None, so it is not wrapped in print()
print(Alkalis_df_dummies)
