# Acid
```
'RM01/0001'
'RM01/0004'
'RM01/0006'
'RM01/0007'
```

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import src.forecastor as fc
import src.preprocessor as pre

## Preparations
# Folder holding the raw CSV inputs. It defaults to the original local path so
# existing runs are unchanged; set the RM_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "RM_DATA_DIR",
    "/Users/huangp/Documents/Barry/sideproject/Raw_Material_Price_Prediction/data/raw",
))

# Import data
# NOTE(review): the two integers are presumably the first and last year to keep —
# confirm against src.preprocessor.
gas_df = pre.get_Fred_data('PNGASEUUSDM', 2014, 2024)
wheat_df = pre.get_Fred_data('PWHEAMTUSDM', 2014, 2024)
ammonia_df = pre.get_Fred_data('WPU0652013A', 2014, 2024)
# Paths are passed as plain strings, exactly as before.
elec_df = pre.clean_elec_csv(str(DATA_DIR / 'ELECTRICITY_03_2024.csv'), 2014, 2024)

df = pre.clean_pred_price_evo_csv(
    str(DATA_DIR / 'Dataset_Future_Predicting_Price_Evolutions_202403.csv'), 2014, 2023
)

# Target group name; used in the feature-engineering call and in printed result labels.
target = 'acid'.lower()

# Raw-material codes evaluated in this notebook.
RM_codes = ['RM01/0001', 'RM01/0004', 'RM01/0006', 'RM01/0007']

# External price drivers, forwarded as keyword arguments to pre.generate_features.
external_drivers = {
    "PNGASEUUSDM": gas_df,
    "PWHEAMTUSDM": wheat_df,
    "WPU0652013A": ammonia_df,
    "Electricity": elec_df
}

# Ten consecutive half-year windows from 2019-01 to 2024-01, each given as a
# pair of date strings; every model below is scored once per window.
test_periods = [
    ('2019-01-01', '2019-07-01'),
    ('2019-07-01', '2020-01-01'),
    ('2020-01-01', '2020-07-01'),
    ('2020-07-01', '2021-01-01'),
    ('2021-01-01', '2021-07-01'),
    ('2021-07-01', '2022-01-01'),
    ('2022-01-01', '2022-07-01'),
    ('2022-07-01', '2023-01-01'),
    ('2023-01-01', '2023-07-01'),
    ('2023-07-01', '2024-01-01')
]

# Lags (in months, per the printed labels) at which each model is evaluated.
lags = [1, 3, 6]

# Passed as the last argument to the Lasso training helpers.
# NOTE(review): presumably the lower bound of the alpha search grid — confirm in src.forecastor.
alpha_bottom = 0.01

In [2]:
# Impute raw data of target variables
imputed_df, missing = pre.impute_pred_price_evo_csv(df)

# Feature engineering
dummy_df = pre.get_dummies_and_average_price(imputed_df, target, *RM_codes)
# NOTE(review): the leading 1 and 12 are positional arguments whose meaning is
# defined by pre.generate_features — confirm before changing them.
feature_df = pre.generate_features(1, 12, dummy_df, missing, *RM_codes, **external_drivers)

# Convert Time only when it is not already a datetime column. The dtype helper
# is used because comparing type(...) with the string "datetime64" never matches.
if not pd.api.types.is_datetime64_any_dtype(feature_df['Time']):
    feature_df['Time'] = pd.to_datetime(feature_df['Time'])

# Accept any datetime64 resolution or timezone rather than only "datetime64[ns]".
assert pd.api.types.is_datetime64_any_dtype(feature_df['Time']), "df[Time] is not datetime64."

# Keep rows from 2016 onwards. The explicit copy makes the filtered frame
# independent of the unfiltered one, so later column assignments are unambiguous.
feature_df = feature_df[feature_df.Year >= 2016].copy()

  .resample('M') \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ar_df.rename(columns={"Average_price": "AR"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ar_df['Time_label'] = ar_df['Time'].dt.strftime('%Y-%m')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label.rename(columns={'Time': f'Time_label{i}'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.

In [3]:
# Persistent Naive
# For every (RM code, lag) pair, score the baseline on each test window and
# report the mean of the per-window values returned by fc.persistence_Naive_MAPE.
for rm_code in RM_codes:
    for lag_months in lags:
        mape_values = [
            fc.persistence_Naive_MAPE(feature_df, rm_code, lag_months, window)
            for window in test_periods
        ]

        # Sanity check: exactly one score per test window.
        assert len(mape_values) == len(test_periods), "len(mape_values)!=len(test_periods)"
        average_mape = np.mean(mape_values)
        print(f"{target} {rm_code}, {lag_months}-month lag, Naive, average MAPE: {average_mape:.3f}")

acid RM01/0001, 1-month lag, Naive, average MAPE: 4.317
acid RM01/0001, 3-month lag, Naive, average MAPE: 11.371
acid RM01/0001, 6-month lag, Naive, average MAPE: 21.451
acid RM01/0004, 1-month lag, Naive, average MAPE: 9.444
acid RM01/0004, 3-month lag, Naive, average MAPE: 14.865
acid RM01/0004, 6-month lag, Naive, average MAPE: 25.072
acid RM01/0006, 1-month lag, Naive, average MAPE: 9.801
acid RM01/0006, 3-month lag, Naive, average MAPE: 13.123
acid RM01/0006, 6-month lag, Naive, average MAPE: 21.071
acid RM01/0007, 1-month lag, Naive, average MAPE: 12.455
acid RM01/0007, 3-month lag, Naive, average MAPE: 16.383
acid RM01/0007, 6-month lag, Naive, average MAPE: 23.324


In [4]:
# Lasso with autoregression features only
# For every (RM code, lag) pair, call fc.train_model_AR once per test window
# and report the mean of the returned per-window values.
for rm_code in RM_codes:
    for lag_months in lags:
        mape_values = [
            fc.train_model_AR(feature_df, rm_code, lag_months, window, alpha_bottom)
            for window in test_periods
        ]

        # Sanity check: exactly one score per test window.
        assert len(mape_values) == len(test_periods), "len(mape_values)!=len(test_periods)"
        average_mape = np.mean(mape_values)
        print(f"{target} {rm_code}, {lag_months}-month lag, AR, average MAPE: {average_mape:.3f}")

acid RM01/0001, 1-month lag, AR, average MAPE: 5.503
acid RM01/0001, 3-month lag, AR, average MAPE: 13.357
acid RM01/0001, 6-month lag, AR, average MAPE: 26.879
acid RM01/0004, 1-month lag, AR, average MAPE: 9.600
acid RM01/0004, 3-month lag, AR, average MAPE: 14.473
acid RM01/0004, 6-month lag, AR, average MAPE: 22.958
acid RM01/0006, 1-month lag, AR, average MAPE: 10.154
acid RM01/0006, 3-month lag, AR, average MAPE: 14.806
acid RM01/0006, 6-month lag, AR, average MAPE: 21.118
acid RM01/0007, 1-month lag, AR, average MAPE: 16.910


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


acid RM01/0007, 3-month lag, AR, average MAPE: 24.730


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


acid RM01/0007, 6-month lag, AR, average MAPE: 28.414


In [5]:
# Lasso with autoregression features and external price drivers
# For every (RM code, lag) pair, call fc.train_model_all_features once per test
# window and report the mean of the returned per-window values.
for rm_code in RM_codes:
    for lag_months in lags:
        mape_values = [
            fc.train_model_all_features(feature_df, rm_code, lag_months, window, alpha_bottom)
            for window in test_periods
        ]

        # Sanity check: exactly one score per test window.
        assert len(mape_values) == len(test_periods), "len(mape_values)!=len(test_periods)"
        average_mape = np.mean(mape_values)
        print(f"{target} {rm_code}, {lag_months}-month lag, all features, average MAPE: {average_mape:.3f}")

acid RM01/0001, 1-month lag, all features, average MAPE: 5.745
acid RM01/0001, 3-month lag, all features, average MAPE: 11.314
acid RM01/0001, 6-month lag, all features, average MAPE: 19.376
acid RM01/0004, 1-month lag, all features, average MAPE: 12.864
acid RM01/0004, 3-month lag, all features, average MAPE: 21.045
acid RM01/0004, 6-month lag, all features, average MAPE: 32.628


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


acid RM01/0006, 1-month lag, all features, average MAPE: 14.012


  model = cd_fast.enet_coordinate_descent(


acid RM01/0006, 3-month lag, all features, average MAPE: 14.748
acid RM01/0006, 6-month lag, all features, average MAPE: 22.476


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


acid RM01/0007, 1-month lag, all features, average MAPE: 23.898


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


acid RM01/0007, 3-month lag, all features, average MAPE: 27.501


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


acid RM01/0007, 6-month lag, all features, average MAPE: 28.915
