# Non-ionic surfactant
```
RM12/0002, RM12/0003, RM12/0005, RM12/0008, RM12/0012
```

In [1]:
import src.preprocessor as pre
import src.forecastor as fc
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np

# --- Configuration ---------------------------------------------------------
# Directory holding the raw CSV inputs. This is the single machine-specific
# setting: edit it here when running the notebook on another machine.
DATA_DIR = "/Users/huangp/Documents/Barry/sideproject/Raw_Material_Price_Prediction/data/raw"

# Year bounds handed to the loaders (external drivers vs. target price file).
START_YEAR = 2014
DRIVER_END_YEAR = 2024
PRICE_END_YEAR = 2023

# Import data: external price drivers, fetched by FRED series ID.
gas_df = pre.get_Fred_data('PNGASEUUSDM', START_YEAR, DRIVER_END_YEAR)
crude_oil_df = pre.get_Fred_data('POILBREUSDM', START_YEAR, DRIVER_END_YEAR)
palm_oil_df = pre.get_Fred_data('PPOILUSDM', START_YEAR, DRIVER_END_YEAR)
ammonia_df = pre.get_Fred_data('WPU0652013A', START_YEAR, DRIVER_END_YEAR)
elec_df = pre.clean_elec_csv(f"{DATA_DIR}/ELECTRICITY_03_2024.csv", START_YEAR, DRIVER_END_YEAR)

# Import data: raw-material price history (the prediction targets).
df = pre.clean_pred_price_evo_csv(
    f"{DATA_DIR}/Dataset_Future_Predicting_Price_Evolutions_202403.csv",
    START_YEAR,
    PRICE_END_YEAR,
)

# Raw-material group studied in this notebook and the codes belonging to it.
# The literal is already lower-case, so no .lower() call is needed.
target = 'non-ionic surfactant'
RM_codes = ['RM12/0002', 'RM12/0003', 'RM12/0005', 'RM12/0008', 'RM12/0012']

# External drivers keyed by name; unpacked as keyword arguments into
# pre.generate_features in the feature-engineering cell.
external_drivers = {
    "PNGASEUUSDM": gas_df,
    'POILBREUSDM': crude_oil_df,
    'PPOILUSDM': palm_oil_df,
    "WPU0652013A": ammonia_df,
    "Electricity": elec_df
}

# Back-test windows as (start, end) date strings: ten consecutive half-year
# windows running from 2019-01-01 to 2024-01-01.
test_periods = [
    ('2019-01-01', '2019-07-01'),
    ('2019-07-01', '2020-01-01'),
    ('2020-01-01', '2020-07-01'),
    ('2020-07-01', '2021-01-01'),
    ('2021-01-01', '2021-07-01'),
    ('2021-07-01', '2022-01-01'),
    ('2022-01-01', '2022-07-01'),
    ('2022-07-01', '2023-01-01'),
    ('2023-01-01', '2023-07-01'),
    ('2023-07-01', '2024-01-01')
]

# Lags (in months) evaluated for every RM code.
lags = [1,3,6]

# Passed to the Lasso training helpers in src.forecastor.
# NOTE(review): presumably the lower bound of the alpha search range - confirm.
alpha_bottom = 0.01

In [None]:
# Impute raw data of target variables
imputed_df, missing = pre.impute_pred_price_evo_csv(df)

# Feature engineering
dummy_df = pre.get_dummies_and_average_price(imputed_df, target, *RM_codes)
# NOTE(review): the leading 1 and 12 are presumably the lag range used to
# build features - confirm against pre.generate_features.
feature_df = pre.generate_features(1, 12, dummy_df, missing, *RM_codes, **external_drivers)

# Convert 'Time' only when it is not already datetime64[ns]. The check must
# look at the column's dtype: type(series) is a class object and never
# compares equal to a string, so a type()-based guard is always true.
if feature_df['Time'].dtype != "datetime64[ns]":
    feature_df['Time'] = pd.to_datetime(feature_df['Time'])

assert feature_df['Time'].dtype == "datetime64[ns]", "df[Time] is not datetime64."

# Keep only rows whose Year is 2016 or later.
feature_df = feature_df[feature_df.Year >= 2016]

In [3]:
# Persistent Naive baseline: for each RM code and lag, score every test
# period and report the mean of the per-period results.
for code in RM_codes:
    for lag in lags:
        # One result per test period, in test_periods order.
        mape_values = [
            fc.persistence_Naive_MAPE(feature_df, code, lag, period)
            for period in test_periods
        ]

        assert len(mape_values) == len(test_periods), "len(mape_values)!=len(test_periods)"
        average_mape = np.mean(mape_values)
        print(f"{target} {code}, {lag}-month lag, Naive, average MAPE: {average_mape:.3f}")

non-ionic surfactant RM12/0002, 1-month lag, Naive, average MAPE: 15.539
non-ionic surfactant RM12/0002, 3-month lag, Naive, average MAPE: 17.285
non-ionic surfactant RM12/0002, 6-month lag, Naive, average MAPE: 15.012
non-ionic surfactant RM12/0003, 1-month lag, Naive, average MAPE: 7.916
non-ionic surfactant RM12/0003, 3-month lag, Naive, average MAPE: 9.015
non-ionic surfactant RM12/0003, 6-month lag, Naive, average MAPE: 12.152
non-ionic surfactant RM12/0005, 1-month lag, Naive, average MAPE: 7.766
non-ionic surfactant RM12/0005, 3-month lag, Naive, average MAPE: 8.976
non-ionic surfactant RM12/0005, 6-month lag, Naive, average MAPE: 12.204
non-ionic surfactant RM12/0008, 1-month lag, Naive, average MAPE: 13.935
non-ionic surfactant RM12/0008, 3-month lag, Naive, average MAPE: 13.658
non-ionic surfactant RM12/0008, 6-month lag, Naive, average MAPE: 15.578
non-ionic surfactant RM12/0012, 1-month lag, Naive, average MAPE: 9.571
non-ionic surfactant RM12/0012, 3-month lag, Naive, aver

In [4]:
# Lasso with autoregression features only: for each RM code and lag, train
# and score on every test period, then report the mean of the results.
for code in RM_codes:
    for lag in lags:
        # One result per test period, in test_periods order.
        mape_values = [
            fc.train_model_AR(feature_df, code, lag, period, alpha_bottom)
            for period in test_periods
        ]

        assert len(mape_values) == len(test_periods), "len(mape_values)!=len(test_periods)"
        average_mape = np.mean(mape_values)
        print(f"{target} {code}, {lag}-month lag, AR, average MAPE: {average_mape:.3f}")

non-ionic surfactant RM12/0002, 1-month lag, AR, average MAPE: 15.849
non-ionic surfactant RM12/0002, 3-month lag, AR, average MAPE: 15.826
non-ionic surfactant RM12/0002, 6-month lag, AR, average MAPE: 16.586
non-ionic surfactant RM12/0003, 1-month lag, AR, average MAPE: 7.738
non-ionic surfactant RM12/0003, 3-month lag, AR, average MAPE: 8.465
non-ionic surfactant RM12/0003, 6-month lag, AR, average MAPE: 11.848
non-ionic surfactant RM12/0005, 1-month lag, AR, average MAPE: 8.592
non-ionic surfactant RM12/0005, 3-month lag, AR, average MAPE: 10.651
non-ionic surfactant RM12/0005, 6-month lag, AR, average MAPE: 12.858
non-ionic surfactant RM12/0008, 1-month lag, AR, average MAPE: 13.993
non-ionic surfactant RM12/0008, 3-month lag, AR, average MAPE: 14.002
non-ionic surfactant RM12/0008, 6-month lag, AR, average MAPE: 15.777
non-ionic surfactant RM12/0012, 1-month lag, AR, average MAPE: 12.486
non-ionic surfactant RM12/0012, 3-month lag, AR, average MAPE: 14.632
non-ionic surfactant RM

In [5]:
# Lasso with autoregression features and external price drivers: for each RM
# code and lag, train and score on every test period, then report the mean.
for code in RM_codes:
    for lag in lags:
        # One result per test period, in test_periods order.
        mape_values = [
            fc.train_model_all_features(feature_df, code, lag, period, alpha_bottom)
            for period in test_periods
        ]

        assert len(mape_values) == len(test_periods), "len(mape_values)!=len(test_periods)"
        average_mape = np.mean(mape_values)
        print(f"{target} {code}, {lag}-month lag, all features, average MAPE: {average_mape:.3f}")

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0002, 1-month lag, all features, average MAPE: 12.971


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0002, 3-month lag, all features, average MAPE: 15.945


  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0002, 6-month lag, all features, average MAPE: 14.231
non-ionic surfactant RM12/0003, 1-month lag, all features, average MAPE: 10.613
non-ionic surfactant RM12/0003, 3-month lag, all features, average MAPE: 11.299
non-ionic surfactant RM12/0003, 6-month lag, all features, average MAPE: 12.556


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0005, 1-month lag, all features, average MAPE: 12.122
non-ionic surfactant RM12/0005, 3-month lag, all features, average MAPE: 13.039
non-ionic surfactant RM12/0005, 6-month lag, all features, average MAPE: 12.558


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0008, 1-month lag, all features, average MAPE: 12.170


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0008, 3-month lag, all features, average MAPE: 12.428


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0008, 6-month lag, all features, average MAPE: 14.665


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0012, 1-month lag, all features, average MAPE: 14.069


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0012, 3-month lag, all features, average MAPE: 14.993


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


non-ionic surfactant RM12/0012, 6-month lag, all features, average MAPE: 16.598
