# Test_Acid

In [1]:
!pip install fredapi

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import preprocessor as pre
import visualiser as visual
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
import numpy as np
from matplotlib import pyplot as plt

## Preparations
# External drivers: three FRED series plus the electricity flat file, all
# requested with the same (2014, 2024) year arguments. The dict keys are the
# names later handed to pre.generate_features as keyword arguments.
fred_series_ids = ("PNGASEUUSDM", "PWHEAMTUSDM", "WPU0652013A")
external_drivers = {
    series_id: pre.get_Fred_data(series_id, 2014, 2024)
    for series_id in fred_series_ids
}
external_drivers["Electricity"] = pre.clean_elec_csv(
    'Data_flat_files/ELECTRICITY_03_2024.csv', 2014, 2024
)

# Individual handles on each driver frame.
gas_df = external_drivers["PNGASEUUSDM"]
wheat_df = external_drivers["PWHEAMTUSDM"]
ammonia_df = external_drivers["WPU0652013A"]
elec_df = external_drivers["Electricity"]

# Target price data (requested with year arguments 2014 and 2023).
df = pre.clean_pred_price_evo_csv(
    "Data_flat_files/Dataset_Future_Predicting_Price_Evolutions_202403.csv", 2014, 2023
)

target = 'acid'.lower()

RM_codes = ['RM01/0001', 'RM01/0004', 'RM01/0006', 'RM01/0007']

# Consecutive half-year test windows: each window runs from one boundary to
# the next, giving eight (start, end) pairs between 2020-01-01 and 2024-01-01.
period_edges = [
    '2020-01-01', '2020-07-01',
    '2021-01-01', '2021-07-01',
    '2022-01-01', '2022-07-01',
    '2023-01-01', '2023-07-01',
    '2024-01-01',
]
test_periods = list(zip(period_edges[:-1], period_edges[1:]))

# Lags evaluated by the model-training loops.
lags = [1, 3, 6]

In [3]:
# Impute raw data of target variables
imputed_df, missing = pre.impute_pred_price_evo_csv(df)

# Feature engineering
dummy_df = pre.get_dummies_and_average_price(imputed_df, target, *RM_codes)
feature_df = pre.generate_features(1, 12, dummy_df, *RM_codes, **external_drivers)

# Convert 'Time' only when it is not already a tz-naive datetime64 column.
# The check is made on the column's dtype (not on the Series class), so it is
# a real guard and does not depend on the datetime resolution (ns/us/...).
if not pd.api.types.is_datetime64_dtype(feature_df['Time']):
    feature_df['Time'] = pd.to_datetime(feature_df['Time'])

assert pd.api.types.is_datetime64_dtype(feature_df['Time']), "df[Time] is not dataetime64."

# Keep only the rows whose Year is 2016 or later.
feature_df = feature_df[feature_df.Year >= 2016]

In [4]:
def train_model_AR(raw_df:pd.DataFrame, code:str, lag:int, test_periods) -> float:
    """Fit a Lasso model on the ``AR_*`` features and return its test-window MAPE.

    Rows of ``raw_df`` where the ``code`` column equals True are split on the
    ``Time`` column: rows strictly before ``test_start`` form the training set,
    rows in ``[test_start, test_end)`` form the test set. Predictors are the
    columns whose names start with ``AR_``; when ``lag > 1`` every column whose
    name ends in ``_1`` ... ``_{lag-1}`` is removed first. Predictors and target
    are standardised with statistics of the training set only, the Lasso
    ``alpha`` is tuned with a 300-draw randomised search (5-fold CV) over 3000
    evenly spaced values in [1e-7, 1], and the tuned model is scored on the
    test set in the original price scale.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Feature table containing ``Time``, ``Average_price``, the ``code``
        indicator column and the ``AR_*`` columns.
    code : str
        Name of the indicator column selecting the rows to model.
    lag : int
        Smallest column suffix that is kept (presumably the forecast lag in
        months -- confirm against ``pre.generate_features``).
    test_periods : sequence of length 2
        ``(test_start, test_end)``; the end bound is exclusive.

    Returns
    -------
    float
        Mean absolute percentage error on the test window.

    Raises
    ------
    AssertionError
        If an argument check or an internal sanity check fails.
    ValueError
        If the split leaves too few training rows, no test rows, or no
        predictor columns.
    """
    assert not len(raw_df) == 0, "df is empty!"
    assert not len(code) == 0, "RM_codes are missed."
    assert isinstance(lag, int), "Time lag is missed."
    assert len(test_periods) == 2, "There should only be one test_start and one test_end."

    n_cv_folds = 5

    df = raw_df[raw_df[code]==True]
    test_start, test_end = test_periods
    # Split data into train and test sets
    train_df = df[df.Time < test_start]
    test_df = df[df.Time.between(test_start, test_end, inclusive = "left")]

    # Fail early with a readable message instead of an opaque scikit-learn error.
    if len(train_df) < n_cv_folds:
        raise ValueError(
            f"Only {len(train_df)} training rows for {code} before {test_start}; "
            f"at least {n_cv_folds} are needed for cross-validation."
        )
    if len(test_df) == 0:
        raise ValueError(f"No test rows for {code} between {test_start} and {test_end}.")

    X_train = train_df.filter(regex='^AR_')
    X_test = test_df.filter(regex='^AR_')

    if lag > 1:
        short_lag_suffixes = tuple(f"_{i}" for i in range(1, lag))
        short_lag_pattern = '|'.join(f"_{i}$" for i in range(1, lag))
        X_train = X_train.loc[:, ~X_train.columns.str.endswith(short_lag_suffixes)]
        X_test = X_test.loc[:, ~X_test.columns.str.endswith(short_lag_suffixes)]
        # Check the column NAMES: no column matching a removed suffix may remain.
        # (Inspecting the cell values would let an all-zero leaked column pass.)
        assert X_train.filter(regex=short_lag_pattern).shape[1] == 0, "X_train not filtered correctly"
        assert X_test.filter(regex=short_lag_pattern).shape[1] == 0, "X_test not filtered correctly"

    if X_train.shape[1] == 0:
        raise ValueError(f"No AR_ predictor columns left for {code} with lag {lag}.")

    y_train = train_df['Average_price'].values
    y_test = test_df['Average_price'].values

    # Standardisation: both scalers are fitted on the training data only.
    scaler_x = StandardScaler()
    X_train_scaled = scaler_x.fit_transform(X_train)
    X_test_scaled = scaler_x.transform(X_test)

    scaler_y = StandardScaler()
    # The scaler needs a 2-D column; the regressor gets the usual 1-D target.
    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1,1)).ravel()

    # Candidate regularisation strengths
    param_grid = {'alpha': np.linspace(0.0000001, 1, 3000)}
    # NOTE(review): an integer cv is documented to use consecutive, unshuffled
    # K-fold splits for regressors, which ignores the time ordering of the
    # rows; TimeSeriesSplit may be more appropriate here -- confirm.
    random_search = RandomizedSearchCV(estimator=Lasso(),
                                       param_distributions=param_grid,
                                       n_iter=300,
                                       cv=n_cv_folds,
                                       random_state=42)
    # Run the randomised hyper-parameter search on the training data
    random_search.fit(X_train_scaled, y_train_scaled)
    assert random_search.n_features_in_ == len(X_train.columns)

    # Predict with the best model and map predictions back to the price scale
    best_lasso_model = random_search.best_estimator_
    y_pred_test = best_lasso_model.predict(X_test_scaled)
    y_pred_test_inverse = scaler_y.inverse_transform(y_pred_test.reshape(-1,1))
    mape = mean_absolute_percentage_error(y_test,y_pred_test_inverse)
    return mape


In [5]:
# Average the AR-only model's MAPE over all test windows, separately for every
# raw-material code and lag, and print one summary line per combination.
for code in RM_codes:
    for lag in lags:
        mape_values = [
            train_model_AR(feature_df, code, lag, period)
            for period in test_periods
        ]
        average_mape = np.mean(mape_values)
        print(f"{target} {code}, {lag}-month lag average MAPE, AR: {average_mape:.3f}")

acid RM01/0001, 1-month lag average MAPE, AR: 0.057
acid RM01/0001, 3-month lag average MAPE, AR: 0.147
acid RM01/0001, 6-month lag average MAPE, AR: 0.266
acid RM01/0004, 1-month lag average MAPE, AR: 0.101
acid RM01/0004, 3-month lag average MAPE, AR: 0.162
acid RM01/0004, 6-month lag average MAPE, AR: 0.251
acid RM01/0006, 1-month lag average MAPE, AR: 0.109
acid RM01/0006, 3-month lag average MAPE, AR: 0.187
acid RM01/0006, 6-month lag average MAPE, AR: 0.264
acid RM01/0007, 1-month lag average MAPE, AR: 0.171
acid RM01/0007, 3-month lag average MAPE, AR: 0.237
acid RM01/0007, 6-month lag average MAPE, AR: 0.302


In [6]:
def train_model_all_features(raw_df:pd.DataFrame, code:str, lag:int, test_periods) -> float:
    """Fit a Lasso model on all available features and return its test-window MAPE.

    Rows of ``raw_df`` where the ``code`` column equals True are kept and every
    column whose name starts with ``RM`` is dropped. The rows are split on the
    ``Time`` column: rows strictly before ``test_start`` form the training set,
    rows in ``[test_start, test_end)`` form the test set. Predictors are all
    remaining columns except ``Time``, ``Group Description``, ``Year``,
    ``Month`` and ``Average_price``; when ``lag > 1`` every column whose name
    ends in ``_1`` ... ``_{lag-1}`` is removed as well. Predictors and target
    are standardised with statistics of the training set only, the Lasso
    ``alpha`` is tuned with a 300-draw randomised search (5-fold CV) over 3000
    evenly spaced values in [1e-7, 1], and the tuned model is scored on the
    test set in the original price scale.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Feature table containing the ``code`` indicator column, the five
        non-feature columns listed above and the predictor columns.
    code : str
        Name of the indicator column selecting the rows to model.
    lag : int
        Smallest column suffix that is kept (presumably the forecast lag in
        months -- confirm against ``pre.generate_features``).
    test_periods : sequence of length 2
        ``(test_start, test_end)``; the end bound is exclusive.

    Returns
    -------
    float
        Mean absolute percentage error on the test window.

    Raises
    ------
    AssertionError
        If an argument check or an internal sanity check fails.
    ValueError
        If the split leaves too few training rows, no test rows, or no
        predictor columns.
    KeyError
        If one of the non-feature columns to drop is missing.
    """
    assert not len(raw_df) == 0, "df is empty!"
    assert not len(code) == 0, "RM_codes are missed."
    assert isinstance(lag, int), "Time lag is missed."
    assert len(test_periods) == 2, "There should only be one test_start and one test_end."

    n_cv_folds = 5
    non_feature_columns = ['Time', 'Group Description', 'Year','Month','Average_price']

    df = raw_df[raw_df[code]==True]
    # The indicator columns are no longer needed once the rows are selected.
    df = df.loc[:,~df.columns.str.startswith("RM")]

    test_start, test_end = test_periods
    # Split data into train and test sets
    train_df = df[df.Time < test_start]
    test_df = df[df.Time.between(test_start, test_end, inclusive = "left")]

    # Fail early with a readable message instead of an opaque scikit-learn error.
    if len(train_df) < n_cv_folds:
        raise ValueError(
            f"Only {len(train_df)} training rows for {code} before {test_start}; "
            f"at least {n_cv_folds} are needed for cross-validation."
        )
    if len(test_df) == 0:
        raise ValueError(f"No test rows for {code} between {test_start} and {test_end}.")

    X_train = train_df.drop(columns=non_feature_columns)
    X_test = test_df.drop(columns=non_feature_columns)

    if lag > 1:
        short_lag_suffixes = tuple(f"_{i}" for i in range(1, lag))
        short_lag_pattern = '|'.join(f"_{i}$" for i in range(1, lag))
        X_train = X_train.loc[:, ~X_train.columns.str.endswith(short_lag_suffixes)]
        X_test = X_test.loc[:, ~X_test.columns.str.endswith(short_lag_suffixes)]
        # Check the column NAMES: no column matching a removed suffix may remain.
        # (Inspecting the cell values would let an all-zero leaked column pass.)
        assert X_train.filter(regex=short_lag_pattern).shape[1] == 0, "X_train not filtered correctly"
        assert X_test.filter(regex=short_lag_pattern).shape[1] == 0, "X_test not filtered correctly"

    if X_train.shape[1] == 0:
        raise ValueError(f"No predictor columns left for {code} with lag {lag}.")

    y_train = train_df['Average_price'].values
    y_test = test_df['Average_price'].values

    # Standardisation: both scalers are fitted on the training data only.
    scaler_x = StandardScaler()
    X_train_scaled = scaler_x.fit_transform(X_train)
    X_test_scaled = scaler_x.transform(X_test)

    scaler_y = StandardScaler()
    # The scaler needs a 2-D column; the regressor gets the usual 1-D target.
    y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1,1)).ravel()

    # Candidate regularisation strengths
    param_grid = {'alpha': np.linspace(0.0000001, 1, 3000)}
    # NOTE(review): an integer cv is documented to use consecutive, unshuffled
    # K-fold splits for regressors, which ignores the time ordering of the
    # rows; TimeSeriesSplit may be more appropriate here -- confirm.
    random_search = RandomizedSearchCV(estimator=Lasso(),
                                       param_distributions=param_grid,
                                       n_iter=300,
                                       cv=n_cv_folds,
                                       random_state=42)
    # Run the randomised hyper-parameter search on the training data
    random_search.fit(X_train_scaled, y_train_scaled)
    assert random_search.n_features_in_ == len(X_train.columns)

    # Predict with the best model and map predictions back to the price scale
    best_lasso_model = random_search.best_estimator_
    y_pred_test = best_lasso_model.predict(X_test_scaled)
    y_pred_test_inverse = scaler_y.inverse_transform(y_pred_test.reshape(-1,1))
    mape = mean_absolute_percentage_error(y_test,y_pred_test_inverse)
    return mape


In [7]:
# Average the all-features model's MAPE over all test windows, separately for
# every raw-material code and lag, and print one summary line per combination.
for code in RM_codes:
    for lag in lags:
        mape_values = [
            train_model_all_features(feature_df, code, lag, period)
            for period in test_periods
        ]
        average_mape = np.mean(mape_values)
        print(f"{target} {code}, {lag}-month lag average MAPE, all: {average_mape:.3f}")

acid RM01/0001, 1-month lag average MAPE, all: 0.058
acid RM01/0001, 3-month lag average MAPE, all: 0.112
acid RM01/0001, 6-month lag average MAPE, all: 0.212
acid RM01/0004, 1-month lag average MAPE, all: 0.126
acid RM01/0004, 3-month lag average MAPE, all: 0.203
acid RM01/0004, 6-month lag average MAPE, all: 0.338
acid RM01/0006, 1-month lag average MAPE, all: 0.155
acid RM01/0006, 3-month lag average MAPE, all: 0.192
acid RM01/0006, 6-month lag average MAPE, all: 0.217
acid RM01/0007, 1-month lag average MAPE, all: 0.166
acid RM01/0007, 3-month lag average MAPE, all: 0.189
acid RM01/0007, 6-month lag average MAPE, all: 0.187
