In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from statsmodels.tsa.arima.model import ARIMA
from dateutil import parser
from joblib import dump

warnings.filterwarnings('ignore')

In [2]:
# Collet Data
data = pd.read_csv('dataset/dataset_2.csv')
# # Basic data check
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58824 entries, 0 to 58823
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   state   58824 non-null  object 
 1   item    58824 non-null  object 
 2   Unit    58824 non-null  object 
 3   Date    58824 non-null  object 
 4   Value   58824 non-null  float64
dtypes: float64(1), object(4)
memory usage: 2.2+ MB


In [3]:
# Convert date column to datetime
time_index = data['Date']
time_index = pd.Series([f"1-{i[5:]}-{i[:4]}" for i in time_index])
time_index = pd.to_datetime(time_index)
formatted_data = data.drop('Date', axis=1)
formatted_data['Date'] = time_index

# Basic data check
formatted_data.head()

Unnamed: 0,state,item,Unit,Value,Date
0,Nigeria,Agric eggs medium size,1 Dozen,328.844837,2016-01-01
1,Nigeria,Agric eggs medium size,1 Dozen,338.797432,2016-01-02
2,Nigeria,Agric eggs medium size,1 Dozen,350.917655,2016-01-03
3,Nigeria,Agric eggs medium size,1 Dozen,367.747957,2016-01-04
4,Nigeria,Agric eggs medium size,1 Dozen,370.006795,2016-01-05


In [4]:
# Assigning Desired Products & Time Range
allowed_products = ['Agric eggs medium size', 'Agric eggs(medium size price of one)', 'Beans brown,sold loose', 'Beans:white black eye. sold loose', 'Gari white,sold loose', 'Gari yellow,sold loose', 'Rice agric sold loose', 'Rice local sold loose', 'Rice Medium Grained']
time_range_for_data = [2016, 2017, 2018]

In [5]:
# Dividing Data
egg_data = formatted_data[formatted_data['item'].str.contains('Agric eggs', case=True, regex=False)]
bean_data = formatted_data[formatted_data['item'].str.contains('Beans', case=True, regex=False)]
gari_data = formatted_data[formatted_data['item'].str.contains('Gari ', case=True, regex=False)]
rice_data = formatted_data[formatted_data['item'].str.contains('Rice ', case=True, regex=False)]

In [6]:
# Data Table Reasining 
nig_egg = egg_data.drop([i for i, r in egg_data.iterrows() if r['state'] != "Nigeria"])
nig_bean = bean_data.drop([i for i, r in bean_data.iterrows() if r['state'] != "Nigeria"])
nig_garri = gari_data.drop([i for i, r in gari_data.iterrows() if r['state'] != "Nigeria"])
nig_rice = rice_data.drop([i for i, r in rice_data.iterrows() if r['state'] != "Nigeria"])

In [7]:
# A function to Create each Product Based data
def create_product_based_data(*args):
    data = []
    for i in args:
        dataframe = pd.DataFrame(
            {
                "Year": pd.DatetimeIndex(i['Date']).year,
                "Month": pd.DatetimeIndex(i['Date']).day, 
                "Product": i['item'],
                "Price": i['Value'],
                "Unit": i['Unit'],
            }
        )
        data.append(dataframe)
    return data

In [8]:
cleaned_data_nigeria = create_product_based_data(nig_egg, nig_bean, nig_garri, nig_rice)
cleaned_data_all = create_product_based_data(egg_data, bean_data, gari_data, rice_data)

In [9]:
def produce_data(cdn):
    for i in cdn:
        i['First Lag'] = i['Price'].shift(1)
        i['Second Lag'] = i['Price'].shift(2)
        i['Rolling Mean'] = i['Price'].rolling(window=3).mean()
        i.dropna(inplace=True)
    return cdn

In [10]:
model_data = produce_data(cleaned_data_all)


In [11]:

def linear_reg_mod(x_train, y_train, x_test, data_type):
    data_type = "eggs" if data_type == 0 else "beans" if data_type == 1 else "garri" if data_type == 2 else "rice"
    lin_reg = LinearRegression()
    lin_reg.fit(x_train, y_train)
    dump(lin_reg, f'models/{data_type}_linear_regression_model.joblib')
    lin_pred = lin_reg.predict(x_test)
    return lin_pred
    
def ridge_mod(x_train, y_train, x_test, data_type):
    data_type = "eggs" if data_type == 0 else "beans" if data_type == 1 else "garri" if data_type == 2 else "rice"
    ridge = Ridge(alpha=0.5)
    ridge.fit(x_train, y_train)
    dump(ridge, f'models/{data_type}_ridge_model.joblib')
    ridge_pred = ridge.predict(x_test)
    return ridge_pred


def pipeline_mod(x_train, y_train, x_test, data_type):
    data_type = "eggs" if data_type == 0 else "beans" if data_type == 1 else "garri" if data_type == 2 else "rice"
    pipeline = make_pipeline(PolynomialFeatures(2), LinearRegression())
    pipeline.fit(x_train, y_train)
    dump(pipeline, f'models/{data_type}_pipeline_model.joblib')
    pipe_pred = pipeline.predict(x_test)
    return pipe_pred
    

def ensembly_mod(x_train, y_train, x_test, data_type):
    data_type = "eggs" if data_type == 0 else "beans" if data_type == 1 else "garri" if data_type == 2 else "rice"
    
    # Define individual models (base models)
    base_models = [
        ('lr', LinearRegression()),
        ('rd', Ridge(alpha=0.4)),
        ('rf', RandomForestRegressor(n_estimators=100)),
        ('pp', make_pipeline(PolynomialFeatures(2), LinearRegression()))
    ]
    
    # Define the meta-model
    meta_model = SVR()

    # Create a stacking regressor
    stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

    # Train the stacking regressor
    stacking_regressor.fit(x_train, y_train)

    dump(stacking_regressor, f'models/{data_type}_ensemble_model.joblib')
    # Predict using the stacking regressor
    stacked_pred = stacking_regressor.predict(x_test)
    
    return stacked_pred
        

In [12]:
def trainer(i, param):
        return_data = {}
        x_train, x_test, y_train, y_test = param # x_train, x_test, y_train, y_test
        return_data[f'{i}'] = {
            "Linear Regression": linear_reg_mod(x_train, y_train, x_test, i), 
            "Ridge": ridge_mod(x_train, y_train, x_test, i), 
            "Pipeline(Polynomial Feautures & Linear Regression)": pipeline_mod(x_train, y_train, x_test, i), 
            "Ensembly of all Models": ensembly_mod(x_train, y_train, x_test, i)
        }
        return return_data

In [13]:
def _default(model, y_test):  
        mse = np.sqrt(mean_squared_error(y_test, model))
        data = {
            'Model Result': mse,
            'Mean Absolute Error:': f'{mean_absolute_error(y_test, model):.4e} ({mean_absolute_error(y_test, model)*100:3.3}%)',
            'Mean Squared Error' :f'{mse:.4e} ({mse/np.mean(model)*100:3.3}%)',
        }
        return data

In [14]:
def best_mod(model_scores:dict):
    model_scores = {i: model_scores[i]['Model Result'] for i in model_scores}
    best_model_name = min(model_scores, key=model_scores.get)
    best_model_score = model_scores[best_model_name]
    return best_model_name, best_model_score

In [15]:
def evaluater(data, data_id, y_test):
        models: dict = data[str(data_id)]
        results = {'default_scoring':{}}
        best_model = {'default_scoring': ''}
        for model_name, model in models.items():
            results['default_scoring'].update({model_name :_default(model, y_test)})
        best_model['default_scoring'] = best_mod(results['default_scoring'])
        return results, best_model

In [16]:

trained = []
evaluated = []
for i, data in enumerate(model_data):
    features = data[['First Lag', 'Second Lag', 'Rolling Mean', 'Year', 'Month']]
    target = data['Price']
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)
    param =  x_train, x_test, y_train, y_test
    trained_data = trainer(i, param)
    evaluated_result = evaluater(trained_data, i, param[3])
    trained.append(trained_data)
    evaluated.append(evaluated_result)
    # print(evaluated_result)
    # self.plotter(trained_data, i)
    

    

3
        First Lag  Second Lag  Rolling Mean  Year  Month
47146  308.823529  308.823529    308.823529  2017     11
47147  308.823529  308.823529    320.627855  2017     12
47148  344.236505  308.823529    331.853345  2018      1
47149  342.500000  344.236505    337.245502  2018      2
47150  325.000000  342.500000    317.123656  2018      3
...           ...         ...           ...   ...    ...
58063  326.143040  306.666667    322.033250  2018      8
58064  333.290043  326.143040    330.128488  2018      9
58065  330.952381  333.290043    300.303030  2018     10
58066  236.666667  330.952381    268.095238  2018     11
58067  236.666667  236.666667    240.299718  2018     12

[1094 rows x 5 columns]


In [19]:
from joblib import load


def get_model(model_name, data_type):
    models = ['ensemble_model', 'pipeline_model', 'ridge_model', 'linear_regression_model']
    if model_name in models:
        model = load(f'models/{data_type}_{model_name}.joblib')
        return model
    else:
        return None

In [20]:
# EggS
# predict_data = model_data[0][model_data[0]['Product'].str.contains('Agric eggs medium size', case=True, regex=False)]
# Beans
values = []
for data_type, i in enumerate(model_data):
    predict_data = i
    features = predict_data[['First Lag', 'Second Lag', 'Rolling Mean', 'Year', 'Month']]
    target = predict_data['Price']
    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)


    predict_model = get_model('linear_regression_model', f'{"eggs" if data_type == 0 else "beans" if data_type == 1 else "garri" if data_type == 2 else "rice"}')
    years = list(year for year in range(2024, 2027) for _ in range(12))
    months = list(month  for month in range(3) for month in range(1, 13))

    new_data = pd.DataFrame({   
        'First Lag': x_test['First Lag'][:36],
        'Second Lag': x_test['Second Lag'][:36],
        'Rolling Mean': x_test['Rolling Mean'][:36],
        'Year': years,
        'Month': months

    })

    predicted_values = list(round(i, 2) for i in predict_model.predict(new_data))

    values.append(predicted_values)
    predicted_values

In [None]:
def plot():
    plt.plot(values[0], label='Eggs', color='red', linestyle='-',)
    plt.plot(values[1], label='Beans', color='blue', linestyle='--',)
    plt.plot(values[2], label='Garri', color='black', linestyle='-.',)
    plt.plot(values[3], label='Rice', color='green', linestyle=':',)

    # Adding labels and title
    plt.title('Predicted Values Line Graph')
    plt.xlabel('Time Step')
    plt.ylabel('Predicted Value')
    plt.legend()  # Show the legend
    # Show the graph
    plt.show()