# forecasting selling price of chicken with Bins

In [1]:
# libraries
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

!pip install category_encoders
import category_encoders as ce
from category_encoders import TargetEncoder




In [2]:
# reading file
df = pd.read_csv('/workspaces/forecasting/data/data.csv')
df.head()

Unnamed: 0,DAY_DATE,UNIT,TAILS,KG,SALES PER KG,TOTAL SALES,ABW,CHICKEN SIZE,PROVINCE
0,2017-01-01,LOMBOK,6237,11356.5,22000.0,249843000,1.820827,MEDIUM,NUSA TENGGARA BARAT
1,2017-01-01,GARUT,2625,5446.8,17000.0,92595600,2.074971,LARGE,JAWA BARAT
2,2017-01-01,SUMEDANG,7788,14176.8,16106.05355,228332300,1.820339,MEDIUM,JAWA BARAT
3,2017-01-01,CIREBON,7504,15958.0,17000.0,271286000,2.126599,LARGE,JAWA BARAT
4,2017-01-01,PEKALONGAN,9121,16250.0,16500.0,268125000,1.781603,MEDIUM,JAWA TENGAH


# cleaning

In [3]:
# removing outlier
df = df[df['SALES PER KG'] <= 35000]

# removing 'total sales' column
df = df.drop('TOTAL SALES', axis = 1)

# creating bins for prediction

# using sturges to print out the number of bins
sturges = int(np.ceil(1 + np.log2(len(df['SALES PER KG']))))
print(sturges)

max_value = 35000
bin_width = int((max_value - 0) // sturges)

# Correctly calculate 'bin_edges'
# Ensure 'bin_width' is added to 'max_value' to include the upper edge
bin_edges = np.arange(0, max_value + bin_width, bin_width)

# Use pd.cut to bin the data
df['price_bin'] = pd.cut(df['SALES PER KG'], bins=bin_edges, labels=False, right=False)

# Add 1 to change bins from 0-19 to 1-20
df['price_bin'] += 1

# Display the first few rows to verify
print(df.head())

# dropping sales per kg column as we already have price_bin
df = df.drop('SALES PER KG', axis = 1)

# dropping ABW 
df = df.drop('ABW', axis = 1)


20
     DAY_DATE        UNIT  TAILS       KG  SALES PER KG       ABW  \
0  2017-01-01      LOMBOK   6237  11356.5   22000.00000  1.820827   
1  2017-01-01       GARUT   2625   5446.8   17000.00000  2.074971   
2  2017-01-01    SUMEDANG   7788  14176.8   16106.05355  1.820339   
3  2017-01-01     CIREBON   7504  15958.0   17000.00000  2.126599   
4  2017-01-01  PEKALONGAN   9121  16250.0   16500.00000  1.781603   

  CHICKEN SIZE             PROVINCE  price_bin  
0       MEDIUM  NUSA TENGGARA BARAT         13  
1        LARGE           JAWA BARAT         10  
2       MEDIUM           JAWA BARAT         10  
3        LARGE           JAWA BARAT         10  
4       MEDIUM          JAWA TENGAH         10  


## pre processing
- extracting date column
- splitting, scaling
- creating function to evaluate the model [call `results(y_test, predictions)`]


In [4]:
# extracting year and month from date column for seasonality trends
df['DAY_DATE'] = pd.to_datetime(df['DAY_DATE'], errors='coerce').dt.normalize()
df['YEAR'] = df['DAY_DATE'].dt.year
df['MONTH'] = df['DAY_DATE'].dt.month
df['DAY'] = df['DAY_DATE'].dt.day
df = df.drop('DAY_DATE', axis = 1)


In [5]:
df

Unnamed: 0,UNIT,TAILS,KG,CHICKEN SIZE,PROVINCE,price_bin,YEAR,MONTH,DAY
0,LOMBOK,6237,11356.5,MEDIUM,NUSA TENGGARA BARAT,13,2017,1,1
1,GARUT,2625,5446.8,LARGE,JAWA BARAT,10,2017,1,1
2,SUMEDANG,7788,14176.8,MEDIUM,JAWA BARAT,10,2017,1,1
3,CIREBON,7504,15958.0,LARGE,JAWA BARAT,10,2017,1,1
4,PEKALONGAN,9121,16250.0,MEDIUM,JAWA TENGAH,10,2017,1,1
...,...,...,...,...,...,...,...,...,...
382897,SRAGEN,510,618.4,SMALL,JAWA TENGAH,10,2024,1,27
382898,SLEMAN,48,63.2,SMALL,DIY YOGYAKARTA,7,2024,1,28
382899,SRAGEN,315,415.6,SMALL,JAWA TENGAH,10,2024,1,28
382900,BOYOLALI,785,974.8,SMALL,JAWA TENGAH,10,2024,1,28


## demand and supply
importing the demand and supply dataset

In [6]:
# filtering for rows for year 2019 to 2023
df = df[(df['YEAR'] >= 2019) & (df['YEAR'] <= 2023)]

# removine rows that are earlier than december 2019
df = df[(df['YEAR'] >= 2019) & (df['MONTH'] >= 12)]


In [13]:
df

Unnamed: 0,UNIT,TAILS,KG,CHICKEN SIZE,PROVINCE,price_bin,YEAR,MONTH,DAY,MONTH_YEAR
165114,PAYAKUMBUH,1267,2856.0,LARGE,SUMATERA BARAT,11,2019,12,1,2019 12
165115,SINJAI,3310,7799.5,LARGE,SULAWESI SELATAN,13,2019,12,1,2019 12
165116,BANDAR JAYA,8236,22197.0,LARGE,LAMPUNG,10,2019,12,1,2019 12
165117,BANDAR LAMPUNG,2565,4971.0,MEDIUM,LAMPUNG,11,2019,12,1,2019 12
165118,BANYUASIN,696,1464.4,LARGE,SUMATERA SELATAN,10,2019,12,1,2019 12
...,...,...,...,...,...,...,...,...,...,...
382498,BOYOLALI,430,901.4,BIG,JAWA TENGAH,10,2023,12,31,2023 12
382499,BOYOLALI,294,605.0,BIG,JAWA TENGAH,10,2023,12,31,2023 12
382500,BOYOLALI,432,962.4,BIG,JAWA TENGAH,10,2023,12,31,2023 12
382501,GUNUNGKIDUL,200,370.8,MEDIUM,DIY YOGYAKARTA,10,2023,12,31,2023 12


In [22]:
# importing demand and supply dataset
df_demand_supply = pd.read_csv('/workspaces/forecasting/data/provinceDD&SS.csv')
df_demand_supply

# splitting the month_year column into separate month and year columns
df_demand_supply[['MONTH', 'YEAR']] = df_demand_supply['Month_Year'].str.split(' ', expand = True)

# mapping month to numerical values
month_num = {
    'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4, 'May' : 5, 'Jun' : 6, 'Jul' : 7, 
    'Aug' : 8, 'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12
}

# replacing month with numerical values
df_demand_supply['MONTH'] = df_demand_supply['MONTH'].replace(month_num)
df_demand_supply['YEAR'] = df_demand_supply['YEAR'].astype(int)

# dropping original month_year column
df_demand_supply.drop('Month_Year', axis = 1, inplace = True)
df_demand_supply

  df_demand_supply['MONTH'] = df_demand_supply['MONTH'].replace(month_num)


Unnamed: 0,PROVINCE,SupplyProvince,DemandProvince,MONTH,YEAR
0,ACEH,2687246,3930723,12,2019
1,BALI,5120529,5141045,12,2019
2,BANTEN,16349469,13433846,12,2019
3,BENGKULU,554903,1339242,12,2019
4,DI YOGYAKARTA,4206148,6313628,12,2019
...,...,...,...,...,...
1723,SULAWESI TENGGARA,104238,705272,12,2023
1724,SULAWESI UTARA,396941,1307670,12,2023
1725,SUMATERA BARAT,3689089,4922821,12,2023
1726,SUMATERA SELATAN,5467836,5767260,12,2023


In [24]:
# merging the demand and supply dataset with the main dataset
df = pd.merge(df, df_demand_supply, how = 'left', on = ['YEAR', 'MONTH', 'PROVINCE'])
df

Unnamed: 0,UNIT,TAILS,KG,CHICKEN SIZE,PROVINCE,price_bin,YEAR,MONTH,DAY,MONTH_YEAR,SupplyProvince,DemandProvince
0,PAYAKUMBUH,1267,2856.0,LARGE,SUMATERA BARAT,11,2019,12,1,2019 12,3885216.0,5993194.0
1,SINJAI,3310,7799.5,LARGE,SULAWESI SELATAN,13,2019,12,1,2019 12,5447863.0,5931514.0
2,BANDAR JAYA,8236,22197.0,LARGE,LAMPUNG,10,2019,12,1,2019 12,5404511.0,5618463.0
3,BANDAR LAMPUNG,2565,4971.0,MEDIUM,LAMPUNG,11,2019,12,1,2019 12,5404511.0,5618463.0
4,BANYUASIN,696,1464.4,LARGE,SUMATERA SELATAN,10,2019,12,1,2019 12,5758528.0,7021239.0
...,...,...,...,...,...,...,...,...,...,...,...,...
17766,BOYOLALI,430,901.4,BIG,JAWA TENGAH,10,2023,12,31,2023 12,54958385.0,26285612.0
17767,BOYOLALI,294,605.0,BIG,JAWA TENGAH,10,2023,12,31,2023 12,54958385.0,26285612.0
17768,BOYOLALI,432,962.4,BIG,JAWA TENGAH,10,2023,12,31,2023 12,54958385.0,26285612.0
17769,GUNUNGKIDUL,200,370.8,MEDIUM,DIY YOGYAKARTA,10,2023,12,31,2023 12,,


In [5]:
# train test split before label encoding to prevent data leakage


X = df.drop('price_bin', axis=1)
y = df['price_bin']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scaling numerical variable

scaler = StandardScaler()
columns_to_scale = ['KG', 'TAILS']
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [6]:
# creating function to evaluate the model
def results(y_test, predictions):
  mae = mean_absolute_error(y_test, predictions)
  mse = mean_squared_error(y_test, predictions)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, predictions)

  print(f"Mean Absolute Error (MAE): {mae}")
  print(f"Mean Squared Error (MSE): {mse}")
  print(f"Root Mean Squared Error (RMSE): {rmse}")
  print(f"R-squared (R²): {r2}")


# encoding

In [7]:
# encoding of columns
# label encoding for province

label_encoder = LabelEncoder()
X_train['SIZE ENCODE'] = label_encoder.fit_transform(X_train['CHICKEN SIZE'])
X_test['SIZE ENCODE'] = label_encoder.transform(X_test['CHICKEN SIZE'])

# dropping chicken size column
X_train = X_train.drop(['CHICKEN SIZE'], axis = 1)
X_test = X_test.drop(['CHICKEN SIZE'], axis = 1)


# one hot encoding for province

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[['PROVINCE']])
X_train_encoded = encoder.transform(X_train[['PROVINCE']])
X_test_encoded = encoder.transform(X_test[['PROVINCE']])

# Convert the encoded matrices back to dataframes for easier manipulation
columns = encoder.get_feature_names_out(['PROVINCE'])
X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=columns, index=X_test.index)

# merge back the dataframe
X_train = pd.concat([X_train.drop('PROVINCE', axis=1), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop('PROVINCE', axis=1), X_test_encoded_df], axis=1)


# one hot encoding for unit
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[['UNIT']])
X_train_encoded_unit = encoder.transform(X_train[['UNIT']])
X_test_encoded_unit = encoder.transform(X_test[['UNIT']])

# Convert the encoded matrices back to dataframes for easier manipulation
columns = encoder.get_feature_names_out(['UNIT'])
X_train_encoded_df_unit = pd.DataFrame(X_train_encoded_unit.toarray(), columns=columns, index=X_train.index)
X_test_encoded_df_unit = pd.DataFrame(X_test_encoded_unit.toarray(), columns=columns, index=X_test.index)

# merge back the dataframe, dropping unit column
X_train = pd.concat([X_train.drop('UNIT', axis=1), X_train_encoded_df_unit], axis=1)
X_test = pd.concat([X_test.drop('UNIT', axis=1), X_test_encoded_df_unit], axis=1)




In [8]:
print(X_train)

           TAILS        KG  YEAR  MONTH  DAY  SIZE ENCODE  PROVINCE_ACEH  \
276802  0.456661  0.511789  2021     10   30            2            0.0   
126530 -0.144031 -0.129820  2019      4   12            2            0.0   
298428  1.224147  1.070229  2022      2   15            2            0.0   
133120  1.104237  0.617678  2019      5   23            3            0.0   
115983  0.122580 -0.032514  2019      2    9            2            0.0   
...          ...       ...   ...    ...  ...          ...            ...   
259179  4.653331  6.017824  2021      7   12            1            0.0   
365839 -0.889626 -0.816256  2023      3    6            1            0.0   
131933  0.108858  0.166973  2019      5   16            2            0.0   
146868  0.396052  0.445115  2019      8   16            2            0.0   
121959 -0.440374 -0.431000  2019      3   17            2            0.0   

        PROVINCE_BALI  PROVINCE_BANTEN  PROVINCE_BENGKULU  ...  \
276802            0.0

# trying lazy predict

In [9]:
pip install lazypredict-nightly

Note: you may need to restart the kernel to use updated packages.


In [10]:
# libraries
import lazypredict
from lazypredict.Supervised import LazyRegressor
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin


In [11]:
chosen_regressors = [
    'SVR', 
    'BaggingRegressor',
    'ExtraTreesRegressor',
    'RandomForestRegressor',
    'GradientBoostingRegressor',
    'LGBMRegressor',
    'XGBRegressor',
    'CatBoostRegressor',
    'HistGradientBoostingRegressor',
    'AdaBoostRegressor',
    'KNeighborsRegressor',
    'DecisionTreeRegressor'
]

REGRESSORS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and est[0] in chosen_regressors)

]

# initialising the regressor with chosen regressors
reg = LazyRegressor(verbose=1, ignore_warnings=False, custom_metric=None, regressors=REGRESSORS)


In [12]:

# fitting and evaluate models
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

'tuple' object has no attribute '__name__'
Invalid Regressor(s)


 11%|█         | 1/9 [00:51<06:54, 51.80s/it]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.11606127578576719, 'Adjusted R-Squared': 0.11417580511763159, 'RMSE': 1.7841519916179764, 'Time taken': 51.79626154899597}


 22%|██▏       | 2/9 [01:23<04:41, 40.18s/it]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8592760698242314, 'Adjusted R-Squared': 0.8589759010055308, 'RMSE': 0.711876648050294, 'Time taken': 32.048752307891846}


 33%|███▎      | 3/9 [01:29<02:26, 24.47s/it]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.7718515540599026, 'Adjusted R-Squared': 0.7713649058443454, 'RMSE': 0.9064200977523507, 'Time taken': 5.764910697937012}


 44%|████▍     | 4/9 [06:13<10:34, 126.82s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.8712144067070346, 'Adjusted R-Squared': 0.8709397027575633, 'RMSE': 0.6810114447358567, 'Time taken': 283.73812317848206}


 56%|█████▌    | 5/9 [07:34<07:21, 110.46s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.3870940196260765, 'Adjusted R-Squared': 0.38578667080577544, 'RMSE': 1.4856540748538896, 'Time taken': 81.44074416160583}


 67%|██████▋   | 6/9 [07:54<03:59, 79.70s/it] 

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.6394577531386746, 'Adjusted R-Squared': 0.6386887045468901, 'RMSE': 1.1394593841542493, 'Time taken': 19.991281509399414}


 78%|███████▊  | 7/9 [10:05<03:12, 96.39s/it]

{'Model': 'KNeighborsRegressor', 'R-Squared': 0.5405078785961179, 'Adjusted R-Squared': 0.5395277666342662, 'RMSE': 1.286351933440563, 'Time taken': 130.75358748435974}


 89%|████████▉ | 8/9 [15:13<02:43, 163.65s/it]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.8723145708552076, 'Adjusted R-Squared': 0.8720422135924178, 'RMSE': 0.6780964007877455, 'Time taken': 307.66624999046326}
