# Forecasting the selling price of chicken with price bins

In [1]:
# libraries
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

!pip install category_encoders
import category_encoders as ce
from category_encoders import TargetEncoder









In [2]:
# load the dataset from disk and preview its first rows
data_path = '/workspaces/forecasting/data/data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,DAY_DATE,UNIT,TAILS,KG,SALES PER KG,TOTAL SALES,ABW,CHICKEN SIZE,PROVINCE
0,2017-01-01,LOMBOK,6237,11356.5,22000.0,249843000,1.820827,MEDIUM,NUSA TENGGARA BARAT
1,2017-01-01,GARUT,2625,5446.8,17000.0,92595600,2.074971,LARGE,JAWA BARAT
2,2017-01-01,SUMEDANG,7788,14176.8,16106.05355,228332300,1.820339,MEDIUM,JAWA BARAT
3,2017-01-01,CIREBON,7504,15958.0,17000.0,271286000,2.126599,LARGE,JAWA BARAT
4,2017-01-01,PEKALONGAN,9121,16250.0,16500.0,268125000,1.781603,MEDIUM,JAWA TENGAH


# cleaning

In [3]:
# removing outlier
# max_value is the single source of truth for the price ceiling: it is both
# the outlier cut-off and the upper edge of the last bin
max_value = 35000
df = df[df['SALES PER KG'] <= max_value]

# removing 'total sales' column
df = df.drop('TOTAL SALES', axis = 1)

# creating bins for prediction

# using sturges to print out the number of bins
sturges = int(np.ceil(1 + np.log2(len(df['SALES PER KG']))))
print(sturges)

# np.linspace gives exactly `sturges` equal-width bins over [0, max_value],
# even when max_value is not an exact multiple of the bin count
bin_edges = np.linspace(0, max_value, sturges + 1)
bin_width = bin_edges[1] - bin_edges[0]

# The bins are left-closed (right=False), so a price exactly equal to
# max_value (which the filter above keeps) would fall outside the last bin
# and be labelled NaN. Nudge the top edge up by one float step so that such
# rows land in the last bin instead.
bin_edges[-1] = np.nextafter(bin_edges[-1], np.inf)

# Use pd.cut to bin the data; labels=False returns 0-based integer bin ids
df['price_bin'] = pd.cut(df['SALES PER KG'], bins=bin_edges, labels=False, right=False)

# Add 1 so bin ids run from 1 to `sturges` instead of 0 to `sturges` - 1
df['price_bin'] += 1

# Display the first few rows to verify
print(df.head())

# dropping sales per kg column as we already have price_bin
df = df.drop('SALES PER KG', axis = 1)

# dropping ABW 
df = df.drop('ABW', axis = 1)


20
     DAY_DATE        UNIT  TAILS       KG  SALES PER KG       ABW  \
0  2017-01-01      LOMBOK   6237  11356.5   22000.00000  1.820827   
1  2017-01-01       GARUT   2625   5446.8   17000.00000  2.074971   
2  2017-01-01    SUMEDANG   7788  14176.8   16106.05355  1.820339   
3  2017-01-01     CIREBON   7504  15958.0   17000.00000  2.126599   
4  2017-01-01  PEKALONGAN   9121  16250.0   16500.00000  1.781603   

  CHICKEN SIZE             PROVINCE  price_bin  
0       MEDIUM  NUSA TENGGARA BARAT         13  
1        LARGE           JAWA BARAT         10  
2       MEDIUM           JAWA BARAT         10  
3        LARGE           JAWA BARAT         10  
4       MEDIUM          JAWA TENGAH         10  


## Preprocessing
- extracting date column
- splitting, scaling
- creating function to evaluate the model [call `results(y_test, predictions)`]


In [4]:
# derive year / month / day features from the date column for seasonality trends;
# unparseable dates are coerced to NaT, and the raw date column is dropped afterwards
day_dates = pd.to_datetime(df['DAY_DATE'], errors='coerce').dt.normalize()
df = (
    df.assign(
        YEAR=day_dates.dt.year,
        MONTH=day_dates.dt.month,
        DAY=day_dates.dt.day,
    )
    .drop('DAY_DATE', axis = 1)
)


In [5]:
# train test split before label encoding to prevent data leakage
# NOTE(review): this is a random 80/20 split; for a forecasting task a
# chronological split may be more appropriate — confirm this is intended

X = df.drop('price_bin', axis=1)
y = df['price_bin']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies before modifying the feature frames: assigning columns
# into the frames returned by train_test_split can trigger pandas'
# SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# scaling numerical variable
# the scaler is fitted on the training rows only and then applied to the test rows
scaler = StandardScaler()
columns_to_scale = ['KG', 'TAILS']
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [6]:
# creating function to evaluate the model
def results(y_test, predictions):
  """Print MAE, MSE, RMSE and R-squared for a set of predictions.

  Parameters:
    y_test: true target values.
    predictions: predicted values, aligned with `y_test`.

  Returns nothing; the four metrics are written to standard output.
  """
  abs_err = mean_absolute_error(y_test, predictions)
  sq_err = mean_squared_error(y_test, predictions)
  # RMSE is the square root of the MSE computed above
  root_sq_err = np.sqrt(sq_err)
  r_squared = r2_score(y_test, predictions)

  print(f"Mean Absolute Error (MAE): {abs_err}")
  print(f"Mean Squared Error (MSE): {sq_err}")
  print(f"Root Mean Squared Error (RMSE): {root_sq_err}")
  print(f"R-squared (R²): {r_squared}")


# encoding

In [7]:
# encoding of columns

def one_hot_encode_column(train, test, column):
  """One-hot encode `column`, fitting the encoder on `train` only.

  Parameters:
    train: training feature DataFrame containing `column`.
    test: test feature DataFrame containing `column`.
    column: name of the categorical column to encode.

  Returns:
    (train_out, test_out, fitted_encoder). In both output frames `column`
    is removed and the dense one-hot columns are appended at the end, keeping
    the original row index. Categories that were not seen in `train` are
    encoded as all zeros (handle_unknown='ignore').
  """
  fitted = OneHotEncoder(handle_unknown='ignore')
  fitted.fit(train[[column]])
  names = fitted.get_feature_names_out([column])

  expanded = []
  for frame in (train, test):
    # Convert the encoded matrix back to a dataframe for easier manipulation
    dummies = pd.DataFrame(fitted.transform(frame[[column]]).toarray(), columns=names, index=frame.index)
    # merge back the dataframe, dropping the raw categorical column
    expanded.append(pd.concat([frame.drop(column, axis=1), dummies], axis=1))
  return expanded[0], expanded[1], fitted


# label encoding for chicken size (fitted on the training split only)
# NOTE(review): LabelEncoder assigns integer codes in sorted label order and
# raises on labels it did not see during fit — confirm this is acceptable
# for the size categories.
# assign + drop builds new frames rather than writing into the split frames
label_encoder = LabelEncoder()
X_train = X_train.assign(
    **{'SIZE ENCODE': label_encoder.fit_transform(X_train['CHICKEN SIZE'])}
).drop(['CHICKEN SIZE'], axis = 1)
X_test = X_test.assign(
    **{'SIZE ENCODE': label_encoder.transform(X_test['CHICKEN SIZE'])}
).drop(['CHICKEN SIZE'], axis = 1)


# one hot encoding for province
X_train, X_test, province_encoder = one_hot_encode_column(X_train, X_test, 'PROVINCE')


# one hot encoding for unit
X_train, X_test, unit_encoder = one_hot_encode_column(X_train, X_test, 'UNIT')

# keep the generic name `encoder` bound to the UNIT encoder (the last one fitted)
encoder = unit_encoder




In [8]:
# inspect the encoded training features; pandas abbreviates the printed frame with "..."
print(X_train)

           TAILS        KG  YEAR  MONTH  DAY  SIZE ENCODE  PROVINCE_ACEH  \
276802  0.456661  0.511789  2021     10   30            2            0.0   
126530 -0.144031 -0.129820  2019      4   12            2            0.0   
298428  1.224147  1.070229  2022      2   15            2            0.0   
133120  1.104237  0.617678  2019      5   23            3            0.0   
115983  0.122580 -0.032514  2019      2    9            2            0.0   
...          ...       ...   ...    ...  ...          ...            ...   
259179  4.653331  6.017824  2021      7   12            1            0.0   
365839 -0.889626 -0.816256  2023      3    6            1            0.0   
131933  0.108858  0.166973  2019      5   16            2            0.0   
146868  0.396052  0.445115  2019      8   16            2            0.0   
121959 -0.440374 -0.431000  2019      3   17            2            0.0   

        PROVINCE_BALI  PROVINCE_BANTEN  PROVINCE_BENGKULU  ...  \
276802            0.0

# trying lazy predict

In [9]:
pip install lazypredict-nightly

Note: you may need to restart the kernel to use updated packages.


In [10]:
# libraries
# imported here because lazypredict is installed by the cell just above
import lazypredict
from lazypredict.Supervised import LazyRegressor
# display the REGRESSORS registry: a list of (name, estimator class) pairs
lazypredict.Supervised.REGRESSORS


[('AdaBoostRegressor', sklearn.ensemble._weight_boosting.AdaBoostRegressor),
 ('BaggingRegressor', sklearn.ensemble._bagging.BaggingRegressor),
 ('BayesianRidge', sklearn.linear_model._bayes.BayesianRidge),
 ('DecisionTreeRegressor', sklearn.tree._classes.DecisionTreeRegressor),
 ('DummyRegressor', sklearn.dummy.DummyRegressor),
 ('ElasticNet', sklearn.linear_model._coordinate_descent.ElasticNet),
 ('ElasticNetCV', sklearn.linear_model._coordinate_descent.ElasticNetCV),
 ('ExtraTreeRegressor', sklearn.tree._classes.ExtraTreeRegressor),
 ('ExtraTreesRegressor', sklearn.ensemble._forest.ExtraTreesRegressor),
 ('GammaRegressor', sklearn.linear_model._glm.glm.GammaRegressor),
 ('GaussianProcessRegressor',
  sklearn.gaussian_process._gpr.GaussianProcessRegressor),
 ('GradientBoostingRegressor', sklearn.ensemble._gb.GradientBoostingRegressor),
 ('HistGradientBoostingRegressor',
  sklearn.ensemble._hist_gradient_boosting.gradient_boosting.HistGradientBoostingRegressor),
 ('HuberRegressor', sk

In [11]:
# initialising regressor
# no custom metric is supplied; verbose=10 and ignore_warnings=True are passed to LazyRegressor
reg = LazyRegressor(verbose=10, ignore_warnings=True, custom_metric=None)

# fitting and evaluate models
# NOTE(review): the target is the 1-based price bin id, so reported errors are
# presumably in bin units rather than currency — confirm this is intended
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
# last expression of the cell: display the `models` object returned by fit
models

  0%|          | 0/42 [00:00<?, ?it/s]

'ExtraTreesRegressor', 
'BaggingRegressor', 
'svr'
