# forecasting selling price of chicken with Bins

In [3]:
# libraries
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

!pip install category_encoders
import category_encoders as ce
from category_encoders import TargetEncoder






In [4]:
# reading file
# Load the raw sales records from CSV into `df`.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs where that exact location exists.
df = pd.read_csv('/workspaces/forecasting/data/data.csv')
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,DAY_DATE,UNIT,TAILS,KG,SALES PER KG,TOTAL SALES,ABW,CHICKEN SIZE,PROVINCE
0,2017-01-01,LOMBOK,6237,11356.5,22000.0,249843000,1.820827,MEDIUM,NUSA TENGGARA BARAT
1,2017-01-01,GARUT,2625,5446.8,17000.0,92595600,2.074971,LARGE,JAWA BARAT
2,2017-01-01,SUMEDANG,7788,14176.8,16106.05355,228332300,1.820339,MEDIUM,JAWA BARAT
3,2017-01-01,CIREBON,7504,15958.0,17000.0,271286000,2.126599,LARGE,JAWA BARAT
4,2017-01-01,PEKALONGAN,9121,16250.0,16500.0,268125000,1.781603,MEDIUM,JAWA TENGAH


# cleaning

In [5]:
# removing outlier
# Prices above this cap are dropped as outliers. The same value is the upper
# limit of the binning range below, so it is defined once and reused.
max_value = 35000
df = df[df['SALES PER KG'] <= max_value]

# removing 'total sales' column
df = df.drop('TOTAL SALES', axis = 1)

# creating bins for prediction

# using sturges to print out the number of bins
sturges = int(np.ceil(1 + np.log2(len(df['SALES PER KG']))))
print(sturges)

bin_width = int((max_value - 0) // sturges)

# Edges start at 0 and advance by 'bin_width' until 'max_value' is covered.
# 'bin_width' is added to the stop value because np.arange excludes its stop.
bin_edges = np.arange(0, max_value + bin_width, bin_width).astype(float)

# The bins are left-closed / right-open (right=False). When 'bin_width' divides
# 'max_value' evenly, the last edge equals 'max_value', and a price exactly at
# the cap (which the filter above keeps) would fall outside every bin and get
# NaN. Opening the last bin upward puts such rows in the top bin; every price
# is already <= max_value, so no other row changes bin.
bin_edges[-1] = np.inf

# Use pd.cut to bin the data; labels=False returns the 0-based bin index
df['price_bin'] = pd.cut(df['SALES PER KG'], bins=bin_edges, labels=False, right=False)

# Add 1 so bin numbers start at 1 instead of 0
df['price_bin'] += 1

# Display the first few rows to verify
print(df.head())

# dropping sales per kg column as we already have price_bin
df = df.drop('SALES PER KG', axis = 1)


20
     DAY_DATE        UNIT  TAILS       KG  SALES PER KG       ABW  \
0  2017-01-01      LOMBOK   6237  11356.5   22000.00000  1.820827   
1  2017-01-01       GARUT   2625   5446.8   17000.00000  2.074971   
2  2017-01-01    SUMEDANG   7788  14176.8   16106.05355  1.820339   
3  2017-01-01     CIREBON   7504  15958.0   17000.00000  2.126599   
4  2017-01-01  PEKALONGAN   9121  16250.0   16500.00000  1.781603   

  CHICKEN SIZE             PROVINCE  price_bin  
0       MEDIUM  NUSA TENGGARA BARAT         13  
1        LARGE           JAWA BARAT         10  
2       MEDIUM           JAWA BARAT         10  
3        LARGE           JAWA BARAT         10  
4       MEDIUM          JAWA TENGAH         10  


## pre processing
- extracting date column
- splitting, scaling
- creating function to evaluate the model [call `results(y_test, predictions)`]


In [6]:
# Derive calendar features from the sale date so seasonality can be modelled.
# errors='coerce' turns unparseable dates into NaT; normalize() sets the time
# component to midnight.
sale_dates = pd.to_datetime(df['DAY_DATE'], errors='coerce').dt.normalize()

# Append YEAR / MONTH / DAY and drop the original date column in one chain.
df = (
    df.assign(
        YEAR=sale_dates.dt.year,
        MONTH=sale_dates.dt.month,
        DAY=sale_dates.dt.day,
    )
    .drop(columns='DAY_DATE')
)


In [7]:
# Split first and fit every transformer on the training rows only, so that
# nothing learned from the test rows leaks into preprocessing/encoding.
y = df['price_bin']
X = df.drop(columns='price_bin')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise the numeric volume columns: the scaler is fitted on the training
# split and the same fitted scaler is then applied to both splits.
columns_to_scale = ['KG', 'TAILS']
scaler = StandardScaler().fit(X_train[columns_to_scale])
X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [8]:
# creating function to evaluate the model
def results(y_test, predictions):
    """Print MAE, MSE, RMSE and R² for a set of predictions.

    Parameters
    ----------
    y_test : true target values.
    predictions : predicted values, aligned with ``y_test``.

    Returns
    -------
    None. The four metrics are written to stdout, one per line.
    """
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    # RMSE is derived from the MSE rather than computed separately.
    rmse = np.sqrt(mse)

    print(
        f"Mean Absolute Error (MAE): {mae}",
        f"Mean Squared Error (MSE): {mse}",
        f"Root Mean Squared Error (RMSE): {rmse}",
        f"R-squared (R²): {r2}",
        sep="\n",
    )


# dirty_cat (SuperVectorizer)

In [9]:
!pip install dirty_cat
from dirty_cat import SuperVectorizer




In [10]:
# Fit dirty_cat's SuperVectorizer on the training features only, then reuse the
# fitted encoder on the test features (the test split is never used for fitting).
# NOTE(review): X_train_encoded / X_test_encoded are reassigned by the manual
# one-hot encoding cell further down; rename one pair if both are needed.
super_encoder = SuperVectorizer(auto_cast=True)
X_train_encoded = super_encoder.fit_transform(X_train)
X_test_encoded = super_encoder.transform(X_test)



In [13]:
# Display the encoded training matrix as the cell output.
X_train_encoded

array([[0.000e+00, 0.000e+00, 1.000e+00, ..., 2.021e+03, 1.000e+01,
        3.000e+01],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 2.019e+03, 4.000e+00,
        1.200e+01],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 2.022e+03, 2.000e+00,
        1.500e+01],
       ...,
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 2.019e+03, 5.000e+00,
        1.600e+01],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 2.019e+03, 8.000e+00,
        1.600e+01],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 2.019e+03, 3.000e+00,
        1.700e+01]])

# encoding

In [53]:
# encoding of columns

# --- CHICKEN SIZE -> integer codes stored in 'SIZE ENCODE' ---
# The encoder learns its classes from the training split and is reused on test.
label_encoder = LabelEncoder()
X_train['SIZE ENCODE'] = label_encoder.fit_transform(X_train['CHICKEN SIZE'])
X_test['SIZE ENCODE'] = label_encoder.transform(X_test['CHICKEN SIZE'])

# the raw text column is replaced by its encoded version
X_train = X_train.drop(columns=['CHICKEN SIZE'])
X_test = X_test.drop(columns=['CHICKEN SIZE'])


# --- PROVINCE -> one-hot indicator columns ---
# Fitted on the training split only; handle_unknown='ignore' keeps transform
# from raising on categories that were not seen during fit.
encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train[['PROVINCE']])
X_train_encoded = encoder.transform(X_train[['PROVINCE']])
X_test_encoded = encoder.transform(X_test[['PROVINCE']])

# Wrap the encoded matrices in DataFrames that keep the original row index,
# so they line up with X_train / X_test when concatenated.
columns = encoder.get_feature_names_out(['PROVINCE'])
X_train_encoded_df = pd.DataFrame(
    X_train_encoded.toarray(), index=X_train.index, columns=columns
)
X_test_encoded_df = pd.DataFrame(
    X_test_encoded.toarray(), index=X_test.index, columns=columns
)

# swap the raw PROVINCE column for its indicator columns
X_train = pd.concat([X_train.drop(columns='PROVINCE'), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop(columns='PROVINCE'), X_test_encoded_df], axis=1)


# --- UNIT -> one-hot indicator columns (same recipe as PROVINCE) ---
encoder = OneHotEncoder(handle_unknown='ignore').fit(X_train[['UNIT']])
X_train_encoded_unit = encoder.transform(X_train[['UNIT']])
X_test_encoded_unit = encoder.transform(X_test[['UNIT']])

# DataFrames aligned on the original row index, as above
columns = encoder.get_feature_names_out(['UNIT'])
X_train_encoded_df_unit = pd.DataFrame(
    X_train_encoded_unit.toarray(), index=X_train.index, columns=columns
)
X_test_encoded_df_unit = pd.DataFrame(
    X_test_encoded_unit.toarray(), index=X_test.index, columns=columns
)

# swap the raw UNIT column for its indicator columns
X_train = pd.concat([X_train.drop(columns='UNIT'), X_train_encoded_df_unit], axis=1)
X_test = pd.concat([X_test.drop(columns='UNIT'), X_test_encoded_df_unit], axis=1)




In [56]:
# List the training feature names to check the result of the encoding steps.
X_train.columns

Index(['TAILS', 'KG', 'ABW', 'YEAR', 'MONTH', 'DAY', 'SIZE ENCODE',
       'PROVINCE_ACEH', 'PROVINCE_BALI', 'PROVINCE_BANTEN',
       ...
       'UNIT_TASIKMALAYA', 'UNIT_TEBING TINGGI', 'UNIT_TEGAL',
       'UNIT_TEMANGGUNG', 'UNIT_TULANG BAWANG', 'UNIT_TULUNGAGUNG',
       'UNIT_UNGARAN', 'UNIT_WABIN', 'UNIT_WONOGIRI', 'UNIT_YOGYAKARTA'],
      dtype='object', length=164)

# trying lazy predict

In [55]:
pip install lazypredict-nightly

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [14]:
# libraries
import lazypredict
from lazypredict.Supervised import LazyRegressor


In [15]:
# Benchmark lazypredict's regressors on the prepared train/test split.
reg = LazyRegressor(
    verbose=10,
    ignore_warnings=True,
    custom_metric=None,
)

# fit() trains every candidate model and returns a score table plus predictions
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

# Rank the score table by R-Squared (best first) and keep the five leaders.
ranked_models = models.sort_values(by='R-Squared', ascending=False)
top5_models = ranked_models.iloc[:5]
print(top5_models)

  0%|          | 0/42 [00:00<?, ?it/s]

  2%|▏         | 1/42 [00:12<08:39, 12.68s/it]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.1541600505980727, 'Adjusted R-Squared': 0.15406063228638012, 'RMSE': 1.7452790101184326, 'Time taken': 12.682361841201782}


  5%|▍         | 2/42 [00:28<09:45, 14.63s/it]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8469774129283023, 'Adjusted R-Squared': 0.8469594269638556, 'RMSE': 0.7423326098238878, 'Time taken': 15.985158681869507}


  7%|▋         | 3/42 [00:29<05:18,  8.17s/it]

{'Model': 'BayesianRidge', 'R-Squared': 0.08775391813411848, 'Adjusted R-Squared': 0.08764669458033458, 'RMSE': 1.8124948129081688, 'Time taken': 0.48565149307250977}


 10%|▉         | 4/42 [00:31<03:49,  6.03s/it]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.7273477628349609, 'Adjusted R-Squared': 0.7273157158441357, 'RMSE': 0.9908896718778214, 'Time taken': 2.7548534870147705}


 12%|█▏        | 5/42 [00:32<02:26,  3.96s/it]

{'Model': 'DummyRegressor', 'R-Squared': -1.4372201842416388e-05, 'Adjusted R-Squared': -0.00013191186241634512, 'RMSE': 1.897683973457693, 'Time taken': 0.28624987602233887}


 14%|█▍        | 6/42 [00:32<01:38,  2.73s/it]

{'Model': 'ElasticNet', 'R-Squared': 0.005909837394221906, 'Adjusted R-Squared': 0.005792994053225309, 'RMSE': 1.8920545657291967, 'Time taken': 0.35145020484924316}


 17%|█▋        | 7/42 [00:34<01:23,  2.40s/it]

{'Model': 'ElasticNetCV', 'R-Squared': 0.0864098061593711, 'Adjusted R-Squared': 0.08630242462139248, 'RMSE': 1.8138295948889152, 'Time taken': 1.7068915367126465}


 19%|█▉        | 8/42 [00:35<01:04,  1.91s/it]

{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.6874354959119426, 'Adjusted R-Squared': 0.6873987577142334, 'RMSE': 1.0609394468542903, 'Time taken': 0.8674163818359375}


 21%|██▏       | 9/42 [01:29<10:00, 18.19s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.871928720884948, 'Adjusted R-Squared': 0.8719136676466197, 'RMSE': 0.6791201907070983, 'Time taken': 53.99558091163635}


 24%|██▍       | 10/42 [01:30<06:52, 12.89s/it]

{'Model': 'GammaRegressor', 'R-Squared': 0.05963005741664007, 'Adjusted R-Squared': 0.05951952824132245, 'RMSE': 1.840221661638211, 'Time taken': 0.9980003833770752}


 29%|██▊       | 12/42 [02:24<11:26, 22.88s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.40459407146713644, 'Adjusted R-Squared': 0.4045240886621999, 'RMSE': 1.464290842379499, 'Time taken': 54.560240030288696}


 31%|███       | 13/42 [02:27<08:03, 16.69s/it]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.6616713828645566, 'Adjusted R-Squared': 0.6616316164052676, 'RMSE': 1.1037993546500489, 'Time taken': 2.4348886013031006}


 33%|███▎      | 14/42 [02:31<05:57, 12.78s/it]

{'Model': 'HuberRegressor', 'R-Squared': 0.07550612126989331, 'Adjusted R-Squared': 0.07539745813491316, 'RMSE': 1.8246215042158225, 'Time taken': 3.759575843811035}


 36%|███▌      | 15/42 [02:35<04:39, 10.34s/it]

{'Model': 'KNeighborsRegressor', 'R-Squared': 0.536081449734642, 'Adjusted R-Squared': 0.5360269216893979, 'RMSE': 1.2925329949140412, 'Time taken': 4.683312892913818}


 40%|████      | 17/42 [02:36<02:10,  5.21s/it]

{'Model': 'Lars', 'R-Squared': 0.087754637404163, 'Adjusted R-Squared': 0.08764741393492059, 'RMSE': 1.8124940983677023, 'Time taken': 0.3463747501373291}


 43%|████▎     | 18/42 [02:37<01:32,  3.84s/it]

{'Model': 'LarsCV', 'R-Squared': 0.087754637404163, 'Adjusted R-Squared': 0.08764741393492059, 'RMSE': 1.8124940983677023, 'Time taken': 0.6607532501220703}


 45%|████▌     | 19/42 [02:37<01:04,  2.82s/it]

{'Model': 'Lasso', 'R-Squared': 0.0017786302473409288, 'Adjusted R-Squared': 0.0016613013326373105, 'RMSE': 1.8959819586418318, 'Time taken': 0.4327569007873535}


 48%|████▊     | 20/42 [02:39<00:54,  2.48s/it]

{'Model': 'LassoCV', 'R-Squared': 0.0869700631759891, 'Adjusted R-Squared': 0.08686274748948364, 'RMSE': 1.8132733462506656, 'Time taken': 1.6842398643493652}


 50%|█████     | 21/42 [02:39<00:38,  1.85s/it]

{'Model': 'LassoLars', 'R-Squared': 0.0017786302473409288, 'Adjusted R-Squared': 0.0016613013326373105, 'RMSE': 1.8959819586418318, 'Time taken': 0.3941354751586914}


 52%|█████▏    | 22/42 [02:40<00:29,  1.50s/it]

{'Model': 'LassoLarsCV', 'R-Squared': 0.087754637404163, 'Adjusted R-Squared': 0.08764741393492059, 'RMSE': 1.8124940983677023, 'Time taken': 0.6667523384094238}


 55%|█████▍    | 23/42 [02:40<00:22,  1.19s/it]

{'Model': 'LassoLarsIC', 'R-Squared': 0.087754637404163, 'Adjusted R-Squared': 0.08764741393492059, 'RMSE': 1.8124940983677023, 'Time taken': 0.4660615921020508}


 57%|█████▋    | 24/42 [02:41<00:17,  1.05it/s]

{'Model': 'LinearRegression', 'R-Squared': 0.08775463740416312, 'Adjusted R-Squared': 0.0876474139349207, 'RMSE': 1.812494098367702, 'Time taken': 0.4091050624847412}


 60%|█████▉    | 25/42 [03:32<04:33, 16.09s/it]

{'Model': 'LinearSVR', 'R-Squared': 0.07914069417129699, 'Adjusted R-Squared': 0.07903245823664218, 'RMSE': 1.8210312961631099, 'Time taken': 51.4101722240448}


 62%|██████▏   | 26/42 [06:53<19:02, 71.42s/it]

{'Model': 'MLPRegressor', 'R-Squared': 0.42493648786933724, 'Adjusted R-Squared': 0.42486889607075584, 'RMSE': 1.4390592493675831, 'Time taken': 200.5052525997162}


'ExtraTreesRegressor', 
'BaggingRegressor', 
