# forecasting selling price of chicken

Things to note:
* Changing how accuracy is being graded. As long as the model is able to predict the price within 20%, it is considered accurate. 
* Weekends are considered as a holiday as there would be many family gatherings and parties.

## importing the relevant libraries and dataset

In [3]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import numpy as np
import xgboost as xgb


#!pip install category_encoders
import category_encoders as ce
from category_encoders import TargetEncoder


In [4]:
# reading file
df = pd.read_csv('/workspaces/forecasting/data/bigDataHoliday.csv')
df.head()

Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,Holiday
0,PAYAKUMBUH,1267,2856.0,18000.0,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194,1
1,SINJAI,3310,7799.5,22637.8973,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514,1
2,BANDAR JAYA,8236,22197.0,15905.01419,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463,1
3,BANDAR LAMPUNG,2565,4971.0,18060.47073,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463,1
4,BANYUASIN,696,1464.4,16368.88828,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239,1


In [5]:
df.describe()

Unnamed: 0,TAILS,KG,SALES PER KG,ABW,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,Holiday
count,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0
mean,8241.127532,15260.051938,19204.083254,1.829462,20.098439,12.0,15.780671,26170080.0,20402070.0,0.183726
std,7135.717864,13785.152507,2153.773552,0.336812,0.985753,0.0,8.87931,23282170.0,16758780.0,0.387274
min,15.0,24.8,6892.778929,0.5475,19.0,12.0,1.0,83222.0,391395.0,0.0
25%,3235.5,5698.0,17414.197155,1.596241,19.0,12.0,8.0,4424703.0,5141045.0,0.0
50%,6532.0,11683.2,19072.57049,1.821657,20.0,12.0,16.0,12562510.0,11238310.0,0.0
75%,11104.0,20707.45,20576.15414,2.06,21.0,12.0,23.0,49942530.0,34531310.0,0.0
max,103280.0,215666.4,28621.86139,3.481132,23.0,12.0,31.0,64997690.0,51933020.0,1.0


## train test split


In [6]:
# train test split before label encoding to prevent data leakage


X = df.drop('SALES PER KG', axis=1)
y = df['SALES PER KG']



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# checking for any nan values after splitting
print(X_train.isna().sum())
print(X_test.isna().sum())
print(y_train.isna().sum())
print(y_test.isna().sum())




UNIT              0
TAILS             0
KG                0
ABW               0
CHICKEN SIZE      0
PROVINCE          0
YEAR              0
MONTH             0
DAY               0
SupplyProvince    0
DemandProvince    0
Holiday           0
dtype: int64
UNIT              0
TAILS             0
KG                0
ABW               0
CHICKEN SIZE      0
PROVINCE          0
YEAR              0
MONTH             0
DAY               0
SupplyProvince    0
DemandProvince    0
Holiday           0
dtype: int64
0
0


In [7]:
# scaling numerical variable

scaler = StandardScaler()
columns_to_scale = ['KG', 'TAILS', 'SupplyProvince', 'DemandProvince']
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [8]:
X_train

Unnamed: 0,UNIT,TAILS,KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,Holiday
12179,SANGATTA,-0.868153,-0.903847,1.385429,SMALL,KALIMANTAN TIMUR,21,12,17,-1.024406,-0.964375,0
14448,SIJUNJUNG,-0.279093,-0.318778,1.739214,MEDIUM,SUMATERA BARAT,21,12,31,-0.995010,-0.935591,0
13769,BOJONEGORO,-0.943107,-0.881517,2.050574,LARGE,JAWA TIMUR,21,12,27,0.087111,0.845235,1
12070,BENGKULU,0.485797,0.670116,2.089701,LARGE,BENGKULU,21,12,16,-1.104780,-1.153716,0
11502,PALANGKARAYA,-0.470844,-0.300075,2.272398,LARGE,KALIMANTAN TENGAH,21,12,13,-1.060115,-1.051460,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5191,GIANYAR,-0.105658,-0.134086,1.791001,MEDIUM,BALI,20,12,1,-0.918898,-0.925787,0
13418,50 KOTA,0.028610,-0.052511,1.721377,MEDIUM,SUMATERA BARAT,21,12,24,-0.995010,-0.935591,1
5390,CIREBON,0.012689,0.020821,1.864943,MEDIUM,JAWA BARAT,20,12,2,1.468594,1.720036,0
860,SUBANG(P),1.917237,1.183325,1.440357,SMALL,JAWA BARAT,19,12,5,1.665132,1.884218,0


# FUNCTIONS
- results(y_test, predictions):
- accuracy(y_test, predictions):

In [9]:
# defining new accuracy metrics
import math
def accuracy(y_test, predictions):
    cnt = 0
    length = len(predictions)

    for i in range(length):
        above = math.ceil(predictions[i] + (predictions[i] * 0.1))
        below = math.ceil(predictions[i] - (predictions[i] * 0.1))

        if (predictions[i] == y_test[i]):
            cnt += 1
        elif (y_test[i] >= predictions[i] and y_test[i] <= above):
            cnt += 1
        elif (y_test[i] <= predictions[i] and y_test[i] >= below):
            cnt += 1
    actual = cnt / length
    return actual

In [10]:
# creating function to evaluate the model
def results(y_test, predictions):
  mae = mean_absolute_error(y_test, predictions)
  mse = mean_squared_error(y_test, predictions)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, predictions)

  y_test_list = np.array(y_test).tolist()
  predictions_list = predictions.tolist()
  acc = accuracy(y_test_list, predictions_list)

  print(f"Mean Absolute Error (MAE): {mae}")
  print(f"Mean Squared Error (MSE): {mse}")
  print(f"Root Mean Squared Error (RMSE): {rmse}")
  print(f"R-squared (R²): {r2}")
  print(f"Re-Defined Accuracy: {acc}")

# encoding

In [11]:
# encoding of columns
# label encoding for province

label_encoder = LabelEncoder()
X_train['SIZE ENCODE'] = label_encoder.fit_transform(X_train['CHICKEN SIZE'])
X_test['SIZE ENCODE'] = label_encoder.transform(X_test['CHICKEN SIZE'])

# dropping chicken size column
X_train = X_train.drop(['CHICKEN SIZE'], axis = 1)
X_test = X_test.drop(['CHICKEN SIZE'], axis = 1)


# one hot encoding for province

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[['PROVINCE']])
X_train_encoded = encoder.transform(X_train[['PROVINCE']])
X_test_encoded = encoder.transform(X_test[['PROVINCE']])

# Convert the encoded matrices back to dataframes for easier manipulation
columns = encoder.get_feature_names_out(['PROVINCE'])
X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=columns, index=X_test.index)

# merge back the dataframe
X_train = pd.concat([X_train.drop('PROVINCE', axis=1), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop('PROVINCE', axis=1), X_test_encoded_df], axis=1)


# one hot encoding for unit
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[['UNIT']])
X_train_encoded_unit = encoder.transform(X_train[['UNIT']])
X_test_encoded_unit = encoder.transform(X_test[['UNIT']])

# Convert the encoded matrices back to dataframes for easier manipulation
columns = encoder.get_feature_names_out(['UNIT'])
X_train_encoded_df_unit = pd.DataFrame(X_train_encoded_unit.toarray(), columns=columns, index=X_train.index)
X_test_encoded_df_unit = pd.DataFrame(X_test_encoded_unit.toarray(), columns=columns, index=X_test.index)

# merge back the dataframe, dropping unit column
X_train = pd.concat([X_train.drop('UNIT', axis=1), X_train_encoded_df_unit], axis=1)
X_test = pd.concat([X_test.drop('UNIT', axis=1), X_test_encoded_df_unit], axis=1)




# trying lazy predict

In [17]:
pip install lazypredict

Note: you may need to restart the kernel to use updated packages.


In [18]:
# libraries
import lazypredict
from lazypredict.Supervised import LazyClassifier
'''from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin'''


'from sklearn.utils import all_estimators\nfrom sklearn.base import RegressorMixin'

In [19]:
print(lazypredict.Supervised.CLASSIFIERS)

[('AdaBoostClassifier', <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>), ('BaggingClassifier', <class 'sklearn.ensemble._bagging.BaggingClassifier'>), ('BernoulliNB', <class 'sklearn.naive_bayes.BernoulliNB'>), ('CalibratedClassifierCV', <class 'sklearn.calibration.CalibratedClassifierCV'>), ('CategoricalNB', <class 'sklearn.naive_bayes.CategoricalNB'>), ('DecisionTreeClassifier', <class 'sklearn.tree._classes.DecisionTreeClassifier'>), ('DummyClassifier', <class 'sklearn.dummy.DummyClassifier'>), ('ExtraTreeClassifier', <class 'sklearn.tree._classes.ExtraTreeClassifier'>), ('ExtraTreesClassifier', <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>), ('GaussianNB', <class 'sklearn.naive_bayes.GaussianNB'>), ('KNeighborsClassifier', <class 'sklearn.neighbors._classification.KNeighborsClassifier'>), ('LabelPropagation', <class 'sklearn.semi_supervised._label_propagation.LabelPropagation'>), ('LabelSpreading', <class 'sklearn.semi_supervised._label_propagation.LabelSp

In [20]:
'''
    chosen_regressors = [
    'SVR', 
    'BaggingRegressor',
    'ExtraTreesRegressor',
    'RandomForestRegressor',
    'GradientBoostingRegressor',
    'LGBMRegressor',
    'XGBRegressor',
    'CatBoostRegressor',
    'HistGradientBoostingRegressor',
    'AdaBoostRegressor',
    'KNeighborsRegressor',
    'DecisionTreeRegressor'
]

REGRESSORS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and est[0] in chosen_regressors)

]'''

# initialising the regressor with chosen regressors
reg = LazyClassifier(verbose=1, ignore_warnings=False, custom_metric=None)


In [21]:

# fitting and evaluate models
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

  0%|          | 0/29 [00:00<?, ?it/s]

  3%|▎         | 1/29 [00:01<00:28,  1.00s/it]

ROC AUC couldn't be calculated for AdaBoostClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'AdaBoostClassifier', 'Accuracy': 0.46496180670873466, 'Balanced Accuracy': 0.4645024031811561, 'ROC AUC': None, 'F1 Score': 0.4643338200488807, 'Time taken': 1.002105474472046}


 10%|█         | 3/29 [00:01<00:16,  1.59it/s]

ROC AUC couldn't be calculated for BaggingClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'BaggingClassifier', 'Accuracy': 0.7356360013284623, 'Balanced Accuracy': 0.734941901152864, 'ROC AUC': None, 'F1 Score': 0.7360805580806883, 'Time taken': 0.7636623382568359}
ROC AUC couldn't be calculated for BernoulliNB
multi_class must be in ('ovo', 'ovr')
{'Model': 'BernoulliNB', 'Accuracy': 0.42909332447691795, 'Balanced Accuracy': 0.42601129936614723, 'ROC AUC': None, 'F1 Score': 0.3898159581942892, 'Time taken': 0.08462786674499512}





KeyboardInterrupt: 

In [None]:
# filtering out the top 5 models
top_5_models = models.head(5)
top_5_models

In [None]:
y_train.nunique()

# Linear Regression

In [18]:
#Applying Logistic Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

# predictions
predictions_lr = lr.predict(X_test)

# print metrics
print(results(y_test, predictions_lr))



'''# Use the custom accuracy function
accuracy_score = accuracy(y_test_list, predictions_list)
print(f"Custom Accuracy Score: {accuracy_score}")'''

Mean Absolute Error (MAE): 1096.9887937565593
Mean Squared Error (MSE): 2204010.1056202706
Root Mean Squared Error (RMSE): 1484.5908882989518
R-squared (R²): 0.5373888190756129
Re-Defined Accuracy: 0.8465626037861176
None


'# Use the custom accuracy function\naccuracy_score = accuracy(y_test_list, predictions_list)\nprint(f"Custom Accuracy Score: {accuracy_score}")'

# Random Forest



In [19]:
# random forest model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 400) # 400 trees
rf.fit(X_train, y_train)

# predictions
predictions_rf = rf.predict(X_test)

# print accuracy score
print(results(y_test, predictions_rf))

Mean Absolute Error (MAE): 481.8099547746834
Mean Squared Error (MSE): 690170.1718080787
Root Mean Squared Error (RMSE): 830.7648113684635
R-squared (R²): 0.855136581540733
Re-Defined Accuracy: 0.9637994021919628
None


# Gradient Boosting

## XGBoost

In [20]:
# applying xgboost
xg = xgb.XGBRegressor()

# fitting the model
xg.fit(X_train, y_train)

# predictions
predictions_xg = xg.predict(X_test) 

# print accuracy score
print(results(y_test, predictions_xg))

Mean Absolute Error (MAE): 603.0214842705082
Mean Squared Error (MSE): 853160.8484241929
Root Mean Squared Error (RMSE): 923.6670657895045
R-squared (R²): 0.8209256179898987
Re-Defined Accuracy: 0.9554965127864496
None


# Decision Tree

In [12]:
# applying decision trees
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)

# predictions
predictions_dt = dt.predict(X_test)
print(results(y_test, predictions_dt))

Mean Absolute Error (MAE): 622.3767745532588
Mean Squared Error (MSE): 1301568.8692961177
Root Mean Squared Error (RMSE): 1140.8632123511204
R-squared (R²): 0.7268069188321427
Re-Defined Accuracy: 0.9302557289936898
None


# SVM

In [14]:
# applying SVM
from sklearn.svm import SVR
svm = SVR()
svm.fit(X_train, y_train)

# predictions
predictions_SVM = svm.predict(X_test)
print(results(y_test, predictions_SVM))

Mean Absolute Error (MAE): 1714.9721868000245
Mean Squared Error (MSE): 4621680.866075736
Root Mean Squared Error (RMSE): 2149.8094952985334
R-squared (R²): 0.029931288491422237
Re-Defined Accuracy: 0.6021255396878114
None


# SVM Classifier with Linear
the worst results

In [17]:
# appling linear SVM Classifier
from sklearn import svm

svm_clf = svm.LinearSVC(multi_class='ovr', max_iter = 1000)
svm_clf.fit(X_train, y_train)

# predictions
predictions_svmClf = svm_clf.predict(X_test)
print(classification_report(predictions_svmClf, y_test))

# print accuracy score
acc_svmClf = metrics.accuracy_score(y_test, predictions_svmClf)
print(f"Accuracy: {acc_svmClf}")





              precision    recall  f1-score   support

           0       0.47      0.70      0.56       429
           1       0.37      0.46      0.41       473
           2       0.26      0.35      0.30       440
           3       0.67      0.35      0.46      1138
           4       0.51      0.57      0.54       531

    accuracy                           0.46      3011
   macro avg       0.46      0.49      0.45      3011
weighted avg       0.50      0.46      0.46      3011

Accuracy: 0.4559946861507805




# RBF Kernel 

In [19]:
# applying a RBF Kernel

from sklearn.svm import SVC
rbf_svc = SVC(kernel='rbf', random_state = 0, gamma=0.1, C = 1)
rbf_svc.fit(X_train, y_train)  

# predictions
predictions_rbf = rbf_svc.predict(X_test)
print(classification_report(predictions_rbf, y_test))

# print accuracy score
acc_rbf = metrics.accuracy_score(y_test, predictions_rbf)   
print(f"Accuracy: {acc_rbf}")



              precision    recall  f1-score   support

           0       0.70      0.79      0.74       564
           1       0.66      0.58      0.61       673
           2       0.60      0.63      0.62       567
           3       0.65      0.70      0.68       554
           4       0.83      0.75      0.79       653

    accuracy                           0.69      3011
   macro avg       0.69      0.69      0.69      3011
weighted avg       0.69      0.69      0.69      3011

Accuracy: 0.6878113583527068


# Neural Network
using entity embeddings to train the neural network


## setting up the model
creating entity embeddings <br>
`cat_col1` = `PROVINCE` <br>
`cat_col2` = `CHICKEN SIZE` <br>
`cat_col3` = `SIZE` <br>

In [None]:
# original dataset `df`

# importing libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Reshape, Concatenate, Dense, Flatten

# getting the unique values of the categorical columns
cat_columns = ['PROVINCE', 'CHICKEN SIZE', 'UNIT']
unique_cat1 = df['PROVINCE'].nunique()
unique_cat2 = df['CHICKEN SIZE'].nunique()
unique_cat3 = df['UNIT'].nunique()

# getting numerical columns
num_columns = df.columns.difference(cat_columns)
numerical_data = df[num_columns]

# scaling the numerical features
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(numerical_data)

# defining the input layer
num_input = Input(shape=(numerical_data_scaled.shape[1],), name='num_input')

# embedding the categorical columns
cat_input1 = Input(shape = (1,), name='cat_input1')
cat_embed1 = Embedding(input_dim=unique_cat1, output_dim = int(min(np.ceil(unique_cat1 /2), 50)))(cat_input1)
cat_embed1 = Flatten()(cat_embed1)

cat_input2 = Input(shape = (1,), name='cat_input2')
cat_embed2 = Embedding(input_dim=unique_cat2, output_dim = int(min(np.ceil(unique_cat2 /2), 50)))(cat_input2)
cat_embed2 = Flatten()(cat_embed2)


cat_input3 = Input(shape = (1,), name='cat_input3')
cat_embed3 = Embedding(input_dim=unique_cat3, output_dim = int(min(np.ceil(unique_cat3 /2), 50)))(cat_input3)
cat_embed3 = Flatten()(cat_embed3)


# concatenate the embeddings with numerical input
concatenated = Concatenate()([cat_embed1, cat_embed2, cat_embed3, num_input])


# adding the dense layer ontop of the embeddings
dense_output = Dense(128, activation = 'relu')(concatenated)
dense_output = Dense(64, activation = 'relu')(dense_output)
output = Dense(1)(dense_output)

# creating the model
model = Model(inputs = [cat_input1, cat_input2, cat_input3, num_input], outputs = output)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()


## training the model


In [None]:
# train model
X_train_cat1 = X_train['PROVINCE']
X_train_cat2 = X_train['CHICKEN SIZE']
X_train_cat3 = X_train['UNIT']
X_train_num = X_train.drop(['PROVINCE', 'CHICKEN SIZE', 'UNIT'], axis = 1)

# fitting the model


In [None]:
df

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np


# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Single output node for regression
])

# Compile the model, specifying the optimizer, loss function, and metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Fit the model on the training data
history = model.fit(X_train, y_train, validation_split=0.1, epochs=100, verbose=1)

# Predict the target on the testing set
predictions = model.predict(X_test).flatten()

# Evaluate the model performance using RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")

# You can plot the training history to check how the loss and metric evolved over epochs
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.legend()
plt.show()
