# forecasting selling price of chicken with Bins

## importing the relevant libraries and dataset

In [1]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import numpy as np
import xgboost as xgb


#!pip install category_encoders
import category_encoders as ce
from category_encoders import TargetEncoder


In [2]:
# reading file
df = pd.read_csv('/workspaces/forecasting/data/final.csv')
df.head()

Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince
0,PAYAKUMBUH,1267,2856.0,18000.0,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194
1,SINJAI,3310,7799.5,22637.8973,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514
2,BANDAR JAYA,8236,22197.0,15905.01419,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463
3,BANDAR LAMPUNG,2565,4971.0,18060.47073,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463
4,BANYUASIN,696,1464.4,16368.88828,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239


In [3]:
df.describe()

Unnamed: 0,TAILS,KG,SALES PER KG,ABW,YEAR,MONTH,DAY,SupplyProvince,DemandProvince
count,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0,15055.0
mean,8241.127532,15260.051938,19204.083254,1.829462,20.098439,12.0,15.780671,26170080.0,20402070.0
std,7135.717864,13785.152507,2153.773552,0.336812,0.985753,0.0,8.87931,23282170.0,16758780.0
min,15.0,24.8,6892.778929,0.5475,19.0,12.0,1.0,83222.0,391395.0
25%,3235.5,5698.0,17414.197155,1.596241,19.0,12.0,8.0,4424703.0,5141045.0
50%,6532.0,11683.2,19072.57049,1.821657,20.0,12.0,16.0,12562510.0,11238310.0
75%,11104.0,20707.45,20576.15414,2.06,21.0,12.0,23.0,49942530.0,34531310.0
max,103280.0,215666.4,28621.86139,3.481132,23.0,12.0,31.0,64997690.0,51933020.0


## finding the best number of bins


In [4]:

# Assuming df is your DataFrame
prices = df['SALES PER KG']

# Define the scoring function and the range of bin numbers to evaluate
scoring_function = make_scorer(f1_score, average='weighted')
bin_range = range(5, 16)  # Example range

cv_results = {}

for bins in bin_range:
    # making a copy of the original dataframe such that the original dataframe is not affected
    dfCopy = df.copy()
    # Bin the data
    dfCopy['price_bin'], bin_edges = pd.qcut(prices, q=bins, retbins=True, labels=False, duplicates='drop')
    
    # Prepare features and target
    X = dfCopy.drop(['price_bin', 'SALES PER KG'], axis=1)  # Exclude target variable and original price column
    y = dfCopy['price_bin']

    # Define preprocessing for categorical variables (encode 'UNIT', 'CHICKEN SIZE', 'PROVINCE')
    categorical_features = ['UNIT', 'CHICKEN SIZE', 'PROVINCE']
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)],
        remainder='passthrough'  # Passthrough numerical features as is
    )

    # Create the modeling pipeline
    model = Pipeline(steps=[('preprocessor', preprocessor),
                            ('classifier', xgb.XGBClassifier(objective='multi:softmax',
                                                             num_class=bins,
                                                             max_depth=5,
                                                             eval_metric='mlogloss',
                                                             use_label_encoder=False,
                                                             seed=42))])

    # Perform cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring=scoring_function)
    
    # Store the average cross-validation score
    cv_results[bins] = np.mean(cv_scores)

# Find the number of bins with the best average F1 score
best_bin_number = max(cv_results, key=cv_results.get)
print(f"Best number of bins: {best_bin_number} with F1 Score: {cv_results[best_bin_number]:.4f}")


Best number of bins: 5 with F1 Score: 0.7085


Best number of bins is 5, highest F1 score of 0.7085

In [5]:
# performing quantile based binning
df['price_bin'], bin_edges = pd.qcut(prices, q=best_bin_number, retbins=True, labels=False, duplicates='drop')
df.drop(['SALES PER KG'], axis=1, inplace=True) # redundant 
df

Unnamed: 0,UNIT,TAILS,KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,price_bin
0,PAYAKUMBUH,1267,2856.0,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194,1
1,SINJAI,3310,7799.5,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514,4
2,BANDAR JAYA,8236,22197.0,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463,0
3,BANDAR LAMPUNG,2565,4971.0,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463,1
4,BANYUASIN,696,1464.4,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15050,BOYOLALI,430,901.4,2.100000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0
15051,BOYOLALI,294,605.0,2.060000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0
15052,BOYOLALI,432,962.4,2.230000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0
15053,GUNUNGKIDUL,200,370.8,1.850000,MEDIUM,DI YOGYAKARTA,23,12,31,3993820,5186026,0


In [6]:
# Creating bin labels and ranges for better understanding
bin_labels = [f"{bin_edges[i]:.2f}-{bin_edges[i+1]:.2f}" for i in range(len(bin_edges)-1)]
print("Bin ranges:", bin_labels)

Bin ranges: ['6892.78-17150.00', '17150.00-18496.56', '18496.56-19619.88', '19619.88-20973.08', '20973.08-28621.86']


In [7]:
df

Unnamed: 0,UNIT,TAILS,KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,price_bin
0,PAYAKUMBUH,1267,2856.0,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194,1
1,SINJAI,3310,7799.5,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514,4
2,BANDAR JAYA,8236,22197.0,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463,0
3,BANDAR LAMPUNG,2565,4971.0,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463,1
4,BANYUASIN,696,1464.4,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15050,BOYOLALI,430,901.4,2.100000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0
15051,BOYOLALI,294,605.0,2.060000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0
15052,BOYOLALI,432,962.4,2.230000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0
15053,GUNUNGKIDUL,200,370.8,1.850000,MEDIUM,DI YOGYAKARTA,23,12,31,3993820,5186026,0


## getting holiday data

In [8]:
# reading holiday dataset
holiday = pd.read_csv('/workspaces/forecasting/data/holiday.csv')
holiday.head()

Unnamed: 0.1,Unnamed: 0,Date,Holiday
0,,01/12/19,True
1,,02/12/19,False
2,,03/12/19,False
3,,04/12/19,False
4,,05/12/19,False


* Every Sunday is marked as a holiday as well as the whole month of Ramadan

In [9]:
holiday.dtypes

Unnamed: 0    float64
Date           object
Holiday          bool
dtype: object

In [10]:
# dropping unnecessary columns
holiday = holiday.drop('Unnamed: 0', axis=1)

# changing the date column to datetime
holiday['Date'] = pd.to_datetime(holiday['Date'], format='%d/%m/%y')

# splitting the date column into day, month and year
holiday['DAY'] = holiday['Date'].dt.day
holiday['MONTH'] = holiday['Date'].dt.month
holiday['YEAR'] = holiday['Date'].dt.year
holiday['YEAR'] = holiday['YEAR'] % 2000

# changing the holiday column to object
holiday['Holiday'] = holiday['Holiday'].astype(str)

# changing values of holiday to numerical
holiday['Holiday'] = holiday['Holiday'].map({'True': 1, 'False': 0})

# drop date column
holiday = holiday.drop('Date', axis=1)
holiday.head()



Unnamed: 0,Holiday,DAY,MONTH,YEAR
0,1,1,12,19
1,0,2,12,19
2,0,3,12,19
3,0,4,12,19
4,0,5,12,19


In [11]:
# merging with main dataset, specifying custom suffixes
df = pd.merge(df, holiday, left_on=['DAY', 'MONTH', 'YEAR'], right_on=['DAY', 'MONTH', 'YEAR'], how='left', suffixes=('', '_holiday'))
df

Unnamed: 0,UNIT,TAILS,KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,price_bin,Holiday
0,PAYAKUMBUH,1267,2856.0,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194,1,1
1,SINJAI,3310,7799.5,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514,4,1
2,BANDAR JAYA,8236,22197.0,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463,0,1
3,BANDAR LAMPUNG,2565,4971.0,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463,1,1
4,BANYUASIN,696,1464.4,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15050,BOYOLALI,430,901.4,2.100000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0,1
15051,BOYOLALI,294,605.0,2.060000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0,1
15052,BOYOLALI,432,962.4,2.230000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,0,1
15053,GUNUNGKIDUL,200,370.8,1.850000,MEDIUM,DI YOGYAKARTA,23,12,31,3993820,5186026,0,1


In [25]:
# exporting data with holiday
df.to_csv('/workspaces/forecasting/data/final_holiday.csv', index=False)

## train test split


In [12]:
# train test split before label encoding to prevent data leakage


X = df.drop('price_bin', axis=1)
y = df['price_bin']



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# checking for any nan values after splitting
print(X_train.isna().sum())
print(X_test.isna().sum())
print(y_train.isna().sum())
print(y_test.isna().sum())




UNIT              0
TAILS             0
KG                0
ABW               0
CHICKEN SIZE      0
PROVINCE          0
YEAR              0
MONTH             0
DAY               0
SupplyProvince    0
DemandProvince    0
Holiday           0
dtype: int64
UNIT              0
TAILS             0
KG                0
ABW               0
CHICKEN SIZE      0
PROVINCE          0
YEAR              0
MONTH             0
DAY               0
SupplyProvince    0
DemandProvince    0
Holiday           0
dtype: int64
0
0


In [13]:
# scaling numerical variable

scaler = StandardScaler()
columns_to_scale = ['KG', 'TAILS', 'SupplyProvince', 'DemandProvince']
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [14]:
X_train

Unnamed: 0,UNIT,TAILS,KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,Holiday
12179,SANGATTA,-0.868153,-0.903847,1.385429,SMALL,KALIMANTAN TIMUR,21,12,17,-1.024406,-0.964375,0
14448,SIJUNJUNG,-0.279093,-0.318778,1.739214,MEDIUM,SUMATERA BARAT,21,12,31,-0.995010,-0.935591,0
13769,BOJONEGORO,-0.943107,-0.881517,2.050574,LARGE,JAWA TIMUR,21,12,27,0.087111,0.845235,1
12070,BENGKULU,0.485797,0.670116,2.089701,LARGE,BENGKULU,21,12,16,-1.104780,-1.153716,0
11502,PALANGKARAYA,-0.470844,-0.300075,2.272398,LARGE,KALIMANTAN TENGAH,21,12,13,-1.060115,-1.051460,0
...,...,...,...,...,...,...,...,...,...,...,...,...
5191,GIANYAR,-0.105658,-0.134086,1.791001,MEDIUM,BALI,20,12,1,-0.918898,-0.925787,0
13418,50 KOTA,0.028610,-0.052511,1.721377,MEDIUM,SUMATERA BARAT,21,12,24,-0.995010,-0.935591,1
5390,CIREBON,0.012689,0.020821,1.864943,MEDIUM,JAWA BARAT,20,12,2,1.468594,1.720036,0
860,SUBANG(P),1.917237,1.183325,1.440357,SMALL,JAWA BARAT,19,12,5,1.665132,1.884218,0


In [15]:
# creating function to evaluate the model
def results(y_test, predictions):
  mae = mean_absolute_error(y_test, predictions)
  mse = mean_squared_error(y_test, predictions)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, predictions)

  print(f"Mean Absolute Error (MAE): {mae}")
  print(f"Mean Squared Error (MSE): {mse}")
  print(f"Root Mean Squared Error (RMSE): {rmse}")
  print(f"R-squared (R²): {r2}")


# encoding

In [16]:
# encoding of columns
# label encoding for province

label_encoder = LabelEncoder()
X_train['SIZE ENCODE'] = label_encoder.fit_transform(X_train['CHICKEN SIZE'])
X_test['SIZE ENCODE'] = label_encoder.transform(X_test['CHICKEN SIZE'])

# dropping chicken size column
X_train = X_train.drop(['CHICKEN SIZE'], axis = 1)
X_test = X_test.drop(['CHICKEN SIZE'], axis = 1)


# one hot encoding for province

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[['PROVINCE']])
X_train_encoded = encoder.transform(X_train[['PROVINCE']])
X_test_encoded = encoder.transform(X_test[['PROVINCE']])

# Convert the encoded matrices back to dataframes for easier manipulation
columns = encoder.get_feature_names_out(['PROVINCE'])
X_train_encoded_df = pd.DataFrame(X_train_encoded.toarray(), columns=columns, index=X_train.index)
X_test_encoded_df = pd.DataFrame(X_test_encoded.toarray(), columns=columns, index=X_test.index)

# merge back the dataframe
X_train = pd.concat([X_train.drop('PROVINCE', axis=1), X_train_encoded_df], axis=1)
X_test = pd.concat([X_test.drop('PROVINCE', axis=1), X_test_encoded_df], axis=1)


# one hot encoding for unit
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(X_train[['UNIT']])
X_train_encoded_unit = encoder.transform(X_train[['UNIT']])
X_test_encoded_unit = encoder.transform(X_test[['UNIT']])

# Convert the encoded matrices back to dataframes for easier manipulation
columns = encoder.get_feature_names_out(['UNIT'])
X_train_encoded_df_unit = pd.DataFrame(X_train_encoded_unit.toarray(), columns=columns, index=X_train.index)
X_test_encoded_df_unit = pd.DataFrame(X_test_encoded_unit.toarray(), columns=columns, index=X_test.index)

# merge back the dataframe, dropping unit column
X_train = pd.concat([X_train.drop('UNIT', axis=1), X_train_encoded_df_unit], axis=1)
X_test = pd.concat([X_test.drop('UNIT', axis=1), X_test_encoded_df_unit], axis=1)




# trying lazy predict

In [17]:
pip install lazypredict

Note: you may need to restart the kernel to use updated packages.


In [18]:
# libraries
import lazypredict
from lazypredict.Supervised import LazyClassifier
'''from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin'''


'from sklearn.utils import all_estimators\nfrom sklearn.base import RegressorMixin'

In [19]:
print(lazypredict.Supervised.CLASSIFIERS)

[('AdaBoostClassifier', <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>), ('BaggingClassifier', <class 'sklearn.ensemble._bagging.BaggingClassifier'>), ('BernoulliNB', <class 'sklearn.naive_bayes.BernoulliNB'>), ('CalibratedClassifierCV', <class 'sklearn.calibration.CalibratedClassifierCV'>), ('CategoricalNB', <class 'sklearn.naive_bayes.CategoricalNB'>), ('DecisionTreeClassifier', <class 'sklearn.tree._classes.DecisionTreeClassifier'>), ('DummyClassifier', <class 'sklearn.dummy.DummyClassifier'>), ('ExtraTreeClassifier', <class 'sklearn.tree._classes.ExtraTreeClassifier'>), ('ExtraTreesClassifier', <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>), ('GaussianNB', <class 'sklearn.naive_bayes.GaussianNB'>), ('KNeighborsClassifier', <class 'sklearn.neighbors._classification.KNeighborsClassifier'>), ('LabelPropagation', <class 'sklearn.semi_supervised._label_propagation.LabelPropagation'>), ('LabelSpreading', <class 'sklearn.semi_supervised._label_propagation.LabelSp

In [20]:
'''
    chosen_regressors = [
    'SVR', 
    'BaggingRegressor',
    'ExtraTreesRegressor',
    'RandomForestRegressor',
    'GradientBoostingRegressor',
    'LGBMRegressor',
    'XGBRegressor',
    'CatBoostRegressor',
    'HistGradientBoostingRegressor',
    'AdaBoostRegressor',
    'KNeighborsRegressor',
    'DecisionTreeRegressor'
]

REGRESSORS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and est[0] in chosen_regressors)

]'''

# initialising the regressor with chosen regressors
reg = LazyClassifier(verbose=1, ignore_warnings=False, custom_metric=None)


In [21]:

# fitting and evaluate models
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
models

  0%|          | 0/29 [00:00<?, ?it/s]

  3%|▎         | 1/29 [00:01<00:28,  1.00s/it]

ROC AUC couldn't be calculated for AdaBoostClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'AdaBoostClassifier', 'Accuracy': 0.46496180670873466, 'Balanced Accuracy': 0.4645024031811561, 'ROC AUC': None, 'F1 Score': 0.4643338200488807, 'Time taken': 1.002105474472046}


 10%|█         | 3/29 [00:01<00:16,  1.59it/s]

ROC AUC couldn't be calculated for BaggingClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'BaggingClassifier', 'Accuracy': 0.7356360013284623, 'Balanced Accuracy': 0.734941901152864, 'ROC AUC': None, 'F1 Score': 0.7360805580806883, 'Time taken': 0.7636623382568359}
ROC AUC couldn't be calculated for BernoulliNB
multi_class must be in ('ovo', 'ovr')
{'Model': 'BernoulliNB', 'Accuracy': 0.42909332447691795, 'Balanced Accuracy': 0.42601129936614723, 'ROC AUC': None, 'F1 Score': 0.3898159581942892, 'Time taken': 0.08462786674499512}





KeyboardInterrupt: 

In [None]:
# filtering out the top 5 models
top_5_models = models.head(5)
top_5_models

In [None]:
y_train.nunique()

# Logistic Regression

In [33]:
#Applying Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(multi_class = 'multinomial', solver = 'saga',  max_iter = 10000)
lr.fit(X_train, y_train)

# predictions
predictions_lr = lr.predict(X_test)
print(classification_report(predictions, y_test))

# print accuracy score
acc_lr = metrics.accuracy_score(y_test, predictions)
print(f"Accuracy: {acc_lr}")

              precision    recall  f1-score   support

           0       0.64      0.60      0.62       672
           1       0.41      0.41      0.41       582
           2       0.39      0.42      0.40       552
           3       0.39      0.47      0.43       503
           4       0.66      0.56      0.61       702

    accuracy                           0.50      3011
   macro avg       0.50      0.49      0.49      3011
weighted avg       0.51      0.50      0.50      3011

Accuracy: 0.4995018266356692


# Random Forest



In [34]:
# random forest model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 400) # 400 trees
rf.fit(X_train, y_train)

# predictions
predictions_rf = rf.predict(X_test)
print(classification_report(predictions_rf, y_test))

# print accuracy score
acc_rf = metrics.accuracy_score(y_test, predictions_rf)
print(f"Accuracy: {acc_rf}")

              precision    recall  f1-score   support

           0       0.77      0.78      0.77       620
           1       0.65      0.68      0.66       567
           2       0.70      0.65      0.67       642
           3       0.70      0.73      0.72       579
           4       0.86      0.85      0.85       603

    accuracy                           0.74      3011
   macro avg       0.74      0.74      0.74      3011
weighted avg       0.74      0.74      0.74      3011

Accuracy: 0.7363002324809034


# Gradient Boosting

## XGBoost

In [22]:
# applying xgboost
xgb = xgb.XGBClassifier(objective='multi:softmax', num_class=best_bin_number, max_depth=5, eval_metric='mlogloss', use_label_encoder=False, seed=42)
xgb.fit(X_train, y_train)

# predictions
predictions_xgb = xgb.predict(X_test)
print(classification_report(predictions_xgb, y_test))

# accuracy
acc_xgb = metrics.accuracy_score(y_test, predictions_xgb)
print(f"Accuracy: {acc_xgb}")

              precision    recall  f1-score   support

           0       0.75      0.77      0.76       619
           1       0.64      0.64      0.64       594
           2       0.65      0.67      0.66       575
           3       0.68      0.67      0.68       610
           4       0.84      0.82      0.83       613

    accuracy                           0.71      3011
   macro avg       0.71      0.71      0.71      3011
weighted avg       0.72      0.71      0.71      3011

Accuracy: 0.7140484888741282


## CatBoost

In [24]:
# applying Catboost
from catboost import CatBoostClassifier
cat = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, loss_function='MultiClass')

# predictions
cat.fit(X_train, y_train)
predictions_cat = cat.predict(X_test)

# accuracy_score
acc_cat = metrics.accuracy_score(y_test, predictions_cat)
print(f"Accuracy: {acc_cat}")



0:	learn: 1.5498542	total: 56.2ms	remaining: 56.1s
1:	learn: 1.4957460	total: 65.5ms	remaining: 32.7s
2:	learn: 1.4495605	total: 80.6ms	remaining: 26.8s
3:	learn: 1.4103174	total: 87.8ms	remaining: 21.9s
4:	learn: 1.3732804	total: 95.1ms	remaining: 18.9s
5:	learn: 1.3464259	total: 102ms	remaining: 16.9s
6:	learn: 1.3194497	total: 109ms	remaining: 15.5s
7:	learn: 1.2966463	total: 117ms	remaining: 14.4s
8:	learn: 1.2722093	total: 125ms	remaining: 13.7s
9:	learn: 1.2539168	total: 132ms	remaining: 13.1s
10:	learn: 1.2354226	total: 139ms	remaining: 12.5s
11:	learn: 1.2177092	total: 147ms	remaining: 12.1s
12:	learn: 1.2046153	total: 157ms	remaining: 11.9s
13:	learn: 1.1908928	total: 164ms	remaining: 11.5s
14:	learn: 1.1776773	total: 171ms	remaining: 11.2s
15:	learn: 1.1653959	total: 178ms	remaining: 10.9s
16:	learn: 1.1535093	total: 185ms	remaining: 10.7s
17:	learn: 1.1422866	total: 192ms	remaining: 10.5s
18:	learn: 1.1325437	total: 199ms	remaining: 10.3s
19:	learn: 1.1233001	total: 206ms	re

# KNN Classification

In [35]:
# applying KNN classification
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# predictions 
predictions_knn = knn.predict(X_test)
print(classification_report(predictions_knn, y_test))

# print accuracy score
acc_knn = metrics.accuracy_score(y_test, predictions_knn)
print(f"Accuracy: {acc_knn}")

              precision    recall  f1-score   support

           0       0.74      0.68      0.71       694
           1       0.60      0.57      0.58       627
           2       0.58      0.62      0.60       557
           3       0.60      0.67      0.64       539
           4       0.77      0.76      0.76       594

    accuracy                           0.66      3011
   macro avg       0.66      0.66      0.66      3011
weighted avg       0.66      0.66      0.66      3011

Accuracy: 0.6592494187977416


# Decision Tree

In [21]:
# applying decision trees
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# predictions
predictions_dt = dt.predict(X_test)
print(classification_report(predictions_dt, y_test))

# print accuracy score
acc_dt = metrics.accuracy_score(y_test, predictions_dt)
print(f"Accuracy: {acc_dt}")

              precision    recall  f1-score   support

           0       0.72      0.75      0.74       605
           1       0.63      0.61      0.62       614
           2       0.62      0.60      0.61       620
           3       0.65      0.67      0.66       587
           4       0.81      0.82      0.81       585

    accuracy                           0.69      3011
   macro avg       0.69      0.69      0.69      3011
weighted avg       0.69      0.69      0.69      3011

Accuracy: 0.6884755895051478


# SVM Classifier with Linear
the worst results

In [17]:
# appling linear SVM Classifier
from sklearn import svm

svm_clf = svm.LinearSVC(multi_class='ovr', max_iter = 1000)
svm_clf.fit(X_train, y_train)

# predictions
predictions_svmClf = svm_clf.predict(X_test)
print(classification_report(predictions_svmClf, y_test))

# print accuracy score
acc_svmClf = metrics.accuracy_score(y_test, predictions_svmClf)
print(f"Accuracy: {acc_svmClf}")





              precision    recall  f1-score   support

           0       0.47      0.70      0.56       429
           1       0.37      0.46      0.41       473
           2       0.26      0.35      0.30       440
           3       0.67      0.35      0.46      1138
           4       0.51      0.57      0.54       531

    accuracy                           0.46      3011
   macro avg       0.46      0.49      0.45      3011
weighted avg       0.50      0.46      0.46      3011

Accuracy: 0.4559946861507805




# RBF Kernel 

In [19]:
# applying a RBF Kernel

from sklearn.svm import SVC
rbf_svc = SVC(kernel='rbf', random_state = 0, gamma=0.1, C = 1)
rbf_svc.fit(X_train, y_train)  

# predictions
predictions_rbf = rbf_svc.predict(X_test)
print(classification_report(predictions_rbf, y_test))

# print accuracy score
acc_rbf = metrics.accuracy_score(y_test, predictions_rbf)   
print(f"Accuracy: {acc_rbf}")



              precision    recall  f1-score   support

           0       0.70      0.79      0.74       564
           1       0.66      0.58      0.61       673
           2       0.60      0.63      0.62       567
           3       0.65      0.70      0.68       554
           4       0.83      0.75      0.79       653

    accuracy                           0.69      3011
   macro avg       0.69      0.69      0.69      3011
weighted avg       0.69      0.69      0.69      3011

Accuracy: 0.6878113583527068


# gaussian naive bayes

In [20]:
# applying Gaussian NB classifier
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# print classification report
predictions_gnb = gnb.predict(X_test)
print(classification_report(predictions_gnb, y_test))

# print accuracy score
acc_gnb = metrics.accuracy_score(y_test, predictions_gnb)
print(f"Accuracy: {acc_gnb}")


              precision    recall  f1-score   support

           0       0.87      0.31      0.46      1784
           1       0.14      0.36      0.20       224
           2       0.10      0.30      0.15       192
           3       0.03      0.42      0.05        38
           4       0.65      0.50      0.56       773

    accuracy                           0.36      3011
   macro avg       0.35      0.38      0.28      3011
weighted avg       0.70      0.36      0.44      3011

Accuracy: 0.36067751577548984


# XGB MODEL

In [None]:
# trying XGBM model
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

# label encoding the target variable
label_encoder = LabelEncoder().fit(y_train_resampled)  # Fit on y_train to learn all classes
y_train_encoded = label_encoder.transform(y_train_resampled)
y_test_encoded = label_encoder.transform(y_test)

# Dynamically set the number of classes
num_classes = len(label_encoder.classes_)



# Specify the parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Specify multi-class classification
    'num_class': num_classes,  # Dynamically set the number of classes
    'max_depth': 4,  # Depth of the trees
    'learning_rate': 0.1,  # Learning rate
    'n_estimators': 100,  # Number of trees
    'seed': 42  # Random seed for reproducibility
}

# Initialize the XGBoost classifier
clf = xgb.XGBClassifier(**params)

# Train the model with encoded targets
clf.fit(X_train_resampled, y_train_encoded)

# Make predictions
y_pred = clf.predict(X_test)

# getting unique classes for predicted labels
unique_y_test = np.unique(y_test_encoded)
unique_y_pred = np.unique(y_pred)

# get unique classes 
all_unique_classes = np.unique(np.concatenate((y_test_encoded, y_pred)))

# map the unique encoded classes back to original names
target_names_adjusted = label_encoder.inverse_transform(all_unique_classes)

# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_matrix(y_test_encoded, y_pred))
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred))



Accuracy: 61.91%
Confusion Matrix:
[[  0   0   0   0   2   0   0   0   0   0   0   0]
 [  0   0   0   0   2   2   0   0   0   0   0   0]
 [  0   0   0   0   0   2   0   0   0   0   0   0]
 [  0   1   1  25  15   4   1   0   0   0   0   0]
 [  0   1   0  50 507  99  24  11   4   5   0   0]
 [  1   1   2  25 148 470 111  34  25   4   1   0]
 [  0   0   0   5  30  95 513 141  58   5   2   0]
 [  0   0   0   1   4   8  63 237  79  10   0   0]
 [  0   0   0   1   4   4   4  29  79   8   0   0]
 [  0   0   0   0   0   0   0   5  12  32   0   0]
 [  0   0   0   0   0   0   0   0   2   0   1   0]
 [  0   0   0   0   0   0   0   0   1   0   0   0]]
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         2
           4       0.23      0.53      0.32        47
           5       0.71      0.72      0.72       701
    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Neural Network
using entity embeddings to train the neural network


## setting up the model
creating entity embeddings <br>
`cat_col1` = `PROVINCE` <br>
`cat_col2` = `CHICKEN SIZE` <br>
`cat_col3` = `SIZE` <br>

In [None]:
# original dataset `df`

# importing libraries
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Reshape, Concatenate, Dense, Flatten

# getting the unique values of the categorical columns
cat_columns = ['PROVINCE', 'CHICKEN SIZE', 'UNIT']
unique_cat1 = df['PROVINCE'].nunique()
unique_cat2 = df['CHICKEN SIZE'].nunique()
unique_cat3 = df['UNIT'].nunique()

# getting numerical columns
num_columns = df.columns.difference(cat_columns)
numerical_data = df[num_columns]

# scaling the numerical features
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(numerical_data)

# defining the input layer
num_input = Input(shape=(numerical_data_scaled.shape[1],), name='num_input')

# embedding the categorical columns
cat_input1 = Input(shape = (1,), name='cat_input1')
cat_embed1 = Embedding(input_dim=unique_cat1, output_dim = int(min(np.ceil(unique_cat1 /2), 50)))(cat_input1)
cat_embed1 = Flatten()(cat_embed1)

cat_input2 = Input(shape = (1,), name='cat_input2')
cat_embed2 = Embedding(input_dim=unique_cat2, output_dim = int(min(np.ceil(unique_cat2 /2), 50)))(cat_input2)
cat_embed2 = Flatten()(cat_embed2)


cat_input3 = Input(shape = (1,), name='cat_input3')
cat_embed3 = Embedding(input_dim=unique_cat3, output_dim = int(min(np.ceil(unique_cat3 /2), 50)))(cat_input3)
cat_embed3 = Flatten()(cat_embed3)


# concatenate the embeddings with numerical input
concatenated = Concatenate()([cat_embed1, cat_embed2, cat_embed3, num_input])


# adding the dense layer ontop of the embeddings
dense_output = Dense(128, activation = 'relu')(concatenated)
dense_output = Dense(64, activation = 'relu')(dense_output)
output = Dense(1)(dense_output)

# creating the model
model = Model(inputs = [cat_input1, cat_input2, cat_input3, num_input], outputs = output)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()


## training the model


In [None]:
# train model
X_train_cat1 = X_train['PROVINCE']
X_train_cat2 = X_train['CHICKEN SIZE']
X_train_cat3 = X_train['UNIT']
X_train_num = X_train.drop(['PROVINCE', 'CHICKEN SIZE', 'UNIT'], axis = 1)

# fitting the model


In [None]:
df

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np


# Define the neural network architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Single output node for regression
])

# Compile the model, specifying the optimizer, loss function, and metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Fit the model on the training data
history = model.fit(X_train, y_train, validation_split=0.1, epochs=100, verbose=1)

# Predict the target on the testing set
predictions = model.predict(X_test).flatten()

# Evaluate the model performance using RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"RMSE: {rmse}")

# You can plot the training history to check how the loss and metric evolved over epochs
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.legend()
plt.show()
