In [1]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# reading file
df = pd.read_csv('/workspaces/forecasting/data/data.csv')
df

Unnamed: 0,DAY_DATE,UNIT,TAILS,KG,SALES PER KG,TOTAL SALES,ABW,CHICKEN SIZE,PROVINCE
0,2017-01-01,LOMBOK,6237,11356.5,22000.00000,249843000,1.820827,MEDIUM,NUSA TENGGARA BARAT
1,2017-01-01,GARUT,2625,5446.8,17000.00000,92595600,2.074971,LARGE,JAWA BARAT
2,2017-01-01,SUMEDANG,7788,14176.8,16106.05355,228332300,1.820339,MEDIUM,JAWA BARAT
3,2017-01-01,CIREBON,7504,15958.0,17000.00000,271286000,2.126599,LARGE,JAWA BARAT
4,2017-01-01,PEKALONGAN,9121,16250.0,16500.00000,268125000,1.781603,MEDIUM,JAWA TENGAH
...,...,...,...,...,...,...,...,...,...
382897,2024-01-27,SRAGEN,510,618.4,16500.00000,10203600,1.210000,SMALL,JAWA TENGAH
382898,2024-01-28,SLEMAN,48,63.2,12000.00000,758400,1.320000,SMALL,DIY YOGYAKARTA
382899,2024-01-28,SRAGEN,315,415.6,16800.00000,6982080,1.320000,SMALL,JAWA TENGAH
382900,2024-01-28,BOYOLALI,785,974.8,16000.00000,15596800,1.240000,SMALL,JAWA TENGAH


In [3]:
# changing the value name such that it matches the values in the supply and demand dataset
# finding for any nan values
df.isna().sum()
# pring out the rows with nan values
df[df.isna().any(axis=1)]

# change value name of the province column
df['PROVINCE'] = df['PROVINCE'].replace('DIY YOGYAKARTA', 'DI YOGYAKARTA')
# print out the unique values of the province column
df['PROVINCE'].unique()

# removing outlier
df = df[df['SALES PER KG'] <= 35000] # removing values that are more than 35000
df = df[df['SALES PER KG'] > 0] # taking values that are more than 0

# removing 'total sales' column
df = df.drop('TOTAL SALES', axis = 1)
df.describe()


Unnamed: 0,TAILS,KG,SALES PER KG,ABW
count,382895.0,382895.0,382895.0,382895.0
mean,6216.17515,11514.746673,18014.29458,1.864465
std,6125.450527,11925.749132,3273.056777,0.396896
min,4.0,6.5,7.461648,0.23
25%,1872.0,2999.1,16000.0,1.589617
50%,4480.0,8057.8,17900.0,1.833759
75%,8672.0,15943.05,20003.15958,2.134752
max,141479.0,286486.8,34417.44841,3.9757


# pre processing
- extracting date column
- splitting, scaling
- creating function to evaluate the model [call `results(y_test, predictions)`]


In [4]:
# extracting year and month from date column for seasonality trends
df['DAY_DATE'] = pd.to_datetime(df['DAY_DATE'], errors='coerce').dt.normalize()
df['YEAR'] = df['DAY_DATE'].dt.year
df['MONTH'] = df['DAY_DATE'].dt.month
df['DAY'] = df['DAY_DATE'].dt.day
df = df.drop('DAY_DATE', axis = 1)


## demand and supply
importing the demand and supply dataset


In [5]:
# filtering for rows for year 2019 to 2023
df = df[(df['YEAR'] >= 2019) & (df['YEAR'] <= 2023)]

In [6]:
df

Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY
109756,SENGKANG,1770,3881.5,23686.3326,2.192938,LARGE,SULAWESI SELATAN,2019,1,1
109757,BANJARMASIN,11290,19484.6,25500.0000,1.725828,MEDIUM,KALIMANTAN SELATAN,2019,1,1
109758,MAROS,350,468.7,25000.0000,1.339143,SMALL,SULAWESI SELATAN,2019,1,1
109759,MAMUJU,930,1347.0,26500.0000,1.448387,SMALL,SULAWESI BARAT,2019,1,1
109760,MAKASAR,285,776.3,25000.0000,2.723860,LARGE,SULAWESI SELATAN,2019,1,1
...,...,...,...,...,...,...,...,...,...,...
382498,BOYOLALI,430,901.4,15800.0000,2.100000,BIG,JAWA TENGAH,2023,12,31
382499,BOYOLALI,294,605.0,15800.0000,2.060000,BIG,JAWA TENGAH,2023,12,31
382500,BOYOLALI,432,962.4,15800.0000,2.230000,BIG,JAWA TENGAH,2023,12,31
382501,GUNUNGKIDUL,200,370.8,16200.0000,1.850000,MEDIUM,DI YOGYAKARTA,2023,12,31


In [9]:
filtered_df = df[(df['YEAR'] == 2019) & (df['MONTH'] < 12)]
filtered_df

df = df.drop(filtered_df.index)
df

Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY
165114,PAYAKUMBUH,1267,2856.0,18000.00000,2.254144,LARGE,SUMATERA BARAT,2019,12,1
165115,SINJAI,3310,7799.5,22637.89730,2.356344,LARGE,SULAWESI SELATAN,2019,12,1
165116,BANDAR JAYA,8236,22197.0,15905.01419,2.695119,LARGE,LAMPUNG,2019,12,1
165117,BANDAR LAMPUNG,2565,4971.0,18060.47073,1.938012,MEDIUM,LAMPUNG,2019,12,1
165118,BANYUASIN,696,1464.4,16368.88828,2.104023,LARGE,SUMATERA SELATAN,2019,12,1
...,...,...,...,...,...,...,...,...,...,...
382498,BOYOLALI,430,901.4,15800.00000,2.100000,BIG,JAWA TENGAH,2023,12,31
382499,BOYOLALI,294,605.0,15800.00000,2.060000,BIG,JAWA TENGAH,2023,12,31
382500,BOYOLALI,432,962.4,15800.00000,2.230000,BIG,JAWA TENGAH,2023,12,31
382501,GUNUNGKIDUL,200,370.8,16200.00000,1.850000,MEDIUM,DI YOGYAKARTA,2023,12,31


In [10]:
df.describe()

Unnamed: 0,TAILS,KG,SALES PER KG,ABW,YEAR,MONTH,DAY
count,217386.0,217386.0,217386.0,217386.0,217386.0,217386.0,217386.0
mean,6354.093796,11910.584185,18292.703849,1.907814,2021.2299,6.033958,15.593778
std,6904.340224,13567.563524,3401.00787,0.420927,1.034077,3.402449,8.732707
min,5.0,6.5,19.090447,0.23,2019.0,1.0,1.0
25%,1428.0,2850.0,16131.276495,1.603477,2020.0,3.0,8.0
50%,4110.0,7259.45,18259.870585,1.897434,2021.0,6.0,16.0
75%,9115.0,16797.5,20500.0,2.21,2022.0,9.0,23.0
max,141479.0,286486.8,32000.0,3.92,2023.0,12.0,31.0


In [11]:


# importing demand and supply dataset
df_demand_supply = pd.read_csv('/workspaces/forecasting/data/provinceDD&SS.csv')
df_demand_supply

# splitting the month_year column into separate month and year columns
df_demand_supply[['MONTH', 'YEAR']] = df_demand_supply['Month_Year'].str.split(' ', expand = True)

# mapping month to numerical values
month_num = {
    'Jan' : 1, 'Feb' : 2, 'Mar' : 3, 'Apr' : 4, 'May' : 5, 'Jun' : 6, 'Jul' : 7, 
    'Aug' : 8, 'Sep' : 9, 'Oct' : 10, 'Nov' : 11, 'Dec' : 12
}

# replacing month with numerical values
df_demand_supply['MONTH'] = df_demand_supply['MONTH'].replace(month_num)
df_demand_supply['YEAR'] = df_demand_supply['YEAR'].astype(int)

# dropping original month_year column
df_demand_supply.drop('Month_Year', axis = 1, inplace = True)
df_demand_supply

  df_demand_supply['MONTH'] = df_demand_supply['MONTH'].replace(month_num)


Unnamed: 0,PROVINCE,SupplyProvince,DemandProvince,MONTH,YEAR
0,ACEH,2687246,3930723,12,2019
1,BALI,5120529,5141045,12,2019
2,BANTEN,16349469,13433846,12,2019
3,BENGKULU,554903,1339242,12,2019
4,DI YOGYAKARTA,4206148,6313628,12,2019
...,...,...,...,...,...
1723,SULAWESI TENGGARA,104238,705272,12,2023
1724,SULAWESI UTARA,396941,1307670,12,2023
1725,SUMATERA BARAT,3689089,4922821,12,2023
1726,SUMATERA SELATAN,5467836,5767260,12,2023


In [12]:
# merging the demand and supply dataset with the main dataset
df = pd.merge(df, df_demand_supply, how = 'inner', on = ['YEAR', 'MONTH', 'PROVINCE'])
df

Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince
0,PAYAKUMBUH,1267,2856.0,18000.00000,2.254144,LARGE,SUMATERA BARAT,2019,12,1,3885216,5993194
1,SINJAI,3310,7799.5,22637.89730,2.356344,LARGE,SULAWESI SELATAN,2019,12,1,5447863,5931514
2,BANDAR JAYA,8236,22197.0,15905.01419,2.695119,LARGE,LAMPUNG,2019,12,1,5404511,5618463
3,BANDAR LAMPUNG,2565,4971.0,18060.47073,1.938012,MEDIUM,LAMPUNG,2019,12,1,5404511,5618463
4,BANYUASIN,696,1464.4,16368.88828,2.104023,LARGE,SUMATERA SELATAN,2019,12,1,5758528,7021239
...,...,...,...,...,...,...,...,...,...,...,...,...
214665,BOYOLALI,430,901.4,15800.00000,2.100000,BIG,JAWA TENGAH,2023,12,31,54958385,26285612
214666,BOYOLALI,294,605.0,15800.00000,2.060000,BIG,JAWA TENGAH,2023,12,31,54958385,26285612
214667,BOYOLALI,432,962.4,15800.00000,2.230000,BIG,JAWA TENGAH,2023,12,31,54958385,26285612
214668,GUNUNGKIDUL,200,370.8,16200.00000,1.850000,MEDIUM,DI YOGYAKARTA,2023,12,31,3993820,5186026


In [13]:
# changing the values of the year column to be only the last 2 digits.
df['YEAR'] = df['YEAR'] % 100
df


Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince
0,PAYAKUMBUH,1267,2856.0,18000.00000,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194
1,SINJAI,3310,7799.5,22637.89730,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514
2,BANDAR JAYA,8236,22197.0,15905.01419,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463
3,BANDAR LAMPUNG,2565,4971.0,18060.47073,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463
4,BANYUASIN,696,1464.4,16368.88828,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239
...,...,...,...,...,...,...,...,...,...,...,...,...
214665,BOYOLALI,430,901.4,15800.00000,2.100000,BIG,JAWA TENGAH,23,12,31,54958385,26285612
214666,BOYOLALI,294,605.0,15800.00000,2.060000,BIG,JAWA TENGAH,23,12,31,54958385,26285612
214667,BOYOLALI,432,962.4,15800.00000,2.230000,BIG,JAWA TENGAH,23,12,31,54958385,26285612
214668,GUNUNGKIDUL,200,370.8,16200.00000,1.850000,MEDIUM,DI YOGYAKARTA,23,12,31,3993820,5186026


In [14]:
df.describe()

Unnamed: 0,TAILS,KG,SALES PER KG,ABW,YEAR,MONTH,DAY,SupplyProvince,DemandProvince
count,214670.0,214670.0,214670.0,214670.0,214670.0,214670.0,214670.0,214670.0,214670.0
mean,6421.047822,12036.998042,18299.877575,1.906868,21.220157,5.958476,15.578935,32929820.0,19566180.0
std,6921.436457,13605.508503,3416.740043,0.419967,1.036941,3.356649,8.729411,27589520.0,15142360.0
min,5.0,6.5,19.090447,0.23,19.0,1.0,1.0,83222.0,308772.0
25%,1470.0,2850.0,16128.206187,1.603159,20.0,3.0,8.0,5195385.0,5149320.0
50%,4231.0,7553.175,18260.16854,1.894998,21.0,6.0,16.0,34127610.0,20480620.0
75%,9200.0,16950.3,20512.682845,2.209885,22.0,9.0,23.0,59652950.0,30178300.0
max,141479.0,286486.8,32000.0,3.92,23.0,12.0,31.0,88407350.0,64062720.0


# adding holiday dataset

In [15]:
# reading holiiday dataset
holiday = pd.read_csv('/workspaces/forecasting/data/holiday.csv')
holiday.head()

Unnamed: 0.1,Unnamed: 0,Date,Holiday
0,,01/12/19,True
1,,02/12/19,False
2,,03/12/19,False
3,,04/12/19,False
4,,05/12/19,False


In [16]:
# dropping unnecessary columns
holiday = holiday.drop('Unnamed: 0', axis=1)

# changing the date column to datetime
holiday['Date'] = pd.to_datetime(holiday['Date'], format='%d/%m/%y')

# splitting the date column into day, month and year
holiday['DAY'] = holiday['Date'].dt.day
holiday['MONTH'] = holiday['Date'].dt.month
holiday['YEAR'] = holiday['Date'].dt.year
holiday['YEAR'] = holiday['YEAR'] % 2000

# changing the holiday column to object
holiday['Holiday'] = holiday['Holiday'].astype(str)

# changing values of holiday to numerical
holiday['Holiday'] = holiday['Holiday'].map({'True': 1, 'False': 0})

# drop date column
holiday = holiday.drop('Date', axis=1)
holiday.head()

Unnamed: 0,Holiday,DAY,MONTH,YEAR
0,1,1,12,19
1,0,2,12,19
2,0,3,12,19
3,0,4,12,19
4,0,5,12,19


In [17]:
# merging the holiday dataset with the main dataset
df = pd.merge(df, holiday, left_on=['DAY', 'MONTH', 'YEAR'], right_on=['DAY', 'MONTH', 'YEAR'], how='left', suffixes=('', '_holiday'))
df

Unnamed: 0,UNIT,TAILS,KG,SALES PER KG,ABW,CHICKEN SIZE,PROVINCE,YEAR,MONTH,DAY,SupplyProvince,DemandProvince,Holiday
0,PAYAKUMBUH,1267,2856.0,18000.00000,2.254144,LARGE,SUMATERA BARAT,19,12,1,3885216,5993194,1
1,SINJAI,3310,7799.5,22637.89730,2.356344,LARGE,SULAWESI SELATAN,19,12,1,5447863,5931514,1
2,BANDAR JAYA,8236,22197.0,15905.01419,2.695119,LARGE,LAMPUNG,19,12,1,5404511,5618463,1
3,BANDAR LAMPUNG,2565,4971.0,18060.47073,1.938012,MEDIUM,LAMPUNG,19,12,1,5404511,5618463,1
4,BANYUASIN,696,1464.4,16368.88828,2.104023,LARGE,SUMATERA SELATAN,19,12,1,5758528,7021239,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
214665,BOYOLALI,430,901.4,15800.00000,2.100000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,1
214666,BOYOLALI,294,605.0,15800.00000,2.060000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,1
214667,BOYOLALI,432,962.4,15800.00000,2.230000,BIG,JAWA TENGAH,23,12,31,54958385,26285612,1
214668,GUNUNGKIDUL,200,370.8,16200.00000,1.850000,MEDIUM,DI YOGYAKARTA,23,12,31,3993820,5186026,1


In [18]:
# exporting the dataset 
df.to_csv('/workspaces/forecasting/data/bigDataHoliday.csv', index = False)