In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import calendar
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from category_encoders import TargetEncoder
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool, cv
import pickle

In [22]:
holiday = pd.read_parquet('data/holidays_events.parquet.gzip')
oil = pd.read_parquet('data/oil.parquet.gzip')
sample = pd.read_parquet('data/sample_submission.parquet.gzip')
store = pd.read_parquet('data/stores.parquet.gzip')
test = pd.read_parquet('data/test.parquet.gzip')
train = pd.read_parquet('data/train.parquet.gzip')
trans = pd.read_parquet('data/transactions.parquet.gzip')

In [23]:
df = pd.merge(train, holiday, on='date', how='left')
df = pd.merge(df, oil, on='date', how='left')
df = pd.merge(df, trans, on=['date','store_nbr'], how='left')
df = pd.merge(df, store, on='store_nbr', how='left')

In [24]:
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
# df['weekday'] = df['date'].dt.day_name()
# df['week'] = df['date'].dt.isocalendar().week
# df['quarter'] = df['date'].dt.quarter
df['date_num'] = df['date'].dt.day

In [27]:
df = df.rename(columns = {"type_x" : "type_holiday", "type_y" : "type_store"})

In [28]:
df.locale.value_counts()

National    261954
Local       208494
Regional     32076
Name: locale, dtype: int64

In [29]:
df.isnull().sum()

id                    0
date                  0
store_nbr             0
family                0
sales                 0
onpromotion           0
type_holiday    2551824
locale          2551824
locale_name     2551824
description     2551824
transferred     2551824
dcoilwtico       955152
transactions     249117
city                  0
state                 0
type_store            0
cluster               0
year                  0
month                 0
date_num              0
dtype: int64

In [30]:
# Calculate the percentage of null values in each column
null_counts_all_columns = df.isnull().sum()
total_rows = len(df)
null_percentages_all_columns = (null_counts_all_columns / total_rows) * 100
print(null_percentages_all_columns)

id               0.000000
date             0.000000
store_nbr        0.000000
family           0.000000
sales            0.000000
onpromotion      0.000000
type_holiday    83.547258
locale          83.547258
locale_name     83.547258
description     83.547258
transferred     83.547258
dcoilwtico      31.271879
transactions     8.156143
city             0.000000
state            0.000000
type_store       0.000000
cluster          0.000000
year             0.000000
month            0.000000
date_num         0.000000
dtype: float64


In [31]:
#  Dealing with null value columns

df.drop(['type_holiday','locale','locale_name','description','transferred'], axis=1,inplace=True)
df['dcoilwtico'].fillna(df['dcoilwtico'].mean(), inplace=True)
df['transactions'].fillna(df['transactions'].mean(), inplace=True)

In [32]:
df.shape

(3054348, 15)

In [33]:
df.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,dcoilwtico,transactions,city,state,type_store,cluster,year,month,date_num
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
1,1,2013-01-01,1,BABY CARE,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
2,2,2013-01-01,1,BEAUTY,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
3,3,2013-01-01,1,BEVERAGES,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
4,4,2013-01-01,1,BOOKS,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1


In [34]:
df.shape

(3054348, 15)

# Model building

In [36]:
df.drop('id', axis=1,inplace=True)

In [37]:
df.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,dcoilwtico,transactions,city,state,type_store,cluster,year,month,date_num
0,2013-01-01,1,AUTOMOTIVE,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
1,2013-01-01,1,BABY CARE,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
2,2013-01-01,1,BEAUTY,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
3,2013-01-01,1,BEVERAGES,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1
4,2013-01-01,1,BOOKS,0.0,0,68.015874,1697.071441,Quito,Pichincha,D,13,2013,1,1


In [38]:
X = df.drop('sales',axis=1)
y = df['sales']

In [39]:
X.drop('date', axis=1,inplace=True)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
num_cols = X_train.select_dtypes(include=['float', 'int']).columns.tolist()

In [42]:
scaler = MinMaxScaler()
scaler.fit(X_train[num_cols])

In [43]:
# Transform training and testing data
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

Encoding

In [44]:
encoder = TargetEncoder()

In [45]:
# Fit encoder on training data
encoder.fit(X_train,y_train)

In [46]:
# Apply encoding on training and testing data
X_train_encoded = encoder.transform(X_train)
X_test_encoded = encoder.transform(X_test)

In [47]:
X_train_encoded.head()

Unnamed: 0,store_nbr,family,onpromotion,dcoilwtico,transactions,city,state,type_store,cluster,year,month,date_num
580605,0.075472,2399.913397,0.0,0.791662,0.176083,216.086327,216.086327,352.605504,0.1875,0.0,0.909091,0.566667
457845,0.09434,2399.913397,0.0,0.962809,0.196553,560.288132,557.673722,352.605504,0.75,0.0,0.727273,0.3
2585113,0.773585,270.809323,0.011019,0.226223,0.105219,295.086803,295.086803,352.605504,0.0625,0.75,0.909091,0.933333
1839313,0.301887,270.809323,0.0,0.495391,0.191525,560.288132,557.673722,198.071872,0.6875,0.5,0.818182,0.566667
2829206,0.075472,270.809323,0.012397,0.495391,0.138856,216.086327,216.086327,352.605504,0.1875,1.0,0.272727,0.433333


In [48]:
X_train_encoded.dtypes

store_nbr       float64
family          float64
onpromotion     float64
dcoilwtico      float64
transactions    float64
city            float64
state           float64
type_store      float64
cluster         float64
year            float64
month           float64
date_num        float64
dtype: object

In [92]:
X_train.head()

Unnamed: 0,store_nbr,family,onpromotion,dcoilwtico,transactions,city,state,type_store,cluster,year,month,date_num
580605,0.075472,BEVERAGES,0.0,0.791662,0.176083,Santo Domingo,Santo Domingo de los Tsachilas,D,0.1875,0.0,0.909091,0.566667
457845,0.09434,BEVERAGES,0.0,0.962809,0.196553,Quito,Pichincha,D,0.75,0.0,0.727273,0.3
2585113,0.773585,PERSONAL CARE,0.011019,0.226223,0.105219,Cuenca,Azuay,D,0.0625,0.75,0.909091,0.933333
1839313,0.301887,PERSONAL CARE,0.0,0.495391,0.191525,Quito,Pichincha,C,0.6875,0.5,0.818182,0.566667
2829206,0.075472,PERSONAL CARE,0.012397,0.495391,0.138856,Santo Domingo,Santo Domingo de los Tsachilas,D,0.1875,1.0,0.272727,0.433333


In [91]:
# Define the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.05, max_depth=5)

In [93]:
# Train the model
xgb_model.fit(X_train_encoded, y_train)

In [94]:
# Make predictions on the testing set
y_pred = xgb_model.predict(X_test_encoded)

In [95]:
# Evaluate the model using mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 66393.85010823068


In [96]:
r2 = r2_score(y_test, y_pred)
print("R2 score: ", r2)

R2 score:  0.9444997807750841


In [97]:
# calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("RMSE:", rmse)

RMSE: 257.6700411538576


In [98]:
print('y_pred \t y_test')
for i in range(len(y_pred)):
    print(f'{y_pred[i]:.2f} \t\t {y_test.iloc[i]:.2f}')

y_pred 	 y_test
486.77 		 471.00
5.04 		 1.00
10.87 		 0.00
-12.64 		 0.00
6.31 		 0.00
3345.82 		 7866.91
77.70 		 107.00
-632.20 		 0.00
12.59 		 11.00
-5.52 		 0.00
1.42 		 0.00
0.56 		 0.00
29.57 		 9.00
80.90 		 46.00
2.83 		 28.00
6.33 		 0.00
313.91 		 253.80
801.35 		 1057.00
13.32 		 0.00
8.08 		 2.00
2170.55 		 2382.00
-19.48 		 0.00
219.66 		 255.00
131.25 		 0.00
5.71 		 9.00
31.51 		 10.00
241.33 		 371.44
4.93 		 6.00
6.05 		 5.00
6.99 		 0.00
6.10 		 0.00
361.14 		 458.00
244.90 		 237.00
-3.61 		 0.00
-2.50 		 0.00
156.12 		 84.00
6.40 		 7.00
155.86 		 102.47
9.09 		 4.00
465.92 		 386.00
16.33 		 16.00
-12.13 		 0.00
-7.88 		 0.00
-0.34 		 0.00
7.06 		 0.00
84.98 		 50.78
-0.48 		 1.00
67.50 		 40.03
1.49 		 16.00
15.79 		 18.00
-17.42 		 0.00
1016.38 		 679.00
57.45 		 68.00
148.53 		 147.32
27.96 		 8.00
66.70 		 57.67
32.89 		 0.00
20.98 		 23.00
9.61 		 1.00
-2.45 		 0.00
255.34 		 159.25
-0.50 		 0.00
0.53 		 0.00
-24.20 		 0.00
1045.28 		 1363.00
0.62 		 0.00
-4

10.98 		 0.00
1.11 		 0.00
57.81 		 54.00
-13.85 		 0.00
5.40 		 2.00
11.58 		 6.00
-7.98 		 0.00
-10.57 		 0.00
2621.10 		 2348.76
22.78 		 7.00
22.91 		 13.00
3.10 		 0.00
1038.59 		 1590.79
2988.66 		 3371.00
0.39 		 1.00
71.49 		 0.00
4.24 		 4.00
238.11 		 0.00
1365.73 		 1532.00
11.23 		 0.00
36.41 		 35.00
18.38 		 0.00
708.07 		 573.00
6.10 		 8.00
-3.77 		 1.00
305.97 		 190.00
426.07 		 519.08
3285.04 		 2510.00
54.90 		 96.00
221.45 		 329.70
-2.23 		 0.00
1810.63 		 1941.00
-4.05 		 0.00
468.76 		 514.35
469.54 		 1602.15
1214.96 		 1237.74
10.64 		 15.00
96.90 		 151.49
113.86 		 501.11
130.13 		 127.00
1580.75 		 1622.00
1667.86 		 1790.00
51.97 		 0.00
3332.28 		 3508.00
93.47 		 275.00
199.17 		 95.64
6755.08 		 6058.63
597.51 		 527.59
68.29 		 31.00
1.87 		 3.00
12.72 		 8.00
51.48 		 8.00
31.57 		 14.00
-2.68 		 0.00
3.89 		 11.00
6.76 		 0.00
-1.27 		 9.00
382.07 		 300.41
-4.77 		 0.00
41.75 		 88.00
195.77 		 115.00
388.98 		 420.00
7.27 		 1.00
-1.59 		 0.00
1971

3009.34 		 3892.00
345.28 		 146.84
10.71 		 9.00
17.99 		 1.00
578.67 		 199.16
273.10 		 383.52
-0.80 		 2.00
103.47 		 55.00
124.84 		 92.00
17.48 		 0.00
78.24 		 113.17
1.68 		 0.00
7.99 		 0.00
747.05 		 605.00
207.98 		 115.00
-7.18 		 7.00
27.32 		 31.53
19.47 		 0.00
79.21 		 27.00
-6.52 		 0.00
0.37 		 1.00
63.98 		 60.03
37.35 		 48.00
-4.06 		 2.00
104.61 		 83.00
-7.17 		 0.00
5.59 		 4.00
1492.57 		 1550.74
372.85 		 380.96
145.92 		 172.00
-1.82 		 0.00
55.00 		 33.69
8.49 		 4.00
1.97 		 11.00
515.87 		 718.00
1.18 		 22.00
134.52 		 308.16
291.64 		 306.00
-1.33 		 0.00
262.12 		 166.65
3.61 		 0.00
-6.58 		 0.00
1081.78 		 480.13
-1.39 		 0.00
17.57 		 11.00
18.96 		 16.00
461.58 		 449.82
1966.61 		 2004.00
692.98 		 784.00
16.25 		 4.00
5.53 		 0.00
7.96 		 0.00
-42.75 		 0.00
9.52 		 1.00
338.75 		 262.00
-38.19 		 0.00
12.54 		 11.00
-5.13 		 4.00
71.83 		 62.73
1945.20 		 2025.00
6.63 		 0.00
120.95 		 213.00
-6.96 		 0.00
185.69 		 216.00
623.72 		 615.00
11.87 

435.07 		 461.17
0.41 		 0.00
34.22 		 0.00
-2.82 		 0.00
3.25 		 0.00
84.11 		 113.00
78.26 		 83.00
3.72 		 1.00
49.41 		 10.00
11.03 		 36.00
18.22 		 0.00
15.55 		 10.00
345.91 		 343.00
6.53 		 28.00
-14.19 		 0.00
0.29 		 6.00
211.50 		 288.00
3.35 		 0.00
70.24 		 36.00
-2.43 		 0.00
3.95 		 5.00
649.70 		 579.00
18.36 		 0.00
-2.70 		 0.00
47.63 		 60.00
-24.09 		 4.00
300.84 		 231.32
-9.59 		 1.00
-6.50 		 0.00
-6.85 		 0.00
738.33 		 635.00
5.54 		 0.00
506.47 		 491.13
998.68 		 878.11
-2.89 		 0.00
-0.77 		 0.00
8322.96 		 9166.00
6.88 		 11.00
807.36 		 895.73
268.54 		 221.00
1491.14 		 1521.00
225.36 		 349.00
15.53 		 10.00
-6.89 		 0.00
34.28 		 24.51
331.41 		 466.00
52.57 		 63.00
125.69 		 0.00
-9.68 		 0.00
1.87 		 0.00
7.14 		 0.00
33.36 		 0.00
11.46 		 25.00
12.15 		 0.00
765.86 		 658.14
0.05 		 2.00
174.84 		 161.00
-1.04 		 0.00
318.02 		 266.78
-0.20 		 1.00
95.50 		 138.62
1077.56 		 756.81
110.96 		 31.00
1328.06 		 1332.00
10.19 		 37.00
1813.31 		 1214.

2297.91 		 2530.00
2.47 		 0.00
60.33 		 74.00
-0.86 		 0.00
1289.46 		 1206.00
5.30 		 0.00
225.58 		 120.67
62.25 		 0.00
73.39 		 84.00
7.18 		 4.00
55.64 		 2.00
45.97 		 0.00
12.61 		 0.00
22.89 		 0.00
269.19 		 50.30
1.63 		 4.00
6.37 		 0.00
797.19 		 722.00
18.83 		 36.00
120.16 		 0.00
244.93 		 381.00
-0.92 		 0.00
-0.06 		 3.00
49.08 		 0.00
99.94 		 0.00
482.56 		 642.63
413.96 		 591.78
6.22 		 8.00
-1.67 		 0.00
216.18 		 137.00
823.46 		 853.00
-8.97 		 0.00
161.64 		 87.72
805.48 		 733.33
258.00 		 280.63
-5.02 		 1.00
3.65 		 0.00
723.40 		 625.55
-5.11 		 0.00
544.30 		 463.98
107.75 		 38.93
6.08 		 17.00
903.01 		 1349.09
2700.56 		 2275.00
1009.61 		 1011.00
-389.29 		 0.00
195.98 		 209.00
308.32 		 261.00
12.76 		 0.00
261.50 		 321.00
374.60 		 341.29
40.06 		 29.00
-0.56 		 6.00
180.05 		 150.00
-10.88 		 0.00
416.97 		 92.00
39.43 		 0.00
0.64 		 0.00
236.55 		 85.72
6953.35 		 16105.00
265.54 		 262.83
0.18 		 0.00
4.34 		 0.00
8.40 		 9.00
100.85 		 62.54


8.44 		 0.00
4.91 		 0.00
-0.04 		 0.00
174.06 		 109.84
12.37 		 0.00
14.31 		 11.00
5.36 		 0.00
4.79 		 0.00
10.18 		 11.00
2.76 		 0.00
-18.82 		 0.00
15.92 		 1.00
89.73 		 75.00
18.95 		 4.00
3162.37 		 3091.00
4.62 		 8.00
103.19 		 70.00
9.46 		 0.00
42.63 		 15.00
16.22 		 10.00
4089.80 		 3974.64
3.89 		 6.00
13.80 		 0.00
394.03 		 321.00
-8.05 		 0.00
59.66 		 103.00
10.06 		 15.00
-2.65 		 10.00
-11.05 		 0.00
675.06 		 0.00
-2.30 		 0.00
13.95 		 10.00
105.39 		 100.30
16.15 		 75.00
271.38 		 249.59
365.90 		 180.50
5.98 		 8.00
41.33 		 89.00
-7.55 		 0.00
-5.43 		 0.00
298.91 		 224.87
5.14 		 0.00
42.58 		 73.03
3.63 		 0.00
23.95 		 15.24
-5.08 		 10.00
5.20 		 17.00
1007.73 		 897.00
29.00 		 37.00
1714.74 		 993.59
2.06 		 0.00
25.65 		 0.00
45.88 		 76.00
34.26 		 34.00
1246.28 		 1084.00
5.56 		 11.00
43.24 		 29.84
8.22 		 4.00
44.30 		 47.00
6108.94 		 6197.00
2181.76 		 1854.00
930.75 		 1088.00
-3.50 		 0.00
18.10 		 9.00
197.47 		 194.00
-5.59 		 0.00
-0.61 

1046.40 		 1037.21
310.36 		 449.00
115.96 		 86.50
91.49 		 55.00
1.93 		 0.00
3723.40 		 3934.00
575.81 		 439.00
57.92 		 0.00
64.78 		 33.00
439.81 		 578.00
3.52 		 0.00
249.59 		 250.00
1425.96 		 1547.00
257.44 		 146.00
19.32 		 0.00
-0.11 		 0.00
0.33 		 0.00
0.63 		 0.00
-10.66 		 0.00
10.22 		 31.00
-19.66 		 0.00
0.51 		 1.00
404.80 		 288.00
-5.58 		 3.00
5.88 		 0.00
206.01 		 162.53
39.43 		 20.46
15.34 		 6.00
246.54 		 173.98
-3.22 		 0.00
95.27 		 68.00
1.18 		 19.00
-0.29 		 0.00
269.86 		 199.01
5633.79 		 4697.31
58.27 		 58.00
14.76 		 2.00
154.28 		 30.00
8.48 		 9.00
0.23 		 1.00
1.35 		 0.00
12.72 		 0.00
195.81 		 167.83
14.81 		 0.00
34.63 		 21.00
1597.95 		 1548.00
4798.48 		 4638.99
3.98 		 11.00
584.95 		 534.04
-12.16 		 1.00
148.30 		 0.00
-4.18 		 0.00
10.05 		 5.00
-40.49 		 0.00
3751.21 		 2377.00
20.75 		 19.00
8404.27 		 8016.70
11.70 		 9.00
6.58 		 36.00
677.05 		 451.94
402.71 		 593.00
10.66 		 10.00
278.40 		 322.00
-1.13 		 0.00
13.11 		 0.00

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



163.54 		 256.00
23.12 		 0.00
19.74 		 13.00
-1.34 		 2.00
340.99 		 340.00
231.99 		 131.00
4.45 		 0.00
236.19 		 211.00
348.07 		 328.00
-4.05 		 0.00
-2.53 		 2.00
111.53 		 69.00
244.54 		 218.00
1135.10 		 1357.00
4.66 		 0.00
24.91 		 8.06
1577.46 		 1494.00
1.67 		 0.00
20.49 		 0.00
27.62 		 6.00
11.31 		 10.00
1.80 		 0.00
2913.73 		 2866.00
-6.97 		 4.00
172.70 		 172.00
122.65 		 0.00
3.17 		 3.00
-7.82 		 0.00
-1.34 		 5.00
1709.59 		 1717.00
12.33 		 4.00
45.40 		 69.46
279.69 		 279.93
335.91 		 159.29
4.62 		 0.00
10.32 		 0.00
-2.49 		 0.00
476.16 		 435.93
-7.04 		 0.00
38.54 		 0.00
119.50 		 60.00
23.99 		 19.00
2.97 		 0.00
1.36 		 0.00
595.65 		 422.33
-1.64 		 0.00
1.62 		 0.00
9.30 		 14.00
58.31 		 123.00
1.14 		 0.00
60.79 		 23.00
40.04 		 24.00
946.16 		 813.00
-18.63 		 30.00
5.35 		 0.00
6.78 		 8.00
445.60 		 337.00
14.62 		 0.00
414.98 		 273.00
268.71 		 137.46
780.72 		 0.00
1.55 		 4.00
1.07 		 7.00
209.63 		 361.00
-11.88 		 0.00
25.13 		 0.00
208.9

237.34 		 184.00
-4.94 		 0.00
900.72 		 694.10
168.71 		 179.00
-3.69 		 0.00
1.82 		 0.00
883.24 		 1312.00
82.57 		 84.10
1087.90 		 1106.00
855.86 		 800.00
2.90 		 4.00
179.58 		 140.00
390.89 		 643.47
3818.21 		 23.00
18.34 		 13.00
12.35 		 8.94
34.84 		 23.00
1.13 		 0.00
25.09 		 38.00
-14.08 		 0.00
1.85 		 9.00
3.75 		 17.00
22.18 		 45.00
2266.36 		 1739.00
-0.25 		 4.00
1.83 		 2.00
39.01 		 7.00
17.00 		 1.00
-6.25 		 1.00
36.63 		 45.00
3.96 		 0.00
-9.92 		 0.00
-3.30 		 0.00
559.91 		 580.28
14.74 		 33.00
130.50 		 144.09
11.69 		 5.00
9.99 		 14.00
4390.64 		 4422.00
-6.11 		 0.00
152.47 		 121.00
3944.40 		 4235.00
121.76 		 122.00
100.14 		 133.00
-8.52 		 0.00
541.57 		 541.00
1.84 		 0.00
12.17 		 1.00
7.94 		 0.00
31.81 		 3.00
27.53 		 0.00
7.99 		 7.00
265.34 		 156.97
-0.47 		 9.00
3.66 		 1.00
18.42 		 0.00
3.22 		 2.00
3634.39 		 3630.00
40.47 		 5.00
493.22 		 1037.96
45.81 		 40.00
467.86 		 415.00
3715.17 		 3289.00
-6.64 		 0.00
1.58 		 4.00
1469.28 		

358.76 		 366.00
57.16 		 280.00
184.53 		 134.00
7.01 		 7.00
-1.93 		 0.00
263.51 		 368.00
1.65 		 2.00
673.15 		 586.00
2772.40 		 3008.00
36.48 		 245.00
-2.80 		 0.00
3.37 		 1.00
16.07 		 14.00
452.63 		 480.74
2813.45 		 2631.00
7231.84 		 8172.00
7.00 		 0.00
107.04 		 0.00
241.82 		 0.00
-6.79 		 0.00
2.63 		 0.00
5.06 		 0.00
453.85 		 442.25
7.43 		 5.00
-15.72 		 13.00
1.75 		 0.00
2840.78 		 2758.00
-3.47 		 0.00
49.10 		 52.00
40.81 		 7.00
83.11 		 37.00
4.69 		 6.00
-8.58 		 0.00
154.51 		 152.00
18.03 		 20.00
2167.62 		 2186.00
12.08 		 0.00
2.89 		 2.00
121.01 		 176.00
-6.46 		 0.00
181.76 		 109.00
-3.95 		 1.00
34.94 		 25.00
139.46 		 99.00
-5.23 		 0.00
-6.45 		 1.00
-31.98 		 0.00
76.77 		 58.17
5.47 		 0.00
14.28 		 19.00
312.06 		 220.53
66.28 		 78.00
-7.14 		 0.00
6.59 		 3.00
83.62 		 32.00
6894.17 		 8379.00
-3.36 		 0.00
24.45 		 6.00
8.58 		 1.00
-15.46 		 0.00
98.95 		 82.52
101.98 		 58.88
3.51 		 12.00
3.50 		 3.00
-0.04 		 3.00
74.12 		 24.00
69.02

-5.19 		 0.00
3706.37 		 4234.00
14.54 		 2.00
-6.20 		 0.00
26.28 		 37.00
3080.85 		 2478.83
-6.92 		 0.00
-664.43 		 0.00
26.21 		 3.00
27.47 		 0.00
191.22 		 161.40
233.30 		 263.00
96.02 		 63.00
-3.77 		 1.00
10.23 		 13.00
43.61 		 57.00
1.71 		 4.00
18.78 		 25.00
19.93 		 7.00
-2.60 		 0.00
76.34 		 84.00
12.22 		 2.00
10.35 		 0.00
3052.36 		 3186.00
0.71 		 0.00
29.09 		 25.00
-3.23 		 1.00
1994.36 		 1864.00
53.95 		 44.00
184.42 		 221.58
-7.02 		 0.00
-4.13 		 0.00
-5.59 		 0.00
-0.67 		 1.00
701.09 		 334.54
5.26 		 0.00
239.36 		 119.06
7.11 		 0.00
1349.32 		 1169.12
4.97 		 6.00
13.28 		 8.00
-9.65 		 0.00
5.10 		 9.00
-27.21 		 0.00
18.37 		 35.00
1194.76 		 1336.26
-247.32 		 0.00
478.87 		 732.00
100.12 		 19.00
126.50 		 84.00
172.66 		 104.63
0.05 		 0.00
1.89 		 0.00
467.21 		 530.48
10.43 		 0.00
43.02 		 41.97
168.94 		 181.00
-294.08 		 0.00
1.96 		 8.00
154.26 		 133.00
-5.79 		 0.00
1013.57 		 1062.00
2.26 		 13.00
-0.70 		 0.00
-7.07 		 0.00
3.87 		 0.00


9.54 		 11.63
7.68 		 3.00
-10.02 		 0.00
116.53 		 138.00
-3.12 		 0.00
-6.93 		 0.00
7.95 		 2.00
5.20 		 0.00
850.14 		 900.00
11.81 		 6.00
139.86 		 93.09
2.21 		 0.00
-2.81 		 5.00
5.68 		 6.00
2.99 		 0.00
125.64 		 90.00
29.72 		 20.44
241.83 		 317.00
183.93 		 88.00
157.94 		 48.00
5.71 		 0.00
5247.60 		 4977.00
3139.36 		 2779.00
1476.67 		 1454.00
-0.82 		 7.00
84.38 		 99.00
246.99 		 480.00
68.57 		 28.00
-2.35 		 0.00
230.94 		 0.00
1.54 		 0.00
24.67 		 6.00
-1.67 		 1.00
7790.70 		 8235.00
14.61 		 0.00
6073.63 		 6204.00
11.70 		 0.00
2.45 		 9.00
-0.40 		 0.00
1645.95 		 1603.00
1.86 		 0.00
1001.09 		 1274.00
-6.56 		 0.00
18.10 		 12.00
24.02 		 41.00
17.98 		 0.00
147.51 		 138.00
2.40 		 0.00
3.68 		 0.00
110.49 		 111.28
-0.33 		 0.00
32.04 		 21.00
-0.01 		 0.00
16.06 		 0.00
-11.16 		 0.00
287.25 		 297.25
1124.10 		 1115.72
10.55 		 18.00
196.76 		 199.00
330.40 		 228.00
-6.73 		 0.00
206.89 		 0.00
140.51 		 102.00
2298.51 		 1911.41
3.70 		 11.00
4.29 		 

-21.59 		 0.00
-1.96 		 1.00
625.63 		 493.00
502.68 		 442.62
-0.06 		 0.00
5.80 		 0.00
12.37 		 0.00
-2.75 		 0.00
60.67 		 49.00
2.33 		 0.00
40.60 		 27.00
10.20 		 6.00
135.89 		 269.00
0.32 		 0.00
707.48 		 541.69
209.65 		 82.48
2.12 		 0.00
139.01 		 218.00
123.46 		 127.00
243.01 		 354.42
12.45 		 5.00
4.90 		 0.00
375.18 		 448.00
153.65 		 118.00
3.19 		 0.00
-4.90 		 3.00
19.19 		 0.00
5.56 		 3.00
143.01 		 152.97
9.08 		 8.00
423.63 		 516.25
106.96 		 105.00
1335.29 		 1323.80
-2.04 		 0.00
-13.33 		 0.00
4.62 		 23.00
-0.45 		 0.00
3.03 		 0.00
918.33 		 1022.00
1602.33 		 2266.00
-5.61 		 16.00
-1.68 		 0.00
226.92 		 53.00
2.64 		 0.00
56.96 		 53.00
10.98 		 0.00
495.37 		 512.00
127.85 		 109.75
-3.16 		 0.00
104.58 		 68.00
56.65 		 55.76
123.80 		 50.00
-0.61 		 0.00
-9.34 		 3.00
-5.75 		 0.00
13.81 		 0.00
255.63 		 241.58
37.98 		 10.00
6.51 		 0.00
8.43 		 8.00
7.60 		 3.00
564.86 		 452.00
15.41 		 4.00
2.62 		 3.00
3.76 		 9.00
-0.79 		 0.00
228.76 		 196

16.06 		 0.00
7.55 		 7.00
224.79 		 356.49
-3.02 		 0.00
14.33 		 61.00
104.29 		 126.00
420.50 		 393.41
2121.80 		 1917.00
25.79 		 6.42
441.00 		 348.00
541.32 		 651.00
11.20 		 8.00
827.18 		 794.27
8.87 		 0.00
920.29 		 590.25
5.97 		 6.00
-12.13 		 0.00
114.52 		 76.00
53.55 		 76.00
454.73 		 605.00
83.69 		 31.00
17.92 		 0.00
36.41 		 24.00
-0.89 		 0.00
11.77 		 5.00
-47.05 		 0.00
4685.90 		 3660.00
-0.82 		 1.00
2719.07 		 1932.00
15.69 		 22.00
10.75 		 13.79
214.21 		 256.00
5.36 		 0.00
-3.01 		 6.00
10.40 		 50.00
87.85 		 96.00
5.57 		 1.00
158.00 		 197.00
-2.86 		 0.00
22.97 		 6.00
238.38 		 249.57
3.56 		 0.00
-6.05 		 0.00
98.94 		 0.00
5.80 		 0.00
4.89 		 0.00
1251.13 		 1384.00
3010.75 		 2684.00
9.04 		 3.00
-669.65 		 0.00
-31.50 		 0.00
8.62 		 20.00
36.07 		 28.00
9.54 		 2.00
188.40 		 211.00
15.05 		 10.00
221.64 		 0.00
5.86 		 5.00
7.63 		 14.00
307.56 		 390.31
5.30 		 4.00
57.05 		 45.00
-28.51 		 0.00
149.98 		 116.67
21.27 		 24.00
7.42 		 7.00
3

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [104]:
# Saving encoder as pickle file
with open('tencoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)

In [105]:
# Saving model as pickle file
with open('xgbmodel.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

In [111]:
# calling encoder and model for prediction
def make_prediction(new_data):
    # Load the saved model from the pickle file
    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)

    # Load the encoding schemes from the pickle file
    with open('tencoder.pkl', 'rb') as f:
        encoder = pickle.load(f)

    # Encode the new data using the same encoding scheme used during training
    new_data_encoded = encoder.transform(new_data)

    # Make predictions using the trained model
    predictions = xgb_model.predict(new_data_encoded)

    # Return the predicted value
    return predictions[0]

In [120]:
# Function to take user input and make a prediction
def predict_sale():
    store_nbr = int(input("Enter store number: "))
    family = input("Enter family: ")
    onpromotion = int(input("Enter on promotion (0 or 1): "))
    dcoilwtico = float(input("Enter oil price: "))
    transactions = int(input("Enter number of transactions: "))
    city = input("Enter city: ")
    state = input("Enter state: ")
    type_store = input("Enter type of store: ")
    cluster = int(input("Enter cluster number: "))
    year = int(input("Enter year: "))
    month = int(input("Enter month: "))
    date_num = int(input("Enter date: "))

    # Create a new DataFrame from the user input
    new_data = pd.DataFrame({
        "store_nbr": [store_nbr],
        "family": [family],
        "onpromotion": [onpromotion],
        "dcoilwtico": [dcoilwtico],
        "transactions": [transactions],
        "city": [city],
        "state": [state],
        "type_store": [type_store],
        "cluster": [cluster],
        "year": [year],
        "month": [month],
        'date_num':[date_num]
    })

    # Make a prediction using the trained model
    prediction = make_prediction(new_data)
    print('Predicted Sale: $', prediction)

In [123]:
predicted_sale = predict_sale()

Enter store number: 4
Enter family: AUTOMOTIVE
Enter on promotion (0 or 1): 1
Enter oil price: 15
Enter number of transactions: 550
Enter city: Quinto
Enter state: Pichincha
Enter type of store: D
Enter cluster number: 12
Enter year: 2020
Enter month: 12
Enter date: 25
Predicted Sale: $ 1764.6661
