In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [2]:
# Set the working directory to the project root. Later cells use paths relative to it
# (e.g. 'models/predictive/final_xgboost_model.joblib') and import from the `src` package.
# The root can be overridden with the SALESMASTER_ROOT environment variable so the notebook
# runs on other machines; when the variable is unset the original path is used unchanged.
import os
os.chdir(os.environ.get('SALESMASTER_ROOT', '/Users/vishalraj/GitHub/SalesMaster-ML'))

In [3]:
# Project-local helpers from the src.data.make_dataset module
from src.data.make_dataset import load_data, display_head

# Load the five datasets used throughout the notebook
sales_train, sales_test, calendar, sell_prices, calendar_events = load_data()

# Collect the head of each dataset, passing them in the same order they were loaded
(
    train_head,
    test_head,
    calendar_head,
    sell_prices_head,
    calendar_events_head,
) = display_head(
    sales_train,
    sales_test,
    calendar,
    sell_prices,
    calendar_events,
)

In [4]:
# Selecting a fraction of the dataset to run the model quickly for debugging purposes
#sales_train = sales_train.sample(frac=0.10, random_state=1)

In [5]:
# Display the raw training frame. Per the output below it is in wide layout:
# one row per item/store id and one `d_N` column per day.
sales_train

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1532,d_1533,d_1534,d_1535,d_1536,d_1537,d_1538,d_1539,d_1540,d_1541
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,1,0,1,0,1,0,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,8,2,0,8,2,3,1,1,3,8
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,0,1,3,2,1,1,2,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,1,2,0,0,0,1,0,0,0,2
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,12,0,0,0
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,1,0,3,2,1,1,0,1,3,0
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
from src.data.data_preprocessor import DataPreparation
# Build the data-preparation helper from the four raw tables.
# NOTE(review): this cell prints nothing; the "After merging ..." messages appear
# when prepare_data() is called in the following cell, so the merging presumably
# happens there rather than in the constructor — confirm in DataPreparation.
data_prep = DataPreparation(sales_train, calendar, calendar_events, sell_prices)

In [7]:
# Run the preparation pipeline. It hands back the train/test split together with
# the merged frame, the feature-augmented frame and the transformed feature matrix.
(
    X_train,
    X_test,
    y_train,
    y_test,
    merged_data,
    data_with_features,
    X_transformed,
) = data_prep.prepare_data()

# Show the first rows of the merged frame as plain text
print(merged_data.head())

After melting sales_train: (46985090, 8)
After merging with calendar: (46985090, 10)
After merging with sell prices: (46985090, 11)
After merging with aggregated calendar_events: (46985090, 13)
Encoder saved at models/preprocessor and encoder/encoder.joblib
                              id        item_id    dept_id   cat_id store_id  \
0  HOBBIES_1_001_CA_1_evaluation  HOBBIES_1_001  HOBBIES_1  HOBBIES     CA_1   
1  HOBBIES_1_002_CA_1_evaluation  HOBBIES_1_002  HOBBIES_1  HOBBIES     CA_1   
2  HOBBIES_1_003_CA_1_evaluation  HOBBIES_1_003  HOBBIES_1  HOBBIES     CA_1   
3  HOBBIES_1_004_CA_1_evaluation  HOBBIES_1_004  HOBBIES_1  HOBBIES     CA_1   
4  HOBBIES_1_005_CA_1_evaluation  HOBBIES_1_005  HOBBIES_1  HOBBIES     CA_1   

  state_id    d  sales       date  wm_yr_wk  sell_price  revenue  day_of_week  \
0       CA  d_1      0 2011-01-29     11101         0.0      0.0            5   
1       CA  d_1      0 2011-01-29     11101         0.0      0.0            5   
2       CA  d_1   

In [8]:
# Keep only the rows of merged_data whose revenue is greater than 0
example = merged_data.loc[merged_data['revenue'] > 0]
example

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,date,wm_yr_wk,sell_price,event_name,event_type,revenue,day_of_week,month,year
7,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1,12,2011-01-29,11101,0.46,,,5.52,5,1,2011
8,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_1,2,2011-01-29,11101,1.56,,,3.12,5,1,2011
14,HOBBIES_1_015_CA_1_evaluation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,d_1,4,2011-01-29,11101,0.70,,,2.80,5,1,2011
15,HOBBIES_1_016_CA_1_evaluation,HOBBIES_1_016,HOBBIES_1,HOBBIES,CA_1,CA,d_1,5,2011-01-29,11101,0.70,,,3.50,5,1,2011
21,HOBBIES_1_022_CA_1_evaluation,HOBBIES_1_022,HOBBIES_1,HOBBIES,CA_1,CA,d_1,2,2011-01-29,11101,6.86,,,13.72,5,1,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46985082,FOODS_3_820_WI_3_evaluation,FOODS_3_820,FOODS_3,FOODS,WI_3,WI,d_1541,6,2015-04-18,11512,1.98,,,11.88,5,4,2015
46985083,FOODS_3_821_WI_3_evaluation,FOODS_3_821,FOODS_3,FOODS,WI_3,WI,d_1541,1,2015-04-18,11512,4.98,,,4.98,5,4,2015
46985084,FOODS_3_822_WI_3_evaluation,FOODS_3_822,FOODS_3,FOODS,WI_3,WI,d_1541,1,2015-04-18,11512,4.28,,,4.28,5,4,2015
46985085,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,d_1541,2,2015-04-18,11512,2.88,,,5.76,5,4,2015


In [9]:
# Print the first five rows of X_transformed (head() defaults to 5 rows) as plain text
print(X_transformed.head())

   day_of_week     month      year  event_type_encoded  item_id  dept_id  \
0     0.999351 -1.538933 -1.371295            0.098728   1437.0      3.0   
1     0.999351 -1.538933 -1.371295            0.098728   1438.0      3.0   
2     0.999351 -1.538933 -1.371295            0.098728   1439.0      3.0   
3     0.999351 -1.538933 -1.371295            0.098728   1440.0      3.0   
4     0.999351 -1.538933 -1.371295            0.098728   1441.0      3.0   

   cat_id  store_id  state_id  
0     1.0       0.0       0.0  
1     1.0       0.0       0.0  
2     1.0       0.0       0.0  
3     1.0       0.0       0.0  
4     1.0       0.0       0.0  


In [10]:
# Collect the column names of X_transformed into a plain Python list
list_of_columns = X_transformed.columns.tolist()
# Bare last expression so the notebook displays the list
list_of_columns

['day_of_week',
 'month',
 'year',
 'event_type_encoded',
 'item_id',
 'dept_id',
 'cat_id',
 'store_id',
 'state_id']

In [11]:
# Previously obtained hyperparameter values, hard-coded so the notebook can be
# executed end to end without repeating the search.
best_hyperparams = dict(
    colsample_bytree=0.8640492906125685,
    gamma=0.0241995213596943,
    learning_rate=0.1463148353813622,
    max_depth=13,
    min_child_weight=6,
    n_estimators=26,
    subsample=0.8257961586938035,
)

In [12]:
from src.models.xgboost_predict import XGBoostRegressor

# Wrap the train/test split in the project's XGBoost helper
model = XGBoostRegressor(X_train, y_train, X_test, y_test)

# HyperOpt optimization is disabled here; uncomment the next line to re-run the
# search and overwrite the hard-coded `best_hyperparams` assigned earlier in the notebook.
#best_hyperparams = model.optimize()

# Print the hyperparameters that will be used. With the search commented out,
# these are the hard-coded values, not the result of a fresh optimization.
print(f"Optimal hyperparameters: {best_hyperparams}")

Optimal hyperparameters: {'colsample_bytree': 0.8640492906125685, 'gamma': 0.0241995213596943, 'learning_rate': 0.1463148353813622, 'max_depth': 13, 'min_child_weight': 6, 'n_estimators': 26, 'subsample': 0.8257961586938035}


In [13]:
# Train the final model with the chosen hyperparameters
final_model = model.train_final_model(best_hyperparams)

# Persist the trained model under the (working-directory-relative) models/ folder.
# The call is the cell's last expression, so its return value (the confirmation
# string shown in the output below) is displayed by the notebook.
model.save_model(final_model, 'models/predictive/final_xgboost_model.joblib')

'Model saved successfully at models/predictive/final_xgboost_model.joblib'

In [14]:
# Reload the model that was saved by the previous cell; this rebinds `final_model`
# to the object read from disk instead of the in-memory one returned by training.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# load model files from a trusted source (here: the file this notebook just wrote).
import joblib
model_path = 'models/predictive/final_xgboost_model.joblib'
final_model = joblib.load(model_path)

In [15]:
# Prepare one example input for prediction.
# A new DataPreparation is created here, rebinding the `data_prep` name used earlier.
data_prep = DataPreparation(sales_train, calendar, calendar_events, sell_prices)
# Arguments are presumably (item_id, store_id, date) — these values match the
# item_id / store_id / date columns of a row shown in the merged data above;
# TODO confirm against prepare_single_data_point's signature.
X_transformed_single = data_prep.prepare_single_data_point('FOODS_3_820', 'WI_3', '2015-04-18')

Shape before calendar merge: (1, 6)
Shape after calendar merge: (1, 8)
Shape before sell_prices merge: (1, 8)
Shape after sell_prices merge: (1, 10)
Shape before calendar_events merge: (1, 10)
Shape after calendar_events merge: (1, 12)
Encoder loaded from models/preprocessor and encoder/encoder.joblib
Preprocessor loaded from models/preprocessor and encoder/preprocessor.joblib


In [16]:
# Print the shape of X_transformed_single (the single prepared data point)
print(X_transformed_single.shape)

(1, 9)


In [17]:
# Display the transformed single data point
X_transformed_single

Unnamed: 0,day_of_week,month,year,event_type_encoded,item_id,dept_id,cat_id,store_id,state_id
0,0.999351,-0.672686,1.851221,0.098728,0.0,0.0,0.0,0.0,0.0


In [18]:
# Inspect the single-row feature matrix and verify its feature count.
#
# `.shape` exists on both DataFrames and NumPy arrays, but `.columns` exists only
# on DataFrames. The ndarray check therefore has to run BEFORE `.columns` is
# touched; otherwise an ndarray input raises AttributeError and the conversion
# below is never reached.
print(X_transformed_single.shape)

if isinstance(X_transformed_single, np.ndarray):
    # Convert a NumPy array to a DataFrame for inspection
    X_transformed_single = pd.DataFrame(X_transformed_single)
else:
    print(X_transformed_single.columns)

# Print and inspect again (now guaranteed to have `.columns` in both cases)
print(X_transformed_single.shape)
print(X_transformed_single.columns)

# Ensure that the number of columns matches the expected number of features:
# 9 is the number of columns of X_transformed listed earlier in this notebook.
assert X_transformed_single.shape[1] == 9, "Number of features does not match the expected count!"

(1, 9)
Index(['day_of_week', 'month', 'year', 'event_type_encoded', 'item_id',
       'dept_id', 'cat_id', 'store_id', 'state_id'],
      dtype='object')
(1, 9)
Index(['day_of_week', 'month', 'year', 'event_type_encoded', 'item_id',
       'dept_id', 'cat_id', 'store_id', 'state_id'],
      dtype='object')


In [19]:
# Predict revenue for the prepared data point.
# The first (only) row is pulled out as a plain array and reshaped to
# (1, n_features) before being handed to the model.
# Alternative left by the author, passing the frame directly:
#predicted_revenue = final_model.predict(X_transformed_single)
single_row_values = X_transformed_single.iloc[0].values
predicted_revenue = final_model.predict(single_row_values.reshape(1, -1))

# Report the single predicted value
print(f"Predicted Revenue: {predicted_revenue[0]}")

Predicted Revenue: 6.812928199768066


In [20]:
# Evaluate the final model on the test set.
# As the output below shows, the call prints MAE / MSE / RMSE and, being the
# cell's last expression, its returned 3-tuple of those values is displayed too.
model.evaluate_model(final_model, X_test, y_test)

Model Performance on Test Set:
MAE: 4.068437364487667
MSE: 77.63715218047247
RMSE: 8.811194707897021


(4.068437364487667, 77.63715218047247, 8.811194707897021)

In [22]:
# Save the transformed dataset to CSV
#X_transformed.to_csv('data/processed/X_transformed.csv', index=False)