In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.compose import ColumnTransformer
import statsmodels.api as sma
from sklearn.model_selection import cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn import tree
from scipy.stats import randint
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.decomposition import PCA

In [2]:
df1=pd.read_csv("df_trim.csv")
df1.head()

Unnamed: 0,product_id,time_id,customer_id,promotion_id,store_id,store_sales,store_cost,unit_sales,product_class_id,brand_name,...,houseowner,num_cars_owned,fullname,the_date,the_day,the_year,the_month,day_of_month,day_since_epoch,promotion_period
0,451,677,7366,207,3,7.12,2.2784,4,55,Red Wing,...,Y,4,Ann Smith,11/7/1997,Friday,1997,11,7,34279,4
1,1293,677,7366,207,3,5.76,1.7856,4,14,Booker,...,Y,4,Ann Smith,11/7/1997,Friday,1997,11,7,34279,4
2,738,677,7366,207,3,5.55,2.0535,3,39,Consolidated,...,Y,4,Ann Smith,11/7/1997,Friday,1997,11,7,34279,4
3,1426,677,8947,207,3,7.48,2.244,2,61,Hermanos,...,Y,2,Robert Bell,11/7/1997,Friday,1997,11,7,34279,4
4,338,677,8947,207,3,2.28,1.0488,4,58,Better,...,Y,2,Robert Bell,11/7/1997,Friday,1997,11,7,34279,4


In [3]:
#Impute null values with median for specified columns
columns_to_impute = ['store_sqft', 'grocery_sqft', 'frozen_sqft', 'meat_sqft']
imputer = SimpleImputer(strategy='median')
df1[columns_to_impute] = imputer.fit_transform(df1[columns_to_impute])

In [4]:
# Check for remaining null values
df_null = df1.isnull().sum()
df_null

product_id          0
time_id             0
customer_id         0
promotion_id        0
store_id            0
                   ..
the_year            0
the_month           0
day_of_month        0
day_since_epoch     0
promotion_period    0
Length: 94, dtype: int64

In [5]:
# Remove specified columns
columns_to_remove = ["lname", "fname", "mi", "address1", "address2"]
df1 = df1.drop(columns=columns_to_remove)

In [6]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70751 entries, 0 to 70750
Data columns (total 89 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   product_id             70751 non-null  int64  
 1   time_id                70751 non-null  int64  
 2   customer_id            70751 non-null  int64  
 3   promotion_id           70751 non-null  int64  
 4   store_id               70751 non-null  int64  
 5   store_sales            70751 non-null  float64
 6   store_cost             70751 non-null  float64
 7   unit_sales             70751 non-null  int64  
 8   product_class_id       70751 non-null  int64  
 9   brand_name             70751 non-null  object 
 10  product_name           70751 non-null  object 
 11  SKU                    70751 non-null  int64  
 12  SRP                    70751 non-null  float64
 13  gross_weight           70751 non-null  float64
 14  net_weight             70751 non-null  float64
 15  re

# PCA for numerical columns

# numerical columns

In [7]:
df_num=df1[['store_sales', 'store_cost', 'unit_sales','SRP', 'gross_weight',
       'net_weight', 'units_per_case',
       'cases_per_pallet', 'shelf_width', 'shelf_height', 'shelf_depth',
       'cost','store_sqft','grocery_sqft', 'frozen_sqft', 'meat_sqft', 
       'total_children', 'num_children_at_home', 'num_cars_owned','day_since_epoch', 'promotion_period']]

In [8]:
df_num.shape

(70751, 21)

In [10]:
df_num.corr()

Unnamed: 0,store_sales,store_cost,unit_sales,SRP,gross_weight,net_weight,units_per_case,cases_per_pallet,shelf_width,shelf_height,...,cost,store_sqft,grocery_sqft,frozen_sqft,meat_sqft,total_children,num_children_at_home,num_cars_owned,day_since_epoch,promotion_period
store_sales,1.0,0.954721,0.506463,0.831683,0.035323,0.030964,-0.006682,0.020763,-0.007104,-0.00297,...,-0.00213,0.014345,0.009673,0.016534,0.016531,0.079918,0.028998,0.004698,0.004083,-0.007396
store_cost,0.954721,1.0,0.483684,0.79369,0.034339,0.030173,-0.006083,0.01905,-0.007882,-0.004228,...,-0.001552,0.016543,0.012013,0.017911,0.017907,0.07572,0.024283,0.00362,0.006454,-0.005551
unit_sales,0.506463,0.483684,1.0,-0.001458,0.002653,0.001956,0.002895,-0.005181,-0.004848,0.004326,...,-0.010038,0.029064,0.023259,0.028641,0.028635,0.160142,0.063025,0.018828,0.012703,0.001676
SRP,0.831683,0.79369,-0.001458,1.0,0.042143,0.037472,-0.009469,0.028148,-0.003762,-0.006952,...,0.002819,0.001447,-0.000662,0.003761,0.003761,-0.002013,-0.002002,-0.005722,-0.002604,-0.008061
gross_weight,0.035323,0.034339,0.002653,0.042143,1.0,0.988892,-0.012524,-0.009303,-0.009721,-0.018058,...,0.000151,-0.004841,-0.004352,-0.003824,-0.003824,0.000464,-0.003467,0.005372,-0.000495,0.000158
net_weight,0.030964,0.030173,0.001956,0.037472,0.988892,1.0,-0.015189,-0.010974,-0.010019,-0.017077,...,0.000164,-0.004028,-0.003406,-0.003502,-0.003502,0.000498,-0.003591,0.004886,-9.2e-05,0.000469
units_per_case,-0.006682,-0.006083,0.002895,-0.009469,-0.012524,-0.015189,1.0,-0.01744,0.002274,0.022014,...,-0.002231,0.001836,0.003488,-0.00115,-0.00115,0.002606,0.002079,-0.008283,-0.004951,0.007424
cases_per_pallet,0.020763,0.01905,-0.005181,0.028148,-0.009303,-0.010974,-0.01744,1.0,0.025709,-0.339911,...,-0.000974,0.002915,0.003496,0.000972,0.000971,-0.004819,-0.001166,-0.005075,0.002207,-0.001522
shelf_width,-0.007104,-0.007882,-0.004848,-0.003762,-0.009721,-0.010019,0.002274,0.025709,1.0,0.009367,...,-0.006242,0.005919,0.005414,0.004807,0.004806,0.005278,0.003106,0.004362,0.001629,-0.001965
shelf_height,-0.00297,-0.004228,0.004326,-0.006952,-0.018058,-0.017077,0.022014,-0.339911,0.009367,1.0,...,-0.002234,-7.5e-05,0.000646,-0.001491,-0.00149,0.002672,0.004461,0.004213,0.001404,0.001287


In [13]:
X=df_num.drop("cost",axis=1)
y=df_num["cost"]

In [35]:
ss=StandardScaler()
X_s=pd.DataFrame(data=ss.fit_transform(X),columns=X.columns)


In [36]:
pca=PCA()
X_pca=pd.DataFrame(data=pca.fit_transform(X_s),columns=[f"PC{i}" for i in range (1,21)])

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

In [50]:
pca.explained_variance_ratio_

array([1.61709892e-01, 1.45905938e-01, 9.91976433e-02, 7.38259999e-02,
       6.71085635e-02, 6.26504748e-02, 5.22057359e-02, 5.01458855e-02,
       4.94692439e-02, 4.75731717e-02, 4.67104604e-02, 4.09709374e-02,
       3.45611477e-02, 3.28047252e-02, 2.98746345e-02, 3.62297576e-03,
       1.06058455e-03, 5.54223339e-04, 4.77602681e-05, 2.76531593e-09])

In [39]:
df_var_pca=pd.DataFrame({"component":range(1,21),"Variance":np.cumsum(pca.explained_variance_ratio_)})

In [40]:
df_var_pca

Unnamed: 0,component,Variance
0,1,0.16171
1,2,0.307616
2,3,0.406813
3,4,0.480639
4,5,0.547748
5,6,0.610399
6,7,0.662604
7,8,0.71275
8,9,0.762219
9,10,0.809793


# Linear Regression

In [51]:
model_lr=LinearRegression().fit(X_train,y_train)

In [52]:
ypred_train=model_lr.predict(X_train)
ypred_test=model_lr.predict(X_test)

In [53]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 8627373.272357
Mean squared error for test data: 8634543.143823057


In [54]:
from sklearn.model_selection import cross_val_score
model_lr=LinearRegression()
# Example: Cross-validation with 3 folds
scores = cross_val_score(model_lr, X_pca, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [8600910.33393185 9357799.82287908 8572833.87563128]


In [55]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8572833.875631278 9357799.82287908 8843848.01081407


# Decision Tree Regressor

In [56]:
dtr=DecisionTreeRegressor(random_state=42)
model_dtr=dtr.fit(X_train,y_train)

In [57]:
df_imp_dt=pd.DataFrame({"feature":X_train.columns,"importance":model_dtr.feature_importances_})
df_imp_dt.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
12,PC13,0.226457
5,PC6,0.157876
19,PC20,0.140619
11,PC12,0.138914
18,PC19,0.061906
10,PC11,0.052147
0,PC1,0.043317
3,PC4,0.038115
8,PC9,0.024787
14,PC15,0.020999


In [58]:
ypred_train=model_dtr.predict(X_train)
ypred_test=model_dtr.predict(X_test)

In [59]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 0.0
Mean squared error for test data: 5332801.537983181


In [60]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(DecisionTreeRegressor(random_state=42), X_pca, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated MSE:", -scores)

Cross-validated MSE: [14479850.99092605 15015151.64251187 14847995.61959886]


In [61]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

14479850.990926052 15015151.642511873 14780999.41767893


# Random Forest Regressor

In [62]:
rfr=RandomForestRegressor(random_state=42,n_jobs=-1)
model_rfr=rfr.fit(X_train,y_train)

In [63]:
df_imp_rfr=pd.DataFrame({"feature":X_train.columns,"importance":model_rfr.feature_importances_})
df_imp_rfr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
12,PC13,0.198787
5,PC6,0.157791
19,PC20,0.147963
11,PC12,0.142847
18,PC19,0.070057
10,PC11,0.047159
0,PC1,0.039656
3,PC4,0.037243
8,PC9,0.024376
14,PC15,0.022074


In [64]:
ypred_train=model_rfr.predict(X_train)
ypred_test=model_rfr.predict(X_test)

In [65]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 338642.76370522444
Mean squared error for test data: 2403670.2247216664


In [66]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(RandomForestRegressor(random_state=42,n_jobs=-1), X, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [12356471.58222615 10615149.74954203  8684129.45815665]


In [67]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8684129.45815665 12356471.582226152 10551916.929974945


# Ada Boost Regressor

In [68]:
abr=AdaBoostRegressor(random_state=42)
model_abr=abr.fit(X_train,y_train)

In [69]:
df_imp_abr=pd.DataFrame({"feature":X_train.columns,"importance":model_abr.feature_importances_})
df_imp_abr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
5,PC6,0.305973
19,PC20,0.304547
12,PC13,0.174189
0,PC1,0.083758
11,PC12,0.066163
18,PC19,0.06262
10,PC11,0.002604
6,PC7,0.000145
4,PC5,0.0
3,PC4,0.0


In [70]:
ypred_train=model_abr.predict(X_train)
ypred_test=model_abr.predict(X_test)

In [71]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 8295588.311108493
Mean squared error for test data: 8331066.337120172


In [72]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(AdaBoostRegressor(random_state=42), X, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [8348333.13449341 9525967.29580601 8474380.87502543]


In [73]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8348333.134493412 9525967.295806007 8782893.768441616


# Gradient Boost Regressor

In [74]:
gbr=GradientBoostingRegressor(random_state=42)
model_gbr=gbr.fit(X_train,y_train)

In [75]:
df_imp_gbr=pd.DataFrame({"feature":X_train.columns,"importance":model_gbr.feature_importances_})
df_imp_gbr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
19,PC20,0.277103
12,PC13,0.252325
5,PC6,0.180833
11,PC12,0.153278
18,PC19,0.07081
0,PC1,0.06121
14,PC15,0.003455
10,PC11,0.00058
8,PC9,0.000197
3,PC4,0.000194


In [76]:
ypred_train=model_gbr.predict(X_train)
ypred_test=model_gbr.predict(X_test)

In [77]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 6920652.226526251
Mean squared error for test data: 7054832.8326270105


In [78]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(GradientBoostingRegressor(random_state=42), X, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [8038088.71550337 9273306.57472286 8805762.12188278]


In [79]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8038088.715503374 9273306.574722856 8705719.13736967


In [81]:
# XGBoost Regressor

In [80]:
xgbr=XGBRegressor(random_state=42)
model_xgbr=xgbr.fit(X_train,y_train)

In [82]:
df_imp_xgbr=pd.DataFrame({"feature":X_train.columns,"importance":model_xgbr.feature_importances_})
df_imp_xgbr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
19,PC20,0.158536
12,PC13,0.13022
11,PC12,0.116366
5,PC6,0.11073
0,PC1,0.093404
18,PC19,0.081165
10,PC11,0.042192
3,PC4,0.035167
14,PC15,0.034274
8,PC9,0.025708


In [83]:
ypred_train=model_xgbr.predict(X_train)
ypred_test=model_xgbr.predict(X_test)

In [84]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 2961546.649330562
Mean squared error for test data: 4199890.220706725


In [85]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(XGBRegressor(random_state=42), X, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [10244439.23136117  9441074.56212129  9173854.96164762]


In [86]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

9173854.961647622 10244439.231361175 9619789.585043363


# Stacking Regressor

In [87]:
base_estimators=[
    ("XGBoostRegressor", XGBRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42, n_jobs=-1))
]
sr=StackingRegressor(estimators=base_estimators,final_estimator=DecisionTreeRegressor(random_state=42))
model_sr=sr.fit(X_train,y_train)

In [88]:
ypred_train=model_sr.predict(X_train)
ypred_test=model_sr.predict(X_test)

In [89]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 2283940.8838339224
Mean squared error for test data: 4609380.803900785


In [90]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(sr, X, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [16300067.9140095  16144778.83823779 12693098.55179578]


In [92]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

12693098.551795784 16300067.914009498 15045981.768014356


In [93]:
base_estimators=[
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42, n_jobs=-1))
]
sr=StackingRegressor(estimators=base_estimators,final_estimator=XGBRegressor(random_state=42))
model_sr=sr.fit(X_train,y_train)

In [94]:
ypred_train=model_sr.predict(X_train)
ypred_test=model_sr.predict(X_test)

In [95]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 105712.78454069805
Mean squared error for test data: 2125626.4775746525


In [96]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(StackingRegressor(estimators=[
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42, n_jobs=-1))
],final_estimator=XGBRegressor(random_state=42)), X, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [12761187.6362493  10769341.80824901 13973703.2729621 ]


In [97]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

10769341.808249012 13973703.272962103 12501410.905820139


# Try PCA with best 16 columns (PCA components and additional categorical columns)

In [91]:
l=['department', 'promotion_name', 'media_type', 'sales_district', 
'store_type', 'video_store', 'yearly_income', 'member_card', 'occupation', 
'store_cost', 'unit_sales', 'SRP', 'net_weight', 'cost', 'store_sqft', 
'promotion_period']

* ignore cost(y variable), SRP,store_cost,unit_sales,net_weight,store_sqft,promotion_period

* add 9 columns ['department', 'promotion_name', 'media_type', 'sales_district', 
'store_type', 'video_store', 'yearly_income', 'member_card', 'occupation'] to 20 PCA columns.

In [98]:
df_cat=df1[['department', 'promotion_name', 'media_type', 'sales_district',
            'store_type', 'video_store', 'yearly_income', 'member_card', 'occupation']]

In [99]:
df_cat

Unnamed: 0,department,promotion_name,media_type,sales_district,store_type,video_store,yearly_income,member_card,occupation
0,Household,Super Duper Savers,Cash Register Handout,Bremerton,Supermarket,0,$30K - $50K,Bronze,Skilled Manual
1,Dairy,Super Duper Savers,Cash Register Handout,Bremerton,Supermarket,0,$30K - $50K,Bronze,Skilled Manual
2,Health and Hygiene,Super Duper Savers,Cash Register Handout,Bremerton,Supermarket,0,$30K - $50K,Bronze,Skilled Manual
3,Produce,Super Duper Savers,Cash Register Handout,Bremerton,Supermarket,0,$50K - $70K,Normal,Skilled Manual
4,Canned Foods,Super Duper Savers,Cash Register Handout,Bremerton,Supermarket,0,$50K - $70K,Normal,Skilled Manual
...,...,...,...,...,...,...,...,...,...
70746,Produce,Savings Galore,In-Store Coupon,Victoria,Mid-Size Grocery,0,$110K - $130K,Silver,Management
70747,Produce,Savings Galore,In-Store Coupon,Victoria,Mid-Size Grocery,0,$110K - $130K,Silver,Management
70748,Household,Savings Galore,In-Store Coupon,Victoria,Mid-Size Grocery,0,$90K - $110K,Golden,Professional
70749,Produce,Savings Galore,In-Store Coupon,Victoria,Mid-Size Grocery,0,$90K - $110K,Golden,Professional


In [100]:
le=LabelEncoder()
for col in df_cat.columns:
    df_cat[col] = le.fit_transform(df_cat[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[col] = le.fit_transform(df_cat[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[col] = le.fit_transform(df_cat[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cat[col] = le.fit_transform(df_cat[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [101]:
df_cat

Unnamed: 0,department,promotion_name,media_type,sales_district,store_type,video_store,yearly_income,member_card,occupation
0,14,38,1,2,4,0,4,0,4
1,9,38,1,2,4,0,4,0,4
2,13,38,1,2,4,0,4,0,4
3,17,38,1,2,4,0,5,2,4
4,5,38,1,2,4,0,5,2,4
...,...,...,...,...,...,...,...,...,...
70746,17,35,5,18,2,0,1,3,1
70747,17,35,5,18,2,0,1,3,1
70748,14,35,5,18,2,0,7,1,3
70749,17,35,5,18,2,0,7,1,3


In [102]:
X_pca_cat=pd.concat([df_cat,X_pca],axis=1)

In [103]:
X_pca_cat

Unnamed: 0,department,promotion_name,media_type,sales_district,store_type,video_store,yearly_income,member_card,occupation,PC1,...,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
0,14,38,1,2,4,0,4,0,4,4.514427,...,-1.485732,-0.428173,-0.173639,-0.973417,-0.942388,-0.362827,0.045072,0.014808,-0.008978,-0.000206
1,9,38,1,2,4,0,4,0,4,4.407766,...,-1.697152,-0.464314,-0.159286,0.526051,-0.987803,-0.376559,-0.010194,0.140271,-0.008533,-0.000208
2,13,38,1,2,4,0,4,0,4,4.373638,...,-1.298399,-0.492705,-0.221501,-0.424091,-0.750266,-0.068166,0.037416,0.132735,-0.008790,-0.000213
3,17,38,1,2,4,0,5,2,4,4.524975,...,0.703133,-0.592110,-0.332274,-0.830183,0.524281,-0.727250,-0.171523,-0.160199,-0.009989,-0.000208
4,5,38,1,2,4,0,5,2,4,4.223732,...,-0.957563,-0.549126,-0.126123,-0.917360,-0.006334,-0.116851,-0.235099,0.027235,-0.008443,-0.000196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70746,17,35,5,18,2,0,1,3,1,1.025363,...,0.207094,2.223028,1.262995,-0.740191,1.249848,0.217350,-0.084863,-0.004281,-0.013234,-0.000404
70747,17,35,5,18,2,0,1,3,1,0.735296,...,-1.868233,2.165762,1.433547,-0.185573,0.816688,-0.611329,-0.427707,-0.009264,-0.011574,-0.000396
70748,14,35,5,18,2,0,7,1,3,0.709828,...,0.350121,2.224400,1.348922,0.231415,-0.307479,-0.113464,0.062389,0.155655,-0.011816,-0.000404
70749,17,35,5,18,2,0,7,1,3,0.789311,...,0.796232,2.170244,1.256801,0.563298,-0.072192,-0.086792,-0.078855,-0.007849,-0.011894,-0.000409


In [104]:
y=df_num["cost"]

In [105]:
X_train, X_test, y_train, y_test = train_test_split(X_pca_cat, y, test_size=0.2, random_state=42)

# Linear Regression

In [106]:
model_lr=LinearRegression().fit(X_train,y_train)

In [107]:
ypred_train=model_lr.predict(X_train)
ypred_test=model_lr.predict(X_test)

In [108]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 8509614.52289576
Mean squared error for test data: 8517068.049463183


In [116]:
from sklearn.model_selection import cross_val_score
model_lr=LinearRegression()
# Example: Cross-validation with 3 folds
scores = cross_val_score(model_lr, X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [8723641.25245674 9586837.11863598 8763684.31084193]


In [117]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8723641.25245674 9586837.118635984 9024720.893978218


# Decision Tree Regressor

In [111]:
dtr=DecisionTreeRegressor(random_state=42)
model_dtr=dtr.fit(X_train,y_train)

In [112]:
df_imp_dt=pd.DataFrame({"feature":X_train.columns,"importance":model_dtr.feature_importances_})
df_imp_dt.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
1,promotion_name,0.3507065
2,media_type,0.1781645
28,PC20,0.1317007
3,sales_district,0.0892336
4,store_type,0.07944162
21,PC13,0.06276872
9,PC1,0.03447328
14,PC6,0.02524294
20,PC12,0.0245648
27,PC19,0.01617626


In [113]:
ypred_train=model_dtr.predict(X_train)
ypred_test=model_dtr.predict(X_test)

In [114]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 0.0
Mean squared error for test data: 61562.024521235246


In [118]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(DecisionTreeRegressor(random_state=42), X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated MSE:", -scores)

Cross-validated MSE: [14097829.23316655 16415181.89272388 15366373.84955264]


In [119]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

14097829.233166553 16415181.89272388 15293128.325147694


# Random Forest Regressor

In [120]:
rfr=RandomForestRegressor(random_state=42,n_jobs=-1)
model_rfr=rfr.fit(X_train,y_train)

In [121]:
df_imp_rfr=pd.DataFrame({"feature":X_train.columns,"importance":model_rfr.feature_importances_})
df_imp_rfr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
1,promotion_name,0.362317
2,media_type,0.199884
28,PC20,0.091333
21,PC13,0.074402
3,sales_district,0.066219
4,store_type,0.057484
14,PC6,0.049449
20,PC12,0.037241
27,PC19,0.029041
9,PC1,0.022849


In [122]:
ypred_train=model_rfr.predict(X_train)
ypred_test=model_rfr.predict(X_test)

In [123]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 3323.84784044523
Mean squared error for test data: 15450.722017595932


In [124]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(RandomForestRegressor(random_state=42,n_jobs=-1), X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [ 9847770.04112853 11861097.17149523 10561060.2508035 ]


In [125]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

9847770.04112853 11861097.17149523 10756642.487809086


# AdaBoost Regressor

In [126]:
abr=AdaBoostRegressor(random_state=42)
model_abr=abr.fit(X_train,y_train)

In [127]:
df_imp_abr=pd.DataFrame({"feature":X_train.columns,"importance":model_abr.feature_importances_})
df_imp_abr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
1,promotion_name,0.312513
2,media_type,0.188346
14,PC6,0.090879
3,sales_district,0.090301
28,PC20,0.09002
9,PC1,0.073239
21,PC13,0.055307
27,PC19,0.043662
4,store_type,0.036567
20,PC12,0.019167


In [128]:
ypred_train=model_abr.predict(X_train)
ypred_test=model_abr.predict(X_test)

In [129]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 8205610.765504865
Mean squared error for test data: 8233081.024915331


In [130]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(AdaBoostRegressor(random_state=42), X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [8819065.36465492 9931981.11488148 8745802.83489225]


In [131]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8745802.834892245 9931981.11488148 9165616.43814288


# Gradient Boost Regressor

In [132]:
gbr=GradientBoostingRegressor(random_state=42)
model_gbr=gbr.fit(X_train,y_train)

In [133]:
df_imp_gbr=pd.DataFrame({"feature":X_train.columns,"importance":model_gbr.feature_importances_})
df_imp_gbr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
1,promotion_name,0.303969
2,media_type,0.188461
21,PC13,0.103628
28,PC20,0.093743
27,PC19,0.068898
14,PC6,0.054935
4,store_type,0.051602
20,PC12,0.046985
3,sales_district,0.042909
9,PC1,0.033788


In [134]:
ypred_train=model_gbr.predict(X_train)
ypred_test=model_gbr.predict(X_test)

In [135]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 4363206.553473026
Mean squared error for test data: 4425258.802412323


In [136]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(GradientBoostingRegressor(random_state=42), X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [8964169.95554146 9346512.30784925 9113079.88811732]


In [137]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

8964169.955541456 9346512.307849254 9141254.050502678


# XGBoost Regressor

In [138]:
xgbr=XGBRegressor(random_state=42)
model_xgbr=xgbr.fit(X_train,y_train)

In [139]:
df_imp_xgbr=pd.DataFrame({"feature":X_train.columns,"importance":model_xgbr.feature_importances_})
df_imp_xgbr.sort_values(by="importance",ascending=False)

Unnamed: 0,feature,importance
4,store_type,0.165125
5,video_store,0.159436
1,promotion_name,0.123891
2,media_type,0.109866
3,sales_district,0.102201
28,PC20,0.073417
9,PC1,0.070107
21,PC13,0.059201
27,PC19,0.050922
20,PC12,0.038369


In [140]:
ypred_train=model_xgbr.predict(X_train)
ypred_test=model_xgbr.predict(X_test)

In [141]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 74203.94752592011
Mean squared error for test data: 107664.71672166094


In [142]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(XGBRegressor(random_state=42), X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [9740151.39785446 9771588.93474275 9586243.53502975]


In [143]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

9586243.535029747 9771588.934742752 9699327.955875652


# Stacking Regressor

In [144]:
base_estimators=[
    ("XGBoostRegressor", XGBRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42, n_jobs=-1))
]
sr=StackingRegressor(estimators=base_estimators,final_estimator=DecisionTreeRegressor(random_state=42))
model_sr=sr.fit(X_train,y_train)

In [145]:
ypred_train=model_sr.predict(X_train)
ypred_test=model_sr.predict(X_test)

In [146]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 9214.986961130742
Mean squared error for test data: 24960.635361458553


In [147]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(sr, X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [14449016.78235244 14984929.97807836 15212527.92346182]


In [148]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

14449016.782352442 15212527.923461815 14882158.227964206


In [149]:
base_estimators=[
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42, n_jobs=-1))
]
sr=StackingRegressor(estimators=base_estimators,final_estimator=XGBRegressor(random_state=42))
model_sr=sr.fit(X_train,y_train)

In [150]:
ypred_train=model_sr.predict(X_train)
ypred_test=model_sr.predict(X_test)

In [151]:
mse_train=mean_squared_error(y_train,ypred_train)
mse_test=mean_squared_error(y_test,ypred_test)
print("Mean squared error for train data:",mse_train)
print("Mean squared error for test data:",mse_test)

Mean squared error for train data: 1284.5074922993663
Mean squared error for test data: 7921.467827924814


In [152]:
# Example: Cross-validation with 3 folds
scores = cross_val_score(StackingRegressor(estimators=[
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=42)),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42, n_jobs=-1))
],final_estimator=XGBRegressor(random_state=42)), X_pca_cat, y, cv=3, scoring='neg_mean_squared_error')
print("Cross-validated RMSE:", -scores)

Cross-validated RMSE: [11512831.14157326 14195786.30278922 13670947.68130908]


In [153]:
print(np.min(-scores),np.max(-scores),np.mean(-scores))

11512831.141573256 14195786.302789224 13126521.708557187
