In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# upload train dataset
from google.colab import files

# Open file picker
uploaded = files.upload()
df = pd.read_csv("train_v9rqX0R.csv")

Saving train_v9rqX0R.csv to train_v9rqX0R.csv


In [3]:
# upload test dataset
from google.colab import files

# Open file picker
uploaded = files.upload()
test = pd.read_csv("test_AbJTz2l.csv")

Saving test_AbJTz2l.csv to test_AbJTz2l.csv


In [None]:
# Combine datasets for consistent imputation
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning and Preprocessing ---
# Regularize 'Item_Fat_Content'
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Handle missing 'Outlet_Size' values using mode by outlet type
size_map = combined_data.groupby(['Outlet_Type'])['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Handle missing 'Item_Weight' values by item identifier
item_weight_map = combined_data.groupby(['Item_Identifier'])['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Handle zero 'Item_Visibility' values by item identifier
# Create a series with mean visibility for each Item_Identifier where visibility is > 0
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
# Use the series to fill the zero values
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Create 'Outlet_Age' feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for categorical features
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
le = LabelEncoder()
for col in categorical_cols:
    combined_data[col] = le.fit_transform(combined_data[col])

# Check for any remaining NaNs after imputation
print(f"NaNs in combined data after imputation: {combined_data.isnull().sum().sum()}")

# Split data back into train and test
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales', 'source'], axis=1)

# --- Define features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train-Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Polynomial Regression with Lasso Tuning ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=500))
])

param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}

grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
numerical_features = X_train.select_dtypes(include=[np.number]).columns
grid.fit(X_train[numerical_features], y_train)

best_poly_lasso = grid.best_estimator_
y_pred_poly = best_poly_lasso.predict(X_val[numerical_features])
rmse_poly = np.sqrt(mean_squared_error(y_val, y_pred_poly))
cv_scores = -grid.best_score_

print("\nBest params:", grid.best_params_)
print(f"Polynomial Regression with Lasso RMSE: {rmse_poly:.2f}")
print(f"Polynomial Regression with Lasso CV RMSE: {cv_scores:.2f}")

NaNs in combined data after imputation: 5694
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best params: {'lasso__alpha': 0.5, 'poly__degree': 3}
Polynomial Regression with Lasso RMSE: 1044.03
Polynomial Regression with Lasso CV RMSE: 1099.08


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline

# Combine datasets for consistent imputation
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning and Preprocessing ---
# Regularize 'Item_Fat_Content'
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Handle missing 'Outlet_Size' values using mode by outlet type
size_map = combined_data.groupby(['Outlet_Type'])['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Handle missing 'Item_Weight' values by item identifier
item_weight_map = combined_data.groupby(['Item_Identifier'])['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Handle zero 'Item_Visibility' values by item identifier
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Create 'Outlet_Age' feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# --- Advanced Feature Engineering ---
# 1. Create a broader item type category
combined_data['Item_Type_Combined'] = combined_data['Item_Identifier'].apply(lambda x: x[:2])
combined_data['Item_Type_Combined'] = combined_data['Item_Type_Combined'].replace({'FD':'Food', 'NC':'Non-Consumable', 'DR':'Drinks'})

# 2. Bin 'Item_MRP' into categories
combined_data['MRP_Category'] = pd.cut(combined_data['Item_MRP'], bins=4, labels=['Low', 'Medium', 'High', 'Very High'])

# 3. Create interaction features
combined_data['Item_Outlet_Interactions'] = combined_data['Item_Type'].astype(str) + '_' + combined_data['Outlet_Type'].astype(str)

# Label Encoding for categorical features, including the new ones
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type',
                    'Outlet_Type', 'Item_Type_Combined', 'MRP_Category', 'Item_Outlet_Interactions']
le = LabelEncoder()
for col in categorical_cols:
    combined_data[col] = le.fit_transform(combined_data[col])

# Check for any remaining NaNs after imputation
print(f"NaNs in combined data after imputation: {combined_data.isnull().sum().sum()}")

# Split data back into train and test
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales', 'source'], axis=1)

# --- Define features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age', 'Item_Type_Combined',
            'MRP_Category', 'Item_Outlet_Interactions']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train-Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Polynomial Regression with Lasso Tuning ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=1000))
])

param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}

grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
numerical_features = X_train.select_dtypes(include=[np.number]).columns
grid.fit(X_train[numerical_features], y_train)

best_poly_lasso = grid.best_estimator_
y_pred_poly = best_poly_lasso.predict(X_val[numerical_features])
rmse_poly = np.sqrt(mean_squared_error(y_val, y_pred_poly))
cv_scores = -grid.best_score_

print("\nBest params:", grid.best_params_)
print(f"Polynomial Regression with Lasso RMSE: {rmse_poly:.2f}")
print(f"Polynomial Regression with Lasso CV RMSE: {cv_scores:.2f}")

NaNs in combined data after imputation: 5694
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best params: {'lasso__alpha': 0.5, 'poly__degree': 2}
Polynomial Regression with Lasso RMSE: 1059.26
Polynomial Regression with Lasso CV RMSE: 1107.41


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Combine datasets for consistent imputation
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning and Preprocessing ---
# Regularize 'Item_Fat_Content'
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Handle missing 'Outlet_Size' values using mode by outlet type
size_map = combined_data.groupby(['Outlet_Type'])['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Handle missing 'Item_Weight' values by item identifier
item_weight_map = combined_data.groupby(['Item_Identifier'])['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Handle zero 'Item_Visibility' values by item identifier
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Create 'Outlet_Age' feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for categorical features
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
le = LabelEncoder()
for col in categorical_cols:
    combined_data[col] = le.fit_transform(combined_data[col])

# Split data back into train and test
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales', 'source'], axis=1)

# --- Define features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train-Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Train Base Models ---
print("Training base models...")

# Tuned Polynomial Lasso Model
poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('lasso', Lasso(alpha=0.1, max_iter=10000))
])
poly_lasso.fit(X_train, y_train)

# Tuned Random Forest Model
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# --- Stacking Ensemble ---
print("\nCreating stacking ensemble...")

# Get predictions from base models on the validation set
numerical_features = X_val.select_dtypes(include=[np.number]).columns
val_pred_poly = poly_lasso.predict(X_val[numerical_features])
val_pred_rf = rf.predict(X_val)

# Create a new dataset for the meta-model with base model predictions as features
stacking_features = pd.DataFrame({
    'poly_lasso_pred': val_pred_poly,
    'rf_pred': val_pred_rf
})

# Train a simple Linear Regression meta-model on the new dataset
meta_model = LinearRegression()
meta_model.fit(stacking_features, y_val)

# Generate final predictions on the validation set
final_predictions = meta_model.predict(stacking_features)
final_rmse = np.sqrt(mean_squared_error(y_val, final_predictions))

print(f"Final Stacking Ensemble RMSE: {final_rmse:.2f}")

Training base models...

Creating stacking ensemble...
Final Stacking Ensemble RMSE: 1051.40


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Combine datasets for consistent imputation
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning and Preprocessing ---
# Regularize 'Item_Fat_Content'
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Handle missing 'Outlet_Size' values using mode by outlet type
size_map = combined_data.groupby(['Outlet_Type'])['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Handle missing 'Item_Weight' values by item identifier
item_weight_map = combined_data.groupby(['Item_Identifier'])['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Handle zero 'Item_Visibility' values by item identifier
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Create 'Outlet_Age' feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for categorical features
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
le = LabelEncoder()
for col in categorical_cols:
    combined_data[col] = le.fit_transform(combined_data[col])

# Split data back into train and test
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales', 'source'], axis=1)

# --- Define features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train-Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Train Base Models ---
print("Training base models...")

# Tuned Polynomial Lasso Model
poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('lasso', Lasso(alpha=0.1, max_iter=10000))
])
poly_lasso.fit(X_train, y_train)

# Tuned Random Forest Model
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Tuned XGBoost Model
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb.fit(X_train, y_train)

# --- Stacking Ensemble ---
print("\nCreating stacking ensemble...")

# Get predictions from base models on the validation set
numerical_features = X_val.select_dtypes(include=[np.number]).columns
val_pred_poly = poly_lasso.predict(X_val[numerical_features])
val_pred_rf = rf.predict(X_val)
val_pred_xgb = xgb.predict(X_val)

# Create a new dataset for the meta-model with base model predictions as features
stacking_features = pd.DataFrame({
    'poly_lasso_pred': val_pred_poly,
    'rf_pred': val_pred_rf,
    'xgb_pred': val_pred_xgb
})

# Train a simple Linear Regression meta-model on the new dataset
meta_model = LinearRegression()
meta_model.fit(stacking_features, y_val)

# Generate final predictions on the validation set
final_predictions = meta_model.predict(stacking_features)
final_rmse = np.sqrt(mean_squared_error(y_val, final_predictions))

print(f"Final Stacking Ensemble RMSE: {final_rmse:.2f}")

Training base models...

Creating stacking ensemble...
Final Stacking Ensemble RMSE: 1051.32


In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import LabelEncoder
import sklearn

print("scikit-learn version:", sklearn.__version__)

# --- Combine train and test for consistent preprocessing ---
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning ---
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Fill missing Outlet_Size by Outlet_Type mode
size_map = combined_data.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Fill missing Item_Weight by Item_Identifier mean
item_weight_map = combined_data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Replace zero Item_Visibility with mean per item
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Outlet_Age feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for ordinal features
le_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in le_cols:
    combined_data[col] = LabelEncoder().fit_transform(combined_data[col])

# --- Split data back ---
df_clean = combined_data[combined_data['source']=='train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source']=='test'].drop(['Item_Outlet_Sales','source'], axis=1)

# --- Features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']

X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Preprocessing ---
numerical_features = ['Item_Weight', 'Item_MRP', 'Item_Visibility', 'Outlet_Age']
categorical_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# Fix for scikit-learn version differences
if int(sklearn.__version__.split('.')[1]) >= 2:  # version >=1.2
    ohe = OneHotEncoder(drop='first', sparse_output=False)
else:
    ohe = OneHotEncoder(drop='first', sparse=False)

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', ohe, categorical_features)
])

# --- RMSE scorer ---
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
                           greater_is_better=False)

# --- Polynomial Lasso Pipeline ---
poly_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('lasso', Lasso(max_iter=1000))
])

# --- Grid Search for best degree and alpha ---
param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.05]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)

# --- Log-transform target to reduce skew ---
y_train_log = np.log1p(y_train)

grid.fit(X_train, y_train_log)
best_poly_lasso = grid.best_estimator_

# --- Predictions on validation set ---
y_val_pred_log = best_poly_lasso.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
cv_rmse = -grid.best_score_

print("\nBest Params:", grid.best_params_)
print(f"Validation RMSE: {rmse_val:.2f}")
print(f"Cross-Validated RMSE: {cv_rmse:.2f}")

# --- Predictions on test set ---
y_test_pred_log = best_poly_lasso.predict(X_test_final)
y_test_pred = np.expm1(y_test_pred_log)

scikit-learn version: 1.6.1
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Params: {'lasso__alpha': 0.001, 'poly__degree': 2}
Validation RMSE: 1115.06
Cross-Validated RMSE: 0.54


ValueError: Input X contains NaN.
PolynomialFeatures does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import sklearn

print("scikit-learn version:", sklearn.__version__)

# --- Load your data ---
# df = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

# --- Combine train and test for consistent preprocessing ---
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning ---
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Fill missing Outlet_Size by Outlet_Type mode
size_map = combined_data.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Fill missing Item_Weight by Item_Identifier mean
item_weight_map = combined_data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Replace zero Item_Visibility with mean per item
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Outlet_Age feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for categorical features
le_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in le_cols:
    combined_data[col] = LabelEncoder().fit_transform(combined_data[col])

# --- Split data back ---
df_clean = combined_data[combined_data['source']=='train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source']=='test'].drop(['Item_Outlet_Sales','source'], axis=1)

# --- Features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']

X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Preprocessing ---
numerical_features = ['Item_Weight', 'Item_MRP', 'Item_Visibility', 'Outlet_Age']
categorical_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# OneHotEncoder depending on sklearn version
if int(sklearn.__version__.split('.')[1]) >= 2:  # version >=1.2
    ohe = OneHotEncoder(drop='first', sparse_output=False)
else:
    ohe = OneHotEncoder(drop='first', sparse=False)

# Imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer([
    ('num', Pipeline([('imputer', num_imputer), ('scaler', StandardScaler())]), numerical_features),
    ('cat', Pipeline([('imputer', cat_imputer), ('ohe', ohe)]), categorical_features)
])

# --- RMSE scorer ---
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
                           greater_is_better=False)

# --- Polynomial Lasso Pipeline ---
poly_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('lasso', Lasso(max_iter=1000))
])

# --- Grid Search ---
param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.05]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)

# Log-transform target to reduce skew
y_train_log = np.log1p(y_train)

grid.fit(X_train, y_train_log)
best_poly_lasso = grid.best_estimator_

# --- Predictions on validation set ---
y_val_pred_log = best_poly_lasso.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
cv_rmse = -grid.best_score_

print("\nBest Params:", grid.best_params_)
print(f"Validation RMSE: {rmse_val:.2f}")
print(f"Cross-Validated RMSE: {cv_rmse:.2f}")

# --- Predictions on test set ---
y_test_pred_log = best_poly_lasso.predict(X_test_final)
y_test_pred = np.expm1(y_test_pred_log)

# --- Save predictions ---
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred
})
# submission.to_csv('submission_poly_lasso.csv', index=False)

scikit-learn version: 1.6.1
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Params: {'lasso__alpha': 0.001, 'poly__degree': 2}
Validation RMSE: 1115.06
Cross-Validated RMSE: 0.54


In [None]:
# --- Import libraries ---
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer

# --- Load data ---
# df = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

# --- Combine datasets for consistent preprocessing ---
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning & Preprocessing ---

# Regularize 'Item_Fat_Content'
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Fill missing 'Outlet_Size' by mode per Outlet_Type
size_map = combined_data.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Fill missing 'Item_Weight' by item mean
item_weight_map = combined_data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Replace zero 'Item_Visibility' with mean per item
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Create 'Outlet_Age'
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label encoding for categorical features
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
le = LabelEncoder()
for col in categorical_cols:
    combined_data[col] = le.fit_transform(combined_data[col])

# --- Split back into train and test ---
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales', 'source'], axis=1)

# --- Define features & target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train-validation split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# Log-transform target
y_train_log = np.log1p(y_train)

# --- Polynomial Regression with Lasso ---

# RMSE scorer
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
                           greater_is_better=False)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=500))
])

param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}

numerical_features = X_train.select_dtypes(include=[np.number]).columns

grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
grid.fit(X_train[numerical_features], y_train_log)

best_poly_lasso = grid.best_estimator_

# --- Predictions on validation set ---
y_val_pred_log = best_poly_lasso.predict(X_val[numerical_features])
y_val_pred = np.expm1(y_val_pred_log)  # inverse log-transform
y_val_pred = np.clip(y_val_pred, 0, None)  # avoid negative predictions

rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))

results_val = pd.DataFrame({'Actual': y_val, 'Predicted': y_val_pred})
print(results_val.head())

print("\nBest parameters:", grid.best_params_)
print(f"Validation RMSE: {rmse_val:.2f}")

# --- Predictions on test set ---
# Fill NaNs in test if any
X_test_final[numerical_features] = X_test_final[numerical_features].fillna(X_test_final[numerical_features].mean())

y_test_pred_log = best_poly_lasso.predict(X_test_final[numerical_features])
y_test_pred = np.expm1(y_test_pred_log)
y_test_pred = np.clip(y_test_pred, 0, None)

submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred
})

print(submission.head())

Fitting 5 folds for each of 10 candidates, totalling 50 fits
         Actual    Predicted
7503  1743.0644  1084.408055
2957   356.8688   638.379893
7031   377.5086   617.024724
1084  5778.4782  4299.111010
856   2356.9320  2860.287037

Best parameters: {'lasso__alpha': 0.01, 'poly__degree': 3}
Validation RMSE: 1065.97
  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1548.344534
1           FDW14            OUT017        1253.110118
2           NCN55            OUT010         508.108978
3           FDQ58            OUT017        2292.807338
4           FDY38            OUT027        5997.499073


In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.impute import SimpleImputer
import sklearn

print("scikit-learn version:", sklearn.__version__)

# --- Load your data ---
# df = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')

# --- Combine train and test for consistent preprocessing ---
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning ---
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Fill missing Outlet_Size by Outlet_Type mode
size_map = combined_data.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Fill missing Item_Weight by Item_Identifier mean
item_weight_map = combined_data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Replace zero Item_Visibility with mean per item
visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

# Create Outlet_Age feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for categorical features
le_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in le_cols:
    combined_data[col] = LabelEncoder().fit_transform(combined_data[col])

# --- Split back into train and test ---
df_clean = combined_data[combined_data['source']=='train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source']=='test'].drop(['Item_Outlet_Sales','source'], axis=1)

# --- Features and target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']

X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Preprocessing ---
numerical_features = ['Item_Weight', 'Item_MRP', 'Item_Visibility', 'Outlet_Age']
categorical_features = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

# OneHotEncoder depending on sklearn version
if int(sklearn.__version__.split('.')[1]) >= 2:  # version >=1.2
    ohe = OneHotEncoder(drop='first', sparse_output=False)
else:
    ohe = OneHotEncoder(drop='first', sparse=False)

# Imputers
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer([
    ('num', Pipeline([('imputer', num_imputer), ('scaler', StandardScaler())]), numerical_features),
    ('cat', Pipeline([('imputer', cat_imputer), ('ohe', ohe)]), categorical_features)
])

# --- RMSE scorer ---
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
                           greater_is_better=False)

# --- Polynomial Lasso Pipeline ---
poly_lasso = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False, interaction_only=True)),
    ('lasso', Lasso(max_iter=1000))
])

# --- Grid Search ---
param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.05]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)

# Log-transform target to reduce skew
y_train_log = np.log1p(y_train)

grid.fit(X_train, y_train_log)
best_poly_lasso = grid.best_estimator_

# --- Predictions on validation set ---
y_val_pred_log = best_poly_lasso.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)  # Convert back to original scale
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
cv_rmse = -grid.best_score_

print("\nBest Params:", grid.best_params_)
print(f"Validation RMSE (original scale): {rmse_val:.2f}")
print(f"Cross-Validated RMSE (log scale): {cv_rmse:.2f}")

# --- Predictions on test set ---
y_test_pred_log = best_poly_lasso.predict(X_test_final)
y_test_pred = np.expm1(y_test_pred_log)  # Convert back to original scale

# --- Save predictions ---
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred
})
# submission.to_csv('submission_poly_lasso.csv', index=False)

scikit-learn version: 1.6.1
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Params: {'lasso__alpha': 0.001, 'poly__degree': 2}
Validation RMSE (original scale): 1115.06
Cross-Validated RMSE (log scale): 0.54


In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# --- Combine datasets for consistent imputation ---
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning ---
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

size_map = combined_data.groupby(['Outlet_Type'])['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

item_weight_map = combined_data.groupby(['Item_Identifier'])['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

visibility_means = combined_data[combined_data['Item_Visibility'] > 0].groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'].replace(0, np.nan).fillna(visibility_means)

combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
le = LabelEncoder()
for col in categorical_cols:
    combined_data[col] = le.fit_transform(combined_data[col])

print(f"NaNs in combined data after imputation: {combined_data.isnull().sum().sum()}")

# --- Split back ---
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales','source'], axis=1)

# --- Features & Target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- Common RMSE scorer ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

# --- Polynomial Regression + Lasso ---
poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=500))
])
param_grid_lasso = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}
grid_lasso = GridSearchCV(poly_lasso, param_grid_lasso, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
numerical_features = X_train.select_dtypes(include=[np.number]).columns
grid_lasso.fit(X_train[numerical_features], y_train)

y_pred_poly = grid_lasso.predict(X_val[numerical_features])
rmse_poly = np.sqrt(mean_squared_error(y_val, y_pred_poly))
cv_scores_poly = -grid_lasso.best_score_

print("\nPolynomial Regression + Lasso:")
print("Best params:", grid_lasso.best_params_)
print(f"Validation RMSE: {rmse_poly:.2f}, CV RMSE: {cv_scores_poly:.2f}")

# --- Random Forest ---
rf = RandomForestRegressor(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)
y_pred_rf = grid_rf.predict(X_val)
rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))
cv_scores_rf = -grid_rf.best_score_

print("\nRandom Forest:")
print("Best params:", grid_rf.best_params_)
print(f"Validation RMSE: {rmse_rf:.2f}, CV RMSE: {cv_scores_rf:.2f}")

# --- XGBoost ---
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.7, 1]
}
grid_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
grid_xgb.fit(X_train, y_train)
y_pred_xgb = grid_xgb.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))
cv_scores_xgb = -grid_xgb.best_score_

print("\nXGBoost:")
print("Best params:", grid_xgb.best_params_)
print(f"Validation RMSE: {rmse_xgb:.2f}, CV RMSE: {cv_scores_xgb:.2f}")

from sklearn.model_selection import RandomizedSearchCV

# --- XGBoost with RandomizedSearchCV ---
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=500)

param_dist = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.01, 0.1, 1, 10],
    'reg_lambda': [1, 1.5, 2, 3, 5]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

rmse_scorer = make_scorer(lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)), greater_is_better=False)

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=100,  # increase for more exhaustive search
    scoring=rmse_scorer,
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))

print("\nTuned XGBoost:")
print("Best params:", random_search.best_params_)
print(f"Validation RMSE: {rmse_xgb:.2f}")

NaNs in combined data after imputation: 5694
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Polynomial Regression + Lasso:
Best params: {'lasso__alpha': 0.5, 'poly__degree': 3}
Validation RMSE: 1044.03, CV RMSE: 1099.08
Fitting 5 folds for each of 12 candidates, totalling 60 fits

Random Forest:
Best params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}
Validation RMSE: 1056.43, CV RMSE: 1110.52
Fitting 5 folds for each of 36 candidates, totalling 180 fits

XGBoost:
Best params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1}
Validation RMSE: 1046.07, CV RMSE: 1090.36
Fitting 5 folds for each of 100 candidates, totalling 500 fits

Tuned XGBoost:
Best params: {'subsample': 0.7, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Validation RMSE: 1042.94


**GOOD RESULT**

**GOOD RESULT**

In [None]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, make_scorer
from xgboost import XGBRegressor
import xgboost as xgb

# --- Load Data ---
#df = pd.read_csv("train.csv")
#test = pd.read_csv("test.csv")

# --- Combine datasets for consistent preprocessing ---
df['source'] = 'train'
test['source'] = 'test'
combined_data = pd.concat([df, test], ignore_index=True)

# --- Data Cleaning ---
combined_data['Item_Fat_Content'] = combined_data['Item_Fat_Content'].replace({
    'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'
})

# Fill missing Outlet_Size by Outlet_Type mode
size_map = combined_data.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode()[0]).to_dict()
combined_data['Outlet_Size'] = combined_data['Outlet_Size'].fillna(combined_data['Outlet_Type'].map(size_map))

# Fill missing Item_Weight by Item_Identifier mean
item_weight_map = combined_data.groupby('Item_Identifier')['Item_Weight'].transform('mean')
combined_data['Item_Weight'] = combined_data['Item_Weight'].fillna(item_weight_map)

# Replace zero Item_Visibility with mean per item
visibility_means = combined_data[combined_data['Item_Visibility'] > 0] \
    .groupby('Item_Identifier')['Item_Visibility'].transform('mean')
combined_data['Item_Visibility'] = combined_data['Item_Visibility'] \
    .replace(0, np.nan).fillna(visibility_means)

# Create Outlet_Age feature
combined_data['Outlet_Age'] = 2025 - combined_data['Outlet_Establishment_Year']

# Label Encoding for categorical features
categorical_cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
for col in categorical_cols:
    combined_data[col] = LabelEncoder().fit_transform(combined_data[col])

# --- Split back into train/test ---
df_clean = combined_data[combined_data['source'] == 'train'].drop('source', axis=1)
test_clean = combined_data[combined_data['source'] == 'test'].drop(['Item_Outlet_Sales', 'source'], axis=1)

# --- Features & Target ---
features = ['Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
            'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
            'Outlet_Type', 'Outlet_Age']
X = df_clean[features]
y = df_clean['Item_Outlet_Sales']
X_test_final = test_clean[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- RMSE scorer ---
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# --- Hyperparameter Tuning with RandomizedSearchCV ---
xgb_model = XGBRegressor(random_state=42, n_estimators=500)

param_dist = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.01, 0.1, 1, 10],
    'reg_lambda': [1, 1.5, 2, 3, 5]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=50,
    scoring=rmse_scorer,
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)
best_params = random_search.best_params_
print("Best params from CV:", best_params)

# --- Train Final Model with DMatrix for Early Stopping ---
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = best_params.copy()
params.update({
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
})

watchlist = [(dtrain, 'train'), (dval, 'eval')]

bst = xgb.train(
    params,
    dtrain,
    num_boost_round=2000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50
)

# --- Validation Predictions ---
y_val_pred = bst.predict(dval)
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE (XGBoost): {rmse_val:.2f}")

# --- Test Predictions ---
dtest = xgb.DMatrix(X_test_final)
y_test_pred = bst.predict(dtest)

# --- Save Submission ---
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': y_test_pred
})
submission.to_csv('submission_xgb_dmatrix.csv', index=False)
print("Submission file saved as submission_xgb_dmatrix.csv")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best params from CV: {'subsample': 0.7, 'reg_lambda': 2, 'reg_alpha': 10, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 1.0}
[0]	train-rmse:1710.10004	eval-rmse:1654.89882
[50]	train-rmse:1348.39905	eval-rmse:1300.29511
[100]	train-rmse:1183.75304	eval-rmse:1143.98030
[150]	train-rmse:1111.92954	eval-rmse:1080.62382
[200]	train-rmse:1079.92805	eval-rmse:1056.71540
[250]	train-rmse:1063.37162	eval-rmse:1047.91884
[300]	train-rmse:1050.83762	eval-rmse:1045.17648
[350]	train-rmse:1039.37227	eval-rmse:1044.04975
[400]	train-rmse:1029.55089	eval-rmse:1044.30767
[424]	train-rmse:1024.90278	eval-rmse:1044.84804
Validation RMSE (XGBoost): 1044.88
Submission file saved as submission_xgb_dmatrix.csv


In [21]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# --- Load data ---
# df = pd.read_csv("train.csv")
# test = pd.read_csv("test.csv")

# 1) Standardize Item_Fat_Content
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})

# 2) Handle missing values in Outlet_Size
size_map = {
    'Grocery Store': 'Small',
    'Supermarket Type2': 'Medium',
    'Supermarket Type3': 'Medium'
}
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Type'].map(size_map))
test['Outlet_Size'] = test['Outlet_Size'].fillna(test['Outlet_Type'].map(size_map))

size_map2 = {'Tier 2': 'Small'}
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Location_Type'].map(size_map2))
test['Outlet_Size'] = test['Outlet_Size'].fillna(test['Outlet_Location_Type'].map(size_map2))

# 3) Handle missing values in Item_Weight
df['Item_Weight'] = df['Item_Weight'].fillna(df.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))
test['Item_Weight'] = test['Item_Weight'].fillna(df.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))

df['Item_Weight'] = df.groupby('Item_Type')['Item_Weight'].transform('mean')
test['Item_Weight'] = test.groupby('Item_Type')['Item_Weight'].transform('mean')

# 4) Handle zeros in Item_Visibility
df['Item_Visibility'] = df.groupby('Item_Identifier')['Item_Visibility'].transform(
    lambda x: x.replace(0, x.mean())
)
test['Item_Visibility'] = test.groupby('Item_Identifier')['Item_Visibility'].transform(
    lambda x: x.replace(0, x.mean())
)
# Fill remaining zeros in test
test['Item_Visibility'] = test.groupby(['Item_Type', 'Item_Fat_Content'])['Item_Visibility'].transform('mean')

# --- Additional Feature: Outlet_Age ---
df['Outlet_Age'] = 2025 - df['Outlet_Establishment_Year']
test['Outlet_Age'] = 2025 - test['Outlet_Establishment_Year']

# --- Manual Encoding ---
# Item_Fat_Content → 1,2
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'Low Fat': 1, 'Regular': 2})
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({'Low Fat': 1, 'Regular': 2})

# Outlet_Size → 1,2,3
df['Outlet_Size'] = df['Outlet_Size'].replace({'Small': 1, 'Medium': 2, 'High': 3})
test['Outlet_Size'] = test['Outlet_Size'].replace({'Small': 1, 'Medium': 2, 'High': 3})

# Outlet_Location_Type → 1,2,3
df['Outlet_Location_Type'] = df['Outlet_Location_Type'].replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})
test['Outlet_Location_Type'] = test['Outlet_Location_Type'].replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})

# Outlet_Type → 1,2,3,4
outlet_type_map = {
    'Grocery Store': 1,
    'Supermarket Type1': 2,
    'Supermarket Type2': 3,
    'Supermarket Type3': 4
}
df['Outlet_Type'] = df['Outlet_Type'].replace(outlet_type_map)
test['Outlet_Type'] = test['Outlet_Type'].replace(outlet_type_map)

# --- Encode Item_Type as 1,2,3,... ---
df['Item_Type'], uniques = pd.factorize(df['Item_Type'])
test['Item_Type'] = pd.Categorical(test['Item_Type'], categories=uniques).codes
df['Item_Type'] = df['Item_Type'] + 1
test['Item_Type'] = test['Item_Type'] + 1

# --- New Feature: Item_Sales_Frequency ---
# xgboost - 1041
#df['Item_Sales_Frequency'] = df['Outlet_Age'] * (df['Item_MRP'] - df['Item_Visibility'])/(df['Item_Weight'] + 1)
#test['Item_Sales_Frequency'] = test['Outlet_Age'] * (test['Item_MRP'] - test['Item_Visibility'])/(test['Item_Weight'] + 1)

# polynomial regression - 1039
#item_popularity = df['Item_Identifier'].value_counts(normalize=True)  # normalized frequency
#df['Item_Popularity'] = df['Item_Identifier'].map(item_popularity)
#test['Item_Popularity'] = test['Item_Identifier'].map(item_popularity).fillna(0)  # unseen items → 0

# --- 2. Item_Sales_Frequency ---
#df['Item_Sales_Frequency'] = (
#    np.log1p(df['Outlet_Age']) * (df['Item_MRP'] / (df['Item_Weight'] + 1)) * df['Item_Popularity']
#)

#test['Item_Sales_Frequency'] = (
#    np.log1p(test['Outlet_Age']) * (test['Item_MRP'] / (test['Item_Weight'] + 1)) * test['Item_Popularity']
#)

### polynomial regression - 1038.26
#df['Item_Sales_Frequency'] = (
#    np.log1p(df['Outlet_Age']) *
#    ((df['Item_MRP'] - df['Item_MRP'].mean()) / (df['Item_MRP'].std() + 1)) *
#    (df['Item_Popularity'] + 0.01)  # smoothing
#)

#test['Item_Sales_Frequency'] = (
#    np.log1p(test['Outlet_Age']) *
#    ((test['Item_MRP'] - df['Item_MRP'].mean()) / (df['Item_MRP'].std() + 1)) *
#    (test['Item_Popularity'] + 0.01)
#)


df['Item_Sales_Frequency'] = (
    np.log1p(df['Outlet_Age']) *
    ((df['Item_MRP'] - df['Item_MRP'].mean()) / (df['Item_MRP'].std() + 1)) *
    (df['Item_Popularity'] + 0.05) /  # slightly stronger smoothing
    (1 + np.sqrt(df['Item_Visibility']))  # gentler visibility penalty
)

test['Item_Sales_Frequency'] = (
    np.log1p(test['Outlet_Age']) *
    ((test['Item_MRP'] - df['Item_MRP'].mean()) / (df['Item_MRP'].std() + 1)) *
    (test['Item_Popularity'] + 0.05) /
    (1 + np.sqrt(test['Item_Visibility']))
)


# --- Handle Inf / NaN in Item_Sales_Frequency ---
df['Item_Sales_Frequency'].replace([np.inf, -np.inf], np.nan, inplace=True)
test['Item_Sales_Frequency'].replace([np.inf, -np.inf], np.nan, inplace=True)
df['Item_Sales_Frequency'].fillna(df['Item_Sales_Frequency'].mean(), inplace=True)
test['Item_Sales_Frequency'].fillna(test['Item_Sales_Frequency'].mean(), inplace=True)



# --- New Feature: Customer Outlet Preference ---
outlet_type_sales = df.groupby('Outlet_Type')['Item_Outlet_Sales'].sum()
outlet_type_percentage = outlet_type_sales / outlet_type_sales.sum()

df['Outlet_Type_Percentage'] = df['Outlet_Type'].map(outlet_type_percentage)
test['Outlet_Type_Percentage'] = test['Outlet_Type'].map(outlet_type_percentage)

# xgboost - 1041
#df['Customer_Outlet_Preference'] = (
#    df['Item_MRP'] * df['Outlet_Type_Percentage'] / (df['Item_Weight']+1)*(df['Item_Visibility']+1)
#)
#test['Customer_Outlet_Preference'] = (
#    test['Item_MRP'] * test['Outlet_Type_Percentage'] / (test['Item_Weight']+1)*(test['Item_Visibility']+1)
#)

# polynomial regression - 1039
#median_mrp = df['Item_MRP'].median()

#df['Customer_Outlet_Preference'] = (
#    ((df['Item_MRP'] / median_mrp) ** 0.5) *
#    np.exp(-df['Item_Visibility']) *
#    df['Outlet_Type_Percentage']
#)

#test['Customer_Outlet_Preference'] = (
#    ((test['Item_MRP'] / median_mrp) ** 0.5) *
#    np.exp(-test['Item_Visibility']) *
#    test['Outlet_Type_Percentage']
#)

### polynomial regression - 1038.26
#median_mrp = df['Item_MRP'].median()

#df['Customer_Outlet_Preference'] = (
#    np.sqrt(df['Item_MRP'] / median_mrp) *
#    (1 / (1 + np.log1p(df['Item_Visibility']))) *   # softer visibility penalty
#    (df['Outlet_Type_Percentage'] * (1 / df['Outlet_Location_Type']))
#)

#test['Customer_Outlet_Preference'] = (
#    np.sqrt(test['Item_MRP'] / median_mrp) *
#    (1 / (1 + np.log1p(test['Item_Visibility']))) *
#    (test['Outlet_Type_Percentage'] * (1 / test['Outlet_Location_Type']))
#)


median_mrp = df['Item_MRP'].median()

df['Customer_Outlet_Preference'] = (
    np.sqrt(df['Item_MRP'] / median_mrp) *
    (1 / (1 + np.sqrt(df['Item_Visibility']))) *
    (df['Outlet_Type_Percentage']) *
    (1 + 0.05 * df['Outlet_Size'])  # small bonus, not overpowering
)

test['Customer_Outlet_Preference'] = (
    np.sqrt(test['Item_MRP'] / median_mrp) *
    (1 / (1 + np.sqrt(test['Item_Visibility']))) *
    (test['Outlet_Type_Percentage']) *
    (1 + 0.05 * test['Outlet_Size'])
)



# Normalize to 0-1 range
#df['Customer_Outlet_Preference'] = (
#    (df['Customer_Outlet_Preference'] - df['Customer_Outlet_Preference'].min()) /
#    (df['Customer_Outlet_Preference'].max() - df['Customer_Outlet_Preference'].min())
#)
#test['Customer_Outlet_Preference'] = (
#    (test['Customer_Outlet_Preference'] - test['Customer_Outlet_Preference'].min()) /
#    (test['Customer_Outlet_Preference'].max() - test['Customer_Outlet_Preference'].min())
#)

# --- Features & Target ---
features = [
    'Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
    'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
    'Outlet_Type', 'Outlet_Age', 'Item_Sales_Frequency',
    'Customer_Outlet_Preference'
]

X = df[features]
y = df['Item_Outlet_Sales']
X_test_final = test[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- RMSE scorer ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# --- Tuned XGBoost with RandomizedSearchCV ---
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=500)

param_dist = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.01, 0.1, 1, 10],
    'reg_lambda': [1, 1.5, 2, 3, 5]
}

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring=rmse_scorer,
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))

print("\nTuned XGBoost:")
print("Best params:", random_search.best_params_)
print(f"Validation RMSE: {rmse_xgb:.2f}")

# --- Final predictions on test set ---
test_predictions = best_xgb.predict(X_test_final)
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})
# submission.to_csv("submission.csv", index=False)


Fitting 5 folds for each of 100 candidates, totalling 500 fits

Tuned XGBoost:
Best params: {'subsample': 0.7, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Validation RMSE: 1042.11


In [22]:
from sklearn.model_selection import GridSearchCV

# --- Polynomial Regression with Lasso Tuning ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=500))
])

param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}

grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
numerical_features = X_train.select_dtypes(include=[np.number]).columns
grid.fit(X_train[numerical_features], y_train)

best_poly_lasso = grid.best_estimator_
y_pred_poly = best_poly_lasso.predict(X_val[numerical_features])
rmse_poly = np.sqrt(mean_squared_error(y_val, y_pred_poly))
cv_scores = -grid.best_score_

print("\nBest params:", grid.best_params_)
print(f"Polynomial Regression with Lasso RMSE: {rmse_poly:.2f}")
print(f"Polynomial Regression with Lasso CV RMSE: {cv_scores:.2f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best params: {'lasso__alpha': 0.5, 'poly__degree': 2}
Polynomial Regression with Lasso RMSE: 1040.30
Polynomial Regression with Lasso CV RMSE: 1089.18


In [23]:
# RMSE scorer
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# --- Random Forest Regressor ---
rf = RandomForestRegressor(n_estimators=300, max_depth=12, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Validation prediction
y_pred_rf = rf.predict(X_val)
rmse_rf = np.sqrt(mean_squared_error(y_val, y_pred_rf))
rf_scores = cross_val_score(rf, X, y, cv=kf, scoring=rmse_scorer)

print(f"Random Forest Validation RMSE: {rmse_rf:.2f}")
print(f"Random Forest CV RMSE: {-rf_scores.mean():.2f}")

# Final predictions
test_predictions = rf.predict(X_test_final)
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})
# submission.to_csv("rf_submission.csv", index=False)

Random Forest Validation RMSE: 1056.98
Random Forest CV RMSE: 1096.44


In [25]:
# --- Imports ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
import xgboost as xgb
import warnings
warnings.filterwarnings("ignore")

# --- Load data ---
# df = pd.read_csv("train.csv")
# test = pd.read_csv("test.csv")

# 1) Standardize Item_Fat_Content
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})

# 2) Handle missing values in Outlet_Size
size_map = {
    'Grocery Store': 'Small',
    'Supermarket Type2': 'Medium',
    'Supermarket Type3': 'Medium'
}
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Type'].map(size_map))
test['Outlet_Size'] = test['Outlet_Size'].fillna(test['Outlet_Type'].map(size_map))

size_map2 = {'Tier 2': 'Small'}
df['Outlet_Size'] = df['Outlet_Size'].fillna(df['Outlet_Location_Type'].map(size_map2))
test['Outlet_Size'] = test['Outlet_Size'].fillna(test['Outlet_Location_Type'].map(size_map2))

# 3) Handle missing values in Item_Weight
df['Item_Weight'] = df['Item_Weight'].fillna(df.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))
test['Item_Weight'] = test['Item_Weight'].fillna(df.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))

df['Item_Weight'] = df.groupby('Item_Type')['Item_Weight'].transform('mean')
test['Item_Weight'] = test.groupby('Item_Type')['Item_Weight'].transform('mean')

# 4) Handle zeros in Item_Visibility
df['Item_Visibility'] = df.groupby('Item_Identifier')['Item_Visibility'].transform(
    lambda x: x.replace(0, x.mean())
)
test['Item_Visibility'] = test.groupby('Item_Identifier')['Item_Visibility'].transform(
    lambda x: x.replace(0, x.mean())
)
# Fill remaining zeros in test
test['Item_Visibility'] = test.groupby(['Item_Type', 'Item_Fat_Content'])['Item_Visibility'].transform('mean')

# --- Additional Feature: Outlet_Age ---
df['Outlet_Age'] = 2025 - df['Outlet_Establishment_Year']
test['Outlet_Age'] = 2025 - test['Outlet_Establishment_Year']

# --- Manual Encoding ---
# Item_Fat_Content → 1,2
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'Low Fat': 1, 'Regular': 2})
test['Item_Fat_Content'] = test['Item_Fat_Content'].replace({'Low Fat': 1, 'Regular': 2})

# Outlet_Size → 1,2,3
df['Outlet_Size'] = df['Outlet_Size'].replace({'Small': 1, 'Medium': 2, 'High': 3})
test['Outlet_Size'] = test['Outlet_Size'].replace({'Small': 1, 'Medium': 2, 'High': 3})

# Outlet_Location_Type → 1,2,3
df['Outlet_Location_Type'] = df['Outlet_Location_Type'].replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})
test['Outlet_Location_Type'] = test['Outlet_Location_Type'].replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})

# Outlet_Type → 1,2,3,4
outlet_type_map = {
    'Grocery Store': 1,
    'Supermarket Type1': 2,
    'Supermarket Type2': 3,
    'Supermarket Type3': 4
}
df['Outlet_Type'] = df['Outlet_Type'].replace(outlet_type_map)
test['Outlet_Type'] = test['Outlet_Type'].replace(outlet_type_map)

# --- Encode Item_Type as 1,2,3,... ---
df['Item_Type'], uniques = pd.factorize(df['Item_Type'])
test['Item_Type'] = pd.Categorical(test['Item_Type'], categories=uniques).codes
df['Item_Type'] = df['Item_Type'] + 1
test['Item_Type'] = test['Item_Type'] + 1

# --- New Feature: Item_Sales_Frequency ---
# xgboost - 1041
#df['Item_Sales_Frequency'] = df['Outlet_Age'] * (df['Item_MRP'] - df['Item_Visibility'])/(df['Item_Weight'] + 1)
#test['Item_Sales_Frequency'] = test['Outlet_Age'] * (test['Item_MRP'] - test['Item_Visibility'])/(test['Item_Weight'] + 1)

# polynomial regression - 1039
#item_popularity = df['Item_Identifier'].value_counts(normalize=True)  # normalized frequency
#df['Item_Popularity'] = df['Item_Identifier'].map(item_popularity)
#test['Item_Popularity'] = test['Item_Identifier'].map(item_popularity).fillna(0)  # unseen items → 0

# --- 2. Item_Sales_Frequency ---
#df['Item_Sales_Frequency'] = (
#    np.log1p(df['Outlet_Age']) * (df['Item_MRP'] / (df['Item_Weight'] + 1)) * df['Item_Popularity']
#)

#test['Item_Sales_Frequency'] = (
#    np.log1p(test['Outlet_Age']) * (test['Item_MRP'] / (test['Item_Weight'] + 1)) * test['Item_Popularity']
#)

### polynomial regression - 1038.26
df['Item_Sales_Frequency'] = (
    np.log1p(df['Outlet_Age']) *
    ((df['Item_MRP'] - df['Item_MRP'].mean()) / (df['Item_MRP'].std() + 1)) *
    (df['Item_Popularity'] + 0.01)  # smoothing
)

test['Item_Sales_Frequency'] = (
    np.log1p(test['Outlet_Age']) *
    ((test['Item_MRP'] - df['Item_MRP'].mean()) / (df['Item_MRP'].std() + 1)) *
    (test['Item_Popularity'] + 0.01)
)




# --- Handle Inf / NaN in Item_Sales_Frequency ---
df['Item_Sales_Frequency'].replace([np.inf, -np.inf], np.nan, inplace=True)
test['Item_Sales_Frequency'].replace([np.inf, -np.inf], np.nan, inplace=True)
df['Item_Sales_Frequency'].fillna(df['Item_Sales_Frequency'].mean(), inplace=True)
test['Item_Sales_Frequency'].fillna(test['Item_Sales_Frequency'].mean(), inplace=True)



# --- New Feature: Customer Outlet Preference ---
outlet_type_sales = df.groupby('Outlet_Type')['Item_Outlet_Sales'].sum()
outlet_type_percentage = outlet_type_sales / outlet_type_sales.sum()

df['Outlet_Type_Percentage'] = df['Outlet_Type'].map(outlet_type_percentage)
test['Outlet_Type_Percentage'] = test['Outlet_Type'].map(outlet_type_percentage)

# xgboost - 1041
#df['Customer_Outlet_Preference'] = (
#    df['Item_MRP'] * df['Outlet_Type_Percentage'] / (df['Item_Weight']+1)*(df['Item_Visibility']+1)
#)
#test['Customer_Outlet_Preference'] = (
#    test['Item_MRP'] * test['Outlet_Type_Percentage'] / (test['Item_Weight']+1)*(test['Item_Visibility']+1)
#)

# polynomial regression - 1039
#median_mrp = df['Item_MRP'].median()

#df['Customer_Outlet_Preference'] = (
#    ((df['Item_MRP'] / median_mrp) ** 0.5) *
#    np.exp(-df['Item_Visibility']) *
#    df['Outlet_Type_Percentage']
#)

#test['Customer_Outlet_Preference'] = (
#    ((test['Item_MRP'] / median_mrp) ** 0.5) *
#    np.exp(-test['Item_Visibility']) *
#    test['Outlet_Type_Percentage']
#)

### polynomial regression - 1038.26
median_mrp = df['Item_MRP'].median()

df['Customer_Outlet_Preference'] = (
    np.sqrt(df['Item_MRP'] / median_mrp) *
    (1 / (1 + np.log1p(df['Item_Visibility']))) *   # softer visibility penalty
    (df['Outlet_Type_Percentage'] * (1 / df['Outlet_Location_Type']))
)

test['Customer_Outlet_Preference'] = (
    np.sqrt(test['Item_MRP'] / median_mrp) *
    (1 / (1 + np.log1p(test['Item_Visibility']))) *
    (test['Outlet_Type_Percentage'] * (1 / test['Outlet_Location_Type']))
)



# --- New Features ---
# polynomial regression - 1038.14
# 1. Price per weight
df['Price_per_Weight'] = df['Item_MRP'] / (df['Item_Weight'] + 1)
test['Price_per_Weight'] = test['Item_MRP'] / (test['Item_Weight'] + 1)

# 2. Visibility × Price interaction
df['Visibility_Price_Interaction'] = df['Item_Visibility'] * df['Item_MRP']
test['Visibility_Price_Interaction'] = test['Item_Visibility'] * test['Item_MRP']

# 3. Outlet Age × Outlet Type
df['Outlet_Age_Type'] = df['Outlet_Age'] * df['Outlet_Type']
test['Outlet_Age_Type'] = test['Outlet_Age'] * test['Outlet_Type']

# 4. Fat × Type interaction
df['Fat_Type_Interaction'] = df['Item_Fat_Content'] * df['Item_Type']
test['Fat_Type_Interaction'] = test['Item_Fat_Content'] * test['Item_Type']

# 5. Outlet diversity
outlet_diversity = df.groupby('Outlet_Identifier')['Item_Identifier'].nunique()
df['Outlet_Diversity'] = df['Outlet_Identifier'].map(outlet_diversity)
test['Outlet_Diversity'] = test['Outlet_Identifier'].map(outlet_diversity).fillna(outlet_diversity.mean())

# 6. Log features
df['Log_MRP'] = np.log1p(df['Item_MRP'])
test['Log_MRP'] = np.log1p(test['Item_MRP'])

df['Log_Visibility'] = np.log1p(df['Item_Visibility'])
test['Log_Visibility'] = np.log1p(test['Item_Visibility'])



# Normalize to 0-1 range
#df['Customer_Outlet_Preference'] = (
#    (df['Customer_Outlet_Preference'] - df['Customer_Outlet_Preference'].min()) /
#    (df['Customer_Outlet_Preference'].max() - df['Customer_Outlet_Preference'].min())
#)
#test['Customer_Outlet_Preference'] = (
#    (test['Customer_Outlet_Preference'] - test['Customer_Outlet_Preference'].min()) /
#    (test['Customer_Outlet_Preference'].max() - test['Customer_Outlet_Preference'].min())
#)

# --- Features & Target ---
#features = [
#    'Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
#    'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
#    'Outlet_Type', 'Outlet_Age', 'Item_Sales_Frequency',
#    'Customer_Outlet_Preference'
#]


features = [
    'Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
    'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
    'Outlet_Type', 'Outlet_Age',
    'Item_Sales_Frequency', 'Customer_Outlet_Preference',
    'Price_per_Weight', 'Visibility_Price_Interaction',
    'Outlet_Age_Type', 'Fat_Type_Interaction',
    'Outlet_Diversity', 'Log_MRP', 'Log_Visibility'
]



X = df[features]
y = df['Item_Outlet_Sales']
X_test_final = test[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

# --- RMSE scorer ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(lambda yt, yp: np.sqrt(mean_squared_error(yt, yp)), greater_is_better=False)

# --- Tuned XGBoost with RandomizedSearchCV ---
xgb_model = xgb.XGBRegressor(random_state=42, n_estimators=500)

param_dist = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5, 7],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'reg_alpha': [0, 0.01, 0.1, 1, 10],
    'reg_lambda': [1, 1.5, 2, 3, 5]
}

random_search = RandomizedSearchCV(
    xgb_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring=rmse_scorer,
    cv=kf,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_
y_pred_xgb = best_xgb.predict(X_val)
rmse_xgb = np.sqrt(mean_squared_error(y_val, y_pred_xgb))

print("\nTuned XGBoost:")
print("Best params:", random_search.best_params_)
print(f"Validation RMSE: {rmse_xgb:.2f}")

# --- Final predictions on test set ---
test_predictions = best_xgb.predict(X_test_final)
submission = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions
})
# submission.to_csv("submission.csv", index=False)

Fitting 5 folds for each of 100 candidates, totalling 500 fits

Tuned XGBoost:
Best params: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0, 'min_child_weight': 3, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 1.0}
Validation RMSE: 1044.13


In [29]:
from sklearn.model_selection import GridSearchCV

# --- Polynomial Regression with Lasso Tuning ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=500))
])

param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}

grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
numerical_features = X_train.select_dtypes(include=[np.number]).columns
grid.fit(X_train[numerical_features], y_train)

best_poly_lasso = grid.best_estimator_
y_pred_poly = best_poly_lasso.predict(X_val[numerical_features])
rmse_poly = np.sqrt(mean_squared_error(y_val, y_pred_poly))
cv_scores = -grid.best_score_

print("\nBest params:", grid.best_params_)
print(f"Polynomial Regression with Lasso RMSE: {rmse_poly:.2f}")
print(f"Polynomial Regression with Lasso CV RMSE: {cv_scores:.2f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best params: {'lasso__alpha': 0.5, 'poly__degree': 2}
Polynomial Regression with Lasso RMSE: 1038.14
Polynomial Regression with Lasso CV RMSE: 1091.63


In [30]:
# --- Final predictions on test set (Poly + Lasso) ---
test_predictions_poly = best_poly_lasso.predict(X_test_final[numerical_features])

submission_poly = pd.DataFrame({
    'Item_Identifier': test['Item_Identifier'],
    'Outlet_Identifier': test['Outlet_Identifier'],
    'Item_Outlet_Sales': test_predictions_poly
})
submission_poly.to_csv("submission_poly.csv", index=False)

NotFittedError: Pipeline is not fitted yet.