In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

In [2]:
# upload train dataset
from google.colab import files

# Open file picker
uploaded = files.upload()
train_df = pd.read_csv("train_v9rqX0R.csv")

Saving train_v9rqX0R.csv to train_v9rqX0R.csv


In [3]:
# upload test dataset
from google.colab import files

# Open file picker
uploaded = files.upload()
valid_df = pd.read_csv("test_AbJTz2l.csv")

Saving test_AbJTz2l.csv to test_AbJTz2l.csv


In [13]:
# --- Load data ---
# df = pd.read_csv("train.csv")
# test = pd.read_csv("test.csv")

# 1) Standardize Item_Fat_Content
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})
valid_df['Item_Fat_Content'] = valid_df['Item_Fat_Content'].replace({
    'low fat': 'Low Fat',
    'LF': 'Low Fat',
    'reg': 'Regular'
})

# 2) Handle missing values in Outlet_Size
size_map = {
    'Grocery Store': 'Small',
    'Supermarket Type2': 'Medium',
    'Supermarket Type3': 'Medium'
}
train_df['Outlet_Size'] = train_df['Outlet_Size'].fillna(train_df['Outlet_Type'].map(size_map))
valid_df['Outlet_Size'] = valid_df['Outlet_Size'].fillna(valid_df['Outlet_Type'].map(size_map))

size_map2 = {'Tier 2': 'Small'}
train_df['Outlet_Size'] = train_df['Outlet_Size'].fillna(train_df['Outlet_Location_Type'].map(size_map2))
valid_df['Outlet_Size'] = valid_df['Outlet_Size'].fillna(valid_df['Outlet_Location_Type'].map(size_map2))

# 3) Handle missing values in Item_Weight
train_df['Item_Weight'] = train_df['Item_Weight'].fillna(train_df.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))
valid_df['Item_Weight'] = valid_df['Item_Weight'].fillna(valid_df.groupby(['Item_Identifier'])['Item_Weight'].transform('mean'))

#train_df['Item_Weight'] = train_df.groupby('Item_Type')['Item_Weight'].transform('mean')
#valid_df['Item_Weight'] = valid_df.groupby('Item_Type')['Item_Weight'].transform('mean')

train_df['Item_Weight'] = train_df['Item_Weight'].fillna(train_df.groupby('Item_Type')['Item_Weight'].transform('mean'))
valid_df['Item_Weight'] = valid_df['Item_Weight'].fillna(valid_df.groupby('Item_Type')['Item_Weight'].transform('mean'))

# 4) Handle zeros in Item_Visibility
train_df['Item_Visibility'] = train_df.groupby('Item_Identifier')['Item_Visibility'].transform(
    lambda x: x.replace(0, x.mean())
)
valid_df['Item_Visibility'] = valid_df.groupby('Item_Identifier')['Item_Visibility'].transform(
    lambda x: x.replace(0, x.mean())
)
# Fill remaining zeros in test
valid_df['Item_Visibility'] = valid_df.groupby(['Item_Type', 'Item_Fat_Content'])['Item_Visibility'].transform('mean')

# --- Additional Feature: Outlet_Age ---
train_df['Outlet_Age'] = 2025 - train_df['Outlet_Establishment_Year']
valid_df['Outlet_Age'] = 2025 - valid_df['Outlet_Establishment_Year']

# --- Manual Encoding ---
# Item_Fat_Content → 1,2
train_df['Item_Fat_Content'] = train_df['Item_Fat_Content'].replace({'Low Fat': 1, 'Regular': 2})
valid_df['Item_Fat_Content'] = valid_df['Item_Fat_Content'].replace({'Low Fat': 1, 'Regular': 2})

# Outlet_Size → 1,2,3
train_df['Outlet_Size'] = train_df['Outlet_Size'].replace({'Small': 1, 'Medium': 2, 'High': 3})
valid_df['Outlet_Size'] = valid_df['Outlet_Size'].replace({'Small': 1, 'Medium': 2, 'High': 3})

# Outlet_Location_Type → 1,2,3
train_df['Outlet_Location_Type'] = train_df['Outlet_Location_Type'].replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})
valid_df['Outlet_Location_Type'] = valid_df['Outlet_Location_Type'].replace({'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3})

# Outlet_Type → 1,2,3,4
outlet_type_map = {
    'Grocery Store': 1,
    'Supermarket Type1': 2,
    'Supermarket Type2': 3,
    'Supermarket Type3': 4
}
train_df['Outlet_Type'] = train_df['Outlet_Type'].replace(outlet_type_map)
valid_df['Outlet_Type'] = valid_df['Outlet_Type'].replace(outlet_type_map)

# --- Encode Item_Type as 1,2,3,... ---
train_df['Item_Type'], uniques = pd.factorize(train_df['Item_Type'])
valid_df['Item_Type'] = pd.Categorical(valid_df['Item_Type'], categories=uniques).codes

train_df['Item_Type'] = train_df['Item_Type'] + 1
valid_df['Item_Type'] = valid_df['Item_Type'] + 1

# --- New Feature: Item_Sales_Frequency ---
# xgboost - 1041
#train_df['Item_Sales_Frequency'] = train_df['Outlet_Age'] * (train_df['Item_MRP'] - train_df['Item_Visibility'])/(train_df['Item_Weight'] + 1)
#valid_df['Item_Sales_Frequency'] = valid_df['Outlet_Age'] * (valid_df['Item_MRP'] - valid_df['Item_Visibility'])/(valid_df['Item_Weight'] + 1)

# polynomial regression - 1039
#item_popularity = train_df['Item_Identifier'].value_counts(normalize=True)  # normalized frequency
#train_df['Item_Popularity'] = train_df['Item_Identifier'].map(item_popularity)
#valid_df['Item_Popularity'] = valid_df['Item_Identifier'].map(item_popularity).fillna(0)  # unseen items → 0

# --- 2. Item_Sales_Frequency ---
#train_df['Item_Sales_Frequency'] = (
#    np.log1p(train_df['Outlet_Age']) * (train_df['Item_MRP'] / (train_df['Item_Weight'] + 1)) * train_df['Item_Popularity']
#)

#valid_df['Item_Sales_Frequency'] = (
#    np.log1p(valid_df['Outlet_Age']) * (valid_df['Item_MRP'] / (valid_df['Item_Weight'] + 1)) * valid_df['Item_Popularity']
#)

### polynomial regression - 1038.26
item_popularity = train_df['Item_Identifier'].value_counts(normalize=True)  # normalized frequency

train_df['Item_Popularity'] = train_df['Item_Identifier'].map(item_popularity)
valid_df['Item_Popularity'] = valid_df['Item_Identifier'].map(item_popularity).fillna(0)  # unseen items → 0

train_df['Item_Sales_Frequency'] = (
    np.log1p(train_df['Outlet_Age']) *
    ((train_df['Item_MRP'] - train_df['Item_MRP'].mean()) / (train_df['Item_MRP'].std() + 1)) *
    (train_df['Item_Popularity'] + 0.01)  # smoothing
)

valid_df['Item_Sales_Frequency'] = (
    np.log1p(valid_df['Outlet_Age']) *
    ((valid_df['Item_MRP'] - valid_df['Item_MRP'].mean()) / (valid_df['Item_MRP'].std() + 1)) *
    (valid_df['Item_Popularity'] + 0.01)
)




# --- Handle Inf / NaN in Item_Sales_Frequency ---
train_df['Item_Sales_Frequency'].replace([np.inf, -np.inf], np.nan, inplace=True)
valid_df['Item_Sales_Frequency'].replace([np.inf, -np.inf], np.nan, inplace=True)
train_df['Item_Sales_Frequency'].fillna(train_df['Item_Sales_Frequency'].mean(), inplace=True)
valid_df['Item_Sales_Frequency'].fillna(valid_df['Item_Sales_Frequency'].mean(), inplace=True)



# --- New Feature: Customer Outlet Preference ---
outlet_type_sales = train_df.groupby('Outlet_Type')['Item_Outlet_Sales'].sum()
outlet_type_percentage = outlet_type_sales / outlet_type_sales.sum()

train_df['Outlet_Type_Percentage'] = train_df['Outlet_Type'].map(outlet_type_percentage)
valid_df['Outlet_Type_Percentage'] = valid_df['Outlet_Type'].map(outlet_type_percentage)

# xgboost - 1041
#train_df['Customer_Outlet_Preference'] = (
#    train_df['Item_MRP'] * train_df['Outlet_Type_Percentage'] / (train_df['Item_Weight']+1)*(train_df['Item_Visibility']+1)
#)
#valid_df['Customer_Outlet_Preference'] = (
#    valid_df['Item_MRP'] * valid_df['Outlet_Type_Percentage'] / (valid_df['Item_Weight']+1)*(valid_df['Item_Visibility']+1)
#)

# polynomial regression - 1039
#median_mrp = df['Item_MRP'].median()

#train_df['Customer_Outlet_Preference'] = (
#    ((train_df['Item_MRP'] / median_mrp) ** 0.5) *
#    np.exp(-train_df['Item_Visibility']) *
#    train_df['Outlet_Type_Percentage']
#)

#valid_df['Customer_Outlet_Preference'] = (
#    ((valid_df['Item_MRP'] / median_mrp) ** 0.5) *
#    np.exp(-valid_df['Item_Visibility']) *
#    valid_df['Outlet_Type_Percentage']
#)

### polynomial regression - 1038.26
median_mrp = train_df['Item_MRP'].median()

train_df['Customer_Outlet_Preference'] = (
    np.sqrt(train_df['Item_MRP'] / median_mrp) *
    (1 / (1 + np.log1p(train_df['Item_Visibility']))) *   # softer visibility penalty
    (train_df['Outlet_Type_Percentage'] * (1 / train_df['Outlet_Location_Type']))
)

valid_df['Customer_Outlet_Preference'] = (
    np.sqrt(valid_df['Item_MRP'] / median_mrp) *
    (1 / (1 + np.log1p(valid_df['Item_Visibility']))) *
    (valid_df['Outlet_Type_Percentage'] * (1 / valid_df['Outlet_Location_Type']))
)



# --- New Features ---
# polynomial regression + above equations + these equations - 1038.14
# 1. Price per weight
train_df['Price_per_Weight'] = train_df['Item_MRP'] / (train_df['Item_Weight'] + 1)
valid_df['Price_per_Weight'] = valid_df['Item_MRP'] / (valid_df['Item_Weight'] + 1)

# 2. Visibility × Price interaction
train_df['Visibility_Price_Interaction'] = train_df['Item_Visibility'] * train_df['Item_MRP']
valid_df['Visibility_Price_Interaction'] = valid_df['Item_Visibility'] * valid_df['Item_MRP']

# 3. Outlet Age × Outlet Type
train_df['Outlet_Age_Type'] = train_df['Outlet_Age'] * train_df['Outlet_Type']
valid_df['Outlet_Age_Type'] = valid_df['Outlet_Age'] * valid_df['Outlet_Type']

# 4. Fat × Type interaction
train_df['Fat_Type_Interaction'] = train_df['Item_Fat_Content'] * train_df['Item_Type']
valid_df['Fat_Type_Interaction'] = valid_df['Item_Fat_Content'] * valid_df['Item_Type']

# 5. Outlet diversity
outlet_diversity = train_df.groupby('Outlet_Identifier')['Item_Identifier'].nunique()
train_df['Outlet_Diversity'] = train_df['Outlet_Identifier'].map(outlet_diversity)
valid_df['Outlet_Diversity'] = valid_df['Outlet_Identifier'].map(outlet_diversity)
valid_df['Outlet_Diversity'].fillna(outlet_diversity.mean(), inplace=True)

# 6. Log features
train_df['Log_MRP'] = np.log1p(train_df['Item_MRP'])
valid_df['Log_MRP'] = np.log1p(valid_df['Item_MRP'])

train_df['Log_Visibility'] = np.log1p(train_df['Item_Visibility'])
valid_df['Log_Visibility'] = np.log1p(valid_df['Item_Visibility'])



# Normalize to 0-1 range
#train_df['Customer_Outlet_Preference'] = (
#    (train_df['Customer_Outlet_Preference'] - train_df['Customer_Outlet_Preference'].min()) /
#    (train_df['Customer_Outlet_Preference'].max() - train_df['Customer_Outlet_Preference'].min())
#)
#valid_df['Customer_Outlet_Preference'] = (
#    (valid_df['Customer_Outlet_Preference'] - valid_df['Customer_Outlet_Preference'].min()) /
#    (valid_df['Customer_Outlet_Preference'].max() - valid_df['Customer_Outlet_Preference'].min())
#)

# --- Features & Target ---
#features = [
#    'Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
#    'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
#    'Outlet_Type', 'Outlet_Age', 'Item_Sales_Frequency',
#    'Customer_Outlet_Preference'
#]


features = [
    'Item_Weight', 'Item_Fat_Content', 'Item_Type', 'Item_MRP',
    'Item_Visibility', 'Outlet_Size', 'Outlet_Location_Type',
    'Outlet_Type', 'Outlet_Age',
    'Item_Sales_Frequency', 'Customer_Outlet_Preference',
    'Price_per_Weight', 'Visibility_Price_Interaction',
    'Outlet_Age_Type', 'Fat_Type_Interaction',
    'Outlet_Diversity', 'Log_MRP', 'Log_Visibility'
]



X = train_df[features]
y = train_df['Item_Outlet_Sales']
X_test_final = valid_df[features]

# --- Train/Validation Split ---
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [36]:
from sklearn.model_selection import GridSearchCV

# --- Polynomial Regression with Lasso Tuning ---
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scorer = make_scorer(
    lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
    greater_is_better=False
)

poly_lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lasso', Lasso(max_iter=500))
])

param_grid = {
    'poly__degree': [2, 3],
    'lasso__alpha': [0.001, 0.01, 0.05, 0.1, 0.5]
}

grid = GridSearchCV(poly_lasso, param_grid, cv=kf, scoring=rmse_scorer, n_jobs=-1, verbose=1)
numerical_features = X_train.select_dtypes(include=[np.number]).columns
grid.fit(X_train[numerical_features], y_train)

best_poly_lasso = grid.best_estimator_
y_pred_poly = best_poly_lasso.predict(X_val[numerical_features])
rmse_poly = np.sqrt(mean_squared_error(y_val, y_pred_poly))
cv_scores = -grid.best_score_

print("\nBest params:", grid.best_params_)
print(f"Polynomial Regression with Lasso RMSE: {rmse_poly:.2f}")
print(f"Polynomial Regression with Lasso CV RMSE: {cv_scores:.2f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best params: {'lasso__alpha': 0.5, 'poly__degree': 2}
Polynomial Regression with Lasso RMSE: 1038.14
Polynomial Regression with Lasso CV RMSE: 1091.63


In [37]:
# --- Predict on full validation dataset ---
X_valid = valid_df[features]
numerical_features_valid = X_valid.select_dtypes(include=[np.number]).columns
y_pred_valid = best_poly_lasso.predict(X_valid[numerical_features_valid])

# --- Save predictions to CSV ---
submission_poly = valid_df[['Item_Identifier', 'Outlet_Identifier']].copy()
submission_poly['Item_Outlet_Sales'] = y_pred_valid

# Ensure no negative predictions
submission_poly['Item_Outlet_Sales'] = submission_poly['Item_Outlet_Sales'].clip(lower=0)

submission_poly.to_csv("validation_predictions.csv", index=False)

# --- Print first few predictions ---
print(submission_poly.head())

  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1693.817586
1           FDW14            OUT017        1404.158092
2           NCN55            OUT010         539.290492
3           FDQ58            OUT017        2506.617873
4           FDY38            OUT027        6292.848042


In [41]:
# --- Predict on full validation dataset using numerical features from training ---
X_valid = valid_df[features]
numerical_features_train = X_train.select_dtypes(include=[np.number]).columns  # same as training

# Predict
y_pred_valid = best_poly_lasso.predict(X_valid[numerical_features_train])

# --- Save predictions to CSV ---
submission_poly = valid_df[['Item_Identifier', 'Outlet_Identifier']].copy()
submission_poly['Item_Outlet_Sales'] = y_pred_valid

# Ensure no negative predictions
submission_poly['Item_Outlet_Sales'] = submission_poly['Item_Outlet_Sales'].clip(lower=0)

# Save CSV
submission_poly.to_csv("validation_predictions_1038.csv", index=False)

# Print first few predictions
print(submission_poly.head())

  Item_Identifier Outlet_Identifier  Item_Outlet_Sales
0           FDW58            OUT049        1693.817586
1           FDW14            OUT017        1404.158092
2           NCN55            OUT010         539.290492
3           FDQ58            OUT017        2506.617873
4           FDY38            OUT027        6292.848042
