## IMPORTS AND DATA LOADING

In [None]:
#pip install catboost

In [None]:
#pip install lightgbm

In [1]:
# --- Core and Utility Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
from datetime import datetime

# --- Preprocessing ---
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

# --- Models ---
# Classification

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
from pathlib import Path
import pandas as pd

DATA_ROOT = Path(r"D:\python\instacart")

files = {
    "orders": DATA_ROOT / "orders.csv",
    "prior": DATA_ROOT / "order_products__prior.csv",
    "train": DATA_ROOT / "order_products__train.csv",
    "products": DATA_ROOT / "products.csv",
    "aisles": DATA_ROOT / "aisles.csv",
    "departments": DATA_ROOT / "departments.csv",
}

# check existence
missing = [k for k, p in files.items() if not p.exists()]
if missing:
    raise FileNotFoundError(f"Missing: {missing}")

orders = pd.read_csv(files["orders"])
prior = pd.read_csv(files["prior"])
train = pd.read_csv(files["train"])
products = pd.read_csv(files["products"])
aisles = pd.read_csv(files["aisles"])
departments = pd.read_csv(files["departments"])

print(orders.shape, prior.shape, train.shape, products.shape)


(3421083, 7) (32434489, 4) (1384617, 4) (49688, 4)


In [3]:
# Add aisle and department names to products
products = products.merge(aisles, on="aisle_id", how="left")
products = products.merge(departments, on="department_id", how="left")

In [4]:
# Merge prior orders with product info
df = prior.merge(products, on="product_id", how="left")

# Merge with order metadata
df = df.merge(orders, on="order_id", how="left")

MemoryError: Unable to allocate 742. MiB for an array with shape (3, 32434489) and data type object

In [None]:
df.head()

In [None]:
# Save memory by converting strings to category
for col in ["product_name", "aisle", "department", "eval_set"]:
    df[col] = df[col].astype("category")

In [None]:
def get_df_info(df):
    """Provides a comprehensive overview of a DataFrame."""
    print("\n\033[1mShape of DataFrame:\033[0m ", df.shape)
    print("\n\033[1mColumns in DataFrame:\033[0m ", df.columns.to_list())
    print("\n\033[1mInformation about DataFrame:\033[0m")
    df.info()
    print("\n\033[1mUnique values per column:\033[0m")
    print(df.nunique())
    print("\n\033[1mNull values:\033[0m\n", df.isnull().sum())
    print("\n\033[1mDuplicate rows:\033[0m ", df.duplicated().sum())
    print("\n\033[1mDescriptive statistics:\033[0m\n", df.describe().transpose())

In [None]:
get_df_info(df)

## Basic EDA

In [None]:
# Count missing values per column
df.isna().sum()

# Missing value imputation 
 Here only column with missing value is 'days_since_prior_order'.
 In this case missing value can occur for a user's first order (no prior order exists).

In [None]:
df["days_since_prior_order"] = df["days_since_prior_order"].fillna(0)

In [None]:
df.isnull().sum()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reorder rate
reorder_rate = df["reordered"].mean()
print(f"Overall reorder rate: {reorder_rate:.2%}")

# Plot
sns.countplot(x="reordered", data=df)
plt.title("Reordered vs Not Reordered")
plt.xlabel("Reordered")
plt.ylabel("Count")
plt.show()


In [None]:
top_reordered = (
    df[df["reordered"] == 1]
    .groupby("product_name")["reordered"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

top_reordered.plot(kind="barh", figsize=(8, 5), title="Top 10 Reordered Products")
plt.xlabel("Reorder Count")
plt.gca().invert_yaxis()
plt.show()


In [None]:
basket_sizes = df.groupby("order_id")["product_id"].count()
sns.histplot(basket_sizes, bins=30, kde=True)
plt.title("Basket Size Distribution")
plt.xlabel("Number of Products per Order")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Orders by day of week and hour

# Day of week
sns.countplot(x="order_dow", data=df)
plt.title("Orders by Day of Week")
plt.xlabel("Day of Week (0=Sunday)")
plt.ylabel("Order Count")
plt.show()

# Hour of day
sns.countplot(x="order_hour_of_day", data=df)
plt.title("Orders by Hour of Day")
plt.xlabel("Hour")
plt.ylabel("Order Count")
plt.show()

In [None]:
# Number of orders per user

user_orders = df.groupby("user_id")["order_number"].max()
sns.histplot(user_orders, bins=30, kde=True)
plt.title("User Order Count Distribution")
plt.xlabel("Total Orders per User")
plt.ylabel("Frequency")
plt.show()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define cohorts by total number of orders per user
user_order_counts = df.groupby("user_id")["order_number"].max()
df["user_cohort"] = df["user_id"].map(
    lambda uid: "Low" if user_order_counts[uid] <= 10 else "Medium" if user_order_counts[uid] <= 30 else "High"
)

# Reorder rate trend by cohort
cohort_trend = df.groupby(["user_cohort", "order_number"])["reordered"].mean().reset_index()

# Plot
plt.figure(figsize=(10, 5))
sns.lineplot(data=cohort_trend, x="order_number", y="reordered", hue="user_cohort")
plt.title("Reorder Rate Over Time by User Cohort")
plt.xlabel("Order Number")
plt.ylabel("Reorder Rate")
plt.grid(True)
plt.show()


In [None]:
# Reorder rate by department
dept_trend = df.groupby("department")["reordered"].mean().sort_values(ascending=False)

# Plot
plt.figure(figsize=(10, 6))
sns.barplot(x=dept_trend.values, y=dept_trend.index, palette="viridis")
plt.title("Average Reorder Rate by Department")
plt.xlabel("Reorder Rate")
plt.ylabel("Department")
plt.show()


In [None]:
# Average days between orders by department
dept_days = df.groupby("department")["days_since_prior_order"].mean().sort_values()

plt.figure(figsize=(10, 6))
sns.barplot(x=dept_days.values, y=dept_days.index, palette="mako")
plt.title("Avg Days Between Orders by Department")
plt.xlabel("Days")
plt.ylabel("Department")
plt.show()


In [None]:
# Popular products during weekends (Saturday & Sunday)

# Filter for weekend orders (0=Sunday, 6=Saturday)
weekend_df = df[df["order_dow"].isin([0, 6])]

# Top 10 products on weekends
top_weekend_products = (
    weekend_df["product_name"]
    .value_counts()
    .head(10)
)

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
sns.barplot(x=top_weekend_products.values, y=top_weekend_products.index, palette="coolwarm")
plt.title("Top 10 Products Ordered on Weekends")
plt.xlabel("Order Count")
plt.ylabel("Product Name")
plt.show()


In [None]:
# Popular products during evening or late night

# Define evening/night hours (e.g., 18:00 to 23:00)
evening_df = df[df["order_hour_of_day"].between(18, 23)]

# Top 10 products during evening/night
top_evening_products = (
    evening_df["product_name"]
    .value_counts()
    .head(10)
)

# Plot
plt.figure(figsize=(8, 5))
sns.barplot(x=top_evening_products.values, y=top_evening_products.index, palette="magma")
plt.title("Top 10 Products Ordered in Evening/Night")
plt.xlabel("Order Count")
plt.ylabel("Product Name")
plt.show()
