In [None]:
# Cell 1: imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Notebook-wide defaults: white-grid seaborn styling for every plot, and no
# cap on how many DataFrame columns pandas shows when rendering a frame.
sns.set(style="whitegrid")
pd.options.display.max_columns = None


In [None]:
# Cell 2: load data (adjust path if needed)
# The path is relative to the notebook's working directory; the original note
# says the notebook is placed in notebooks/, so change it if your layout differs.
data_path = "../mnt/data/laptop_data.csv"
df = pd.read_csv(data_path)

# Quick sanity checks: dimensions, a peek at the first rows, null counts.
missing_per_column = df.isna().sum()
print("Shape:", df.shape)
display(df.head())
print("\nMissing values per column:\n", missing_per_column)


In [None]:
# Cell 3: clean / drop identifier columns
# Remove exact duplicate rows and renumber the index from 0.
df = df.drop_duplicates().reset_index(drop=True)

# Drop identifier-like columns (only if they are present in the dataset);
# errors='ignore' makes pandas skip any listed label that does not exist.
identifier_cols = ['SKU', 'Model']
df = df.drop(columns=identifier_cols, errors='ignore')

print("After dropping identifiers, shape:", df.shape)


In [None]:
# Cell 4: target summary
target = "Price_INR"
print("Target exists:", target in df.columns)
display(df[target].describe())

# Optional: price is often right-skewed, so keep a log1p copy of the target
# for modelling and for the side-by-side comparison below.
df['log_price'] = np.log1p(df[target])

# Two panels on one row: raw price on the left, log-transformed on the right.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(10, 4))
sns.histplot(df[target], kde=True, ax=ax_raw)
ax_raw.set_title("Price_INR distribution")
sns.histplot(df['log_price'], kde=True, ax=ax_log)
ax_log.set_title("log(1+Price_INR) distribution")
fig.tight_layout()
plt.show()


In [None]:
# Cell 5: choose features (exclude target & newly created log)
# Every remaining column is a candidate predictor; adjust `exclude` to match
# the columns present in your dataset.
exclude = [target, 'log_price']
features = [c for c in df.columns if c not in exclude]

# Small feature engineering: encode Touchscreen Yes/No -> 1/0 if present.
if 'Touchscreen' in df.columns:
    # Normalise case and surrounding whitespace before mapping so values such
    # as " Yes" are recognised; anything that is not yes/no becomes 0.
    df['Touchscreen_flag'] = (
        df['Touchscreen'].astype(str).str.strip().str.lower()
        .map({'yes': 1, 'no': 0})
        .fillna(0)
    )
    # Swap the raw column for the flag. 'Touchscreen_flag' is filtered out too
    # because on a re-run of this cell it already exists in df.columns, and
    # appending it again would put a duplicate column into the feature list.
    features = [
        f for f in features if f not in ('Touchscreen', 'Touchscreen_flag')
    ] + ['Touchscreen_flag']

# Printed last so the list shown is the one actually used downstream.
print("Candidate features:", features)


In [None]:
# Cell 6: detect numeric and categorical features automatically
# Columns stored as int64/float64 are treated as numeric; every other
# selected feature is handled as categorical.
numeric_dtypes = ['int64', 'float64']
num_cols = list(df[features].select_dtypes(include=numeric_dtypes).columns)
cat_cols = [name for name in features if name not in num_cols]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)


In [None]:
# Cell 7: pipeline setup
# Numeric features are standardised; categorical features are one-hot encoded.
num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name. Try the current spelling
# first and fall back so the notebook runs on both old and new releases.
try:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    dense_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

cat_pipeline = Pipeline([
    ('ohe', dense_ohe)
])

# Output column order is: scaled numeric columns, then one-hot columns.
# Columns not listed in num_cols/cat_cols are dropped.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
], remainder='drop')

# NOTE(review): there is no imputation step here; confirm the selected
# features and the target contain no missing values before training.

# train-test split
X = df[features].copy()
y = df['log_price']  # model the log1p-transformed price
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# full model pipeline: preprocessing + random forest, fitted as one estimator
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1))
])

print("Training model (this may take a moment)...")
model.fit(X_train, y_train)
print("Training done.")


In [None]:
# Cell 8: evaluation on test set (report RMSE on original price scale)
# The model predicts log1p(price); expm1 inverts that for both predictions and
# ground truth so the metrics are expressed in INR.
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)   # inverse of log1p

y_test_orig = np.expm1(y_test)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
# Taking the square root of the MSE gives the same RMSE on every version.
rmse = float(np.sqrt(mean_squared_error(y_test_orig, y_pred)))
r2 = r2_score(y_test_orig, y_pred)
print(f"RMSE (on original INR scale): {rmse:.2f}")
print(f"R2 (on original INR scale): {r2:.4f}")


In [None]:
# Cell 9: feature importances mapped back to readable feature names
# The forest only sees the ColumnTransformer output, so names are rebuilt in
# that same order: numeric columns first, then the one-hot expanded columns.
preprocessor = model.named_steps['preprocessor']
reg = model.named_steps['regressor']

if cat_cols:
    fitted_ohe = preprocessor.named_transformers_['cat'].named_steps['ohe']
    if hasattr(fitted_ohe, 'get_feature_names_out'):
        # sklearn >= 1.0
        cat_features = fitted_ohe.get_feature_names_out(cat_cols).tolist()
    else:
        # Older sklearn: build "<column>_<category>" names from the fitted
        # categories, one name per one-hot output column.
        cat_features = [
            f"{col}_{cat}"
            for col, cats in zip(cat_cols, fitted_ohe.categories_)
            for cat in cats
        ]
else:
    # No categorical columns were selected, so there is nothing to expand.
    cat_features = []

num_features = list(num_cols)
feature_names = num_features + cat_features

importances = reg.feature_importances_
# Safe-guard: if the names do not line up one-to-one with the importances,
# pairing them anyway would attach importances to the wrong features. Use
# positional placeholders instead so the table is never mislabeled.
if len(feature_names) != len(importances):
    print(
        "Warning: couldn't expand categorical feature names precisely - "
        f"got {len(feature_names)} names for {len(importances)} importances; "
        "using positional names."
    )
    feature_names = [f"feature_{i}" for i in range(len(importances))]

feat_imp = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)

display(feat_imp.head(30))

# Plot top 20
plt.figure(figsize=(8,6))
sns.barplot(data=feat_imp.head(20), x='importance', y='feature')
plt.title("Top 20 Feature Importances (RandomForest on log_price)")
plt.tight_layout()
plt.show()


In [None]:
# Cell 10: brand average price analysis (original price scale)
if 'Brand' not in df.columns:
    print("Column 'Brand' not present in dataset.")
else:
    # Per-brand listing count plus mean/median price, most expensive first.
    brand_avg = (
        df.groupby('Brand')['Price_INR']
        .agg(['count', 'mean', 'median'])
        .reset_index()
        .sort_values('mean', ascending=False)
    )
    display(brand_avg.head(20))

    # Bar chart of the 15 brands with the highest mean price.
    top_brands = brand_avg.head(15)
    plt.figure(figsize=(10, 5))
    sns.barplot(data=top_brands, x='Brand', y='mean')
    plt.xticks(rotation=45)
    plt.title("Top 15 Brands by Average Price (mean)")
    plt.show()


In [None]:
# Cell 11: save results (feature importances and brand averages)
# Creates the output directory (and any missing parents) if needed.
out_dir = Path("../artifacts/analysis")
out_dir.mkdir(parents=True, exist_ok=True)

# Track what was really written so the final message never claims a file
# that was skipped.
saved_files = ["feature_importances.csv"]
feat_imp.to_csv(out_dir / "feature_importances.csv", index=False)

# brand_avg is only created when the dataset has a 'Brand' column.
if 'Brand' in df.columns:
    brand_avg.to_csv(out_dir / "brand_average_price.csv", index=False)
    saved_files.append("brand_average_price.csv")

print(f"Saved {' and '.join(saved_files)} in artifacts/analysis/")
