# Assignment Notebook Template (Colab-ready)

Use this template to run EDA, preprocessing, and lightweight modeling on a tabular dataset. Set the dataset path/target below and run top-to-bottom. Keep compute light; subsample if needed.

In [None]:
# Optional: install dependencies (uncomment on Colab if needed)
# !pip -q install -r "requirements.txt"

In [None]:
import json
import pathlib
import random
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Single seed for the notebook: it seeds Python's and NumPy's global RNGs
# here and is the value later cells pass as `random_state`.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Shared seaborn styling for every figure drawn below.
sns.set_theme(style="whitegrid")

## Configure data and target
- Update `DATA_PATH` to your CSV.
- Set `TARGET_COL` to the column you want to predict. The template assumes a numeric target (regression); for a categorical target, swap in classification models and metrics.

In [None]:
# Dataset configuration: edit these values for your own data.
DATA_PATH = pathlib.Path("./your_dataset.csv")  # CSV file to load
TARGET_COL = "target_column"  # column to predict
ID_COLS: List[str] = []  # e.g., ["id"] to drop from features

# Fail fast with a real exception instead of `assert` (asserts are skipped
# when Python runs with -O). `is_file()` also rejects a directory path, which
# `exists()` would accept and which would only fail later inside read_csv.
if not DATA_PATH.is_file():
    raise FileNotFoundError(f"CSV not found at {DATA_PATH}.")

In [None]:
# Read the configured CSV, then show a preview followed by the shape, the
# dtypes, and the number of missing values in each column.
df = pd.read_csv(DATA_PATH)

preview = df.head()
display(preview)

print(df.shape)
print(df.dtypes)

missing_counts = df.isna().sum()
print("Missing values per column:\n", missing_counts)

## Column typing (auto, override if needed)

In [None]:
# Validate the target before it is used anywhere, so a typo in TARGET_COL
# fails here with a clear message. Explicit raises are used instead of
# `assert`, which is skipped when Python runs with -O.
if TARGET_COL not in df.columns:
    raise ValueError("Target column not found.")

# ID columns and the target are never features; build the lookup once.
excluded = set(ID_COLS) | {TARGET_COL}

# Split the remaining columns by dtype: object/category/bool are treated as
# categorical, anything numeric as numeric.
cat_cols = [
    c
    for c in df.select_dtypes(include=["object", "category", "bool"]).columns
    if c not in excluded
]
num_cols = [
    c for c in df.select_dtypes(include=["number"]).columns if c not in excluded
]
feature_cols = cat_cols + num_cols

# Columns whose dtype matches neither selector (for example datetime columns)
# would otherwise vanish from the feature set without any notice.
assigned = set(feature_cols)
unassigned = [c for c in df.columns if c not in excluded and c not in assigned]
if unassigned:
    print("Not used as features (dtype not categorical/numeric):", unassigned)

print("Categorical:", cat_cols)
print("Numeric:", num_cols)
if not feature_cols:
    raise ValueError("No feature columns detected; adjust column typing.")

## Univariate visuals

In [None]:
# Histograms of every numeric feature, drawn together in one figure.
if num_cols:
    numeric_frame = df[num_cols]
    numeric_frame.hist(bins=20, figsize=(12, 8))
    plt.tight_layout()
    plt.show()

# One count plot per categorical feature; bars follow value_counts() order.
for col in cat_cols:
    plt.figure(figsize=(6, 3))
    level_order = df[col].value_counts().index
    sns.countplot(data=df, x=col, order=level_order)
    plt.xticks(rotation=30)
    plt.title(f"Distribution of {col}")
    plt.tight_layout()
    plt.show()

## Correlations and target relationships

In [None]:
# Heatmap of pairwise correlations among the numeric features and the target.
if num_cols:
    plt.figure(figsize=(10, 8))
    corr_matrix = df[num_cols + [TARGET_COL]].corr()
    sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", center=0)
    plt.title("Correlation Heatmap")
    plt.show()

# Numeric feature vs. target: scatter points with a red regression line on top.
for col in num_cols:
    plt.figure(figsize=(6, 4))
    feature_values, target_values = df[col], df[TARGET_COL]
    sns.scatterplot(x=feature_values, y=target_values, alpha=0.5)
    sns.regplot(x=feature_values, y=target_values, scatter=False, color="red")
    plt.title(f"{TARGET_COL} vs {col}")
    plt.show()

# Categorical feature vs. target: one box per category level.
for col in cat_cols:
    plt.figure(figsize=(6, 4))
    feature_values, target_values = df[col], df[TARGET_COL]
    sns.boxplot(x=feature_values, y=target_values)
    plt.xticks(rotation=30)
    plt.title(f"{TARGET_COL} by {col}")
    plt.show()

## Split and preprocessing

In [None]:
# Rows without a target value cannot be used for fitting or scoring
# (scikit-learn estimators and metrics raise on NaN targets), so drop them
# before splitting and say how many were removed.
has_target = df[TARGET_COL].notna()
n_dropped = int((~has_target).sum())
if n_dropped:
    print(f"Dropping {n_dropped} rows with missing {TARGET_COL}.")

X = df.loc[has_target, feature_cols].copy()
y = df.loc[has_target, TARGET_COL].copy()

# train/val/test: 70/15/15
# First hold out 30% of the rows, then cut that hold-out in half to get
# equally sized validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, random_state=RANDOM_STATE)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50, random_state=RANDOM_STATE)

# Numeric features: fill missing values with the median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: fill missing values with the most frequent level, then
# one-hot encode; handle_unknown="ignore" keeps transform from failing on
# levels that were not seen during fit.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
preprocess = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, cat_cols),
        ("numeric", numeric_transformer, num_cols),
    ]
)

def regression_metrics(y_true, y_pred) -> Dict[str, float]:
    """Score regression predictions against the true target values.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        A dict with keys ``"mae"``, ``"rmse"`` and ``"r2"``. The RMSE is the
        square root of the mean squared error.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"mae": mae, "rmse": mse ** 0.5, "r2": r2}

## Models (regression baseline and light learners)
- Swap these for classifiers if your target is categorical.

In [None]:
# Candidate regressors, from a mean-predicting baseline up to tree ensembles.
# Every estimator that accepts a seed gets RANDOM_STATE.
models = {
    "dummy_mean": DummyRegressor(strategy="mean"),
    "linear": LinearRegression(),
    "ridge": Ridge(alpha=1.0, random_state=RANDOM_STATE),
    "lasso": Lasso(alpha=0.001, random_state=RANDOM_STATE, max_iter=5000),
    "rf": RandomForestRegressor(n_estimators=200, max_depth=8, random_state=RANDOM_STATE, n_jobs=-1),
    "gbr": GradientBoostingRegressor(random_state=RANDOM_STATE),
}

results = []  # one row of validation metrics per model
trained_models = {}  # model name -> fitted pipeline

for name, model in models.items():
    # Give every pipeline its own copy of the preprocessor. Pipeline stores
    # its steps by reference, so passing `preprocess` directly would make all
    # stored pipelines share one transformer that each fit overwrites.
    pipe = Pipeline(steps=[("preprocess", clone(preprocess)), ("model", model)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_val)
    metrics = regression_metrics(y_val, preds)
    results.append({"model": name, **metrics})
    trained_models[name] = pipe

# Leaderboard sorted by validation MAE, best (lowest) first; the next cell
# picks its first row.
results_df = pd.DataFrame(results).sort_values("mae")
results_df

In [None]:
# Take the first row of the MAE-sorted leaderboard and score that fitted
# pipeline on the held-out test split.
best_model_name = results_df["model"].iloc[0]
best_model = trained_models[best_model_name]

test_preds = best_model.predict(X_test)
test_metrics = regression_metrics(y_test, test_preds)

print(f"Best model: {best_model_name}")
print(json.dumps(test_metrics, indent=2))

## Permutation importance (feature signals)
- Importances are reported per original feature column (before one-hot encoding) to keep interpretation simple.

In [None]:
# Permutation importance of the selected pipeline on the validation split.
# The raw (pre-encoding) frame is passed in, so there is one score per
# original feature column.
perm = permutation_importance(
    best_model,
    X_val,
    y_val,
    n_repeats=5,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

# Pair each feature name with the mean and standard deviation over repeats.
importance_columns = {
    "feature": feature_cols,
    "importance_mean": perm.importances_mean,
    "importance_std": perm.importances_std,
}
importance_df = pd.DataFrame(importance_columns)
importance_df = importance_df.sort_values("importance_mean", ascending=False)

importance_df.head(15)

## Next steps
- Add classification models/metrics if the target is categorical.
- Tune hyperparameters (grid/random search) if compute allows.
- Save figures (e.g., to `figures/`) and update your README with metrics/insights.