<a href="https://colab.research.google.com/github/yeabwang/ML_zoom_camp/blob/main/week_2_regression_notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [311]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

# ====================================
# 1. LOAD DATA
# ====================================
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-02-car-price/data.csv'
df = pd.read_csv(data)

# ====================================
# 2. DATA CLEANING
# ====================================
# Clean column names
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Clean string values
strings = list(df.dtypes[df.dtypes == 'object'].index)
for col in strings:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# ====================================
# 3. TRAIN/VAL/TEST SPLIT (60/20/20)
# ====================================
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test

# Shuffle data
idx = np.arange(n)
np.random.seed(2)
np.random.shuffle(idx)

# Split data
df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]

# Reset indices
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Extract target variable (log transform)
y_train = np.log1p(df_train.msrp.values)
y_val = np.log1p(df_val.msrp.values)
y_test = np.log1p(df_test.msrp.values)

# Remove target from features
del df_train['msrp']
del df_val['msrp']
del df_test['msrp']

# ====================================
# 4. DEFINE BASE FEATURES & CATEGORICAL MAPPING
# ====================================
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']

categorical_columns = [
    'make', 'model', 'engine_fuel_type', 'driven_wheels',
    'market_category', 'vehicle_size', 'vehicle_style'
]

# Get top 5 values for each categorical column (from training set only)
categorical = {}
for c in categorical_columns:
    categorical[c] = list(df_train[c].value_counts().head().index)

# ====================================
# 5. FEATURE PREPARATION FUNCTION
# ====================================
def prepare_X(df):
    df = df.copy()

    # Create age feature
    df['age'] = 2017 - df['year']
    features = base + ['age']

    # Create door indicator features
    for v in [2, 3, 4]:
        df['num_doors_%d' % v] = (df.number_of_doors == v).astype(int)
        features.append('num_doors_%d' % v)

    # Create categorical features (one-hot encoding)
    for name, values in categorical.items():
        for value in values:
            df['%s_%s' % (name, value)] = (df[name] == value).astype(int)
            features.append('%s_%s' % (name, value))

    # Get feature matrix
    df_num = df[features]
    df_num = df_num.fillna(0)
    X = df_num.values

    return X

# ====================================
# 6. MODEL TRAINING FUNCTION (WITH REGULARIZATION)
# ====================================
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)

    return w_full[0], w_full[1:]

# ====================================
# 7. EVALUATION FUNCTION (RMSE)
# ====================================
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

# ====================================
# 8. HYPERPARAMETER TUNING
# ====================================
print("Tuning regularization parameter:")
for r in [0.0, 0.00001, 0.0001, 0.001, 0.1, 1, 10]:
    X_train = prepare_X(df_train)
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)

    X_val = prepare_X(df_val)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)

    print(f"r={r:7.5f}, RMSE={score:.4f}")

# ====================================
# 9. FINAL MODEL TRAINING (TRAIN + VAL)
# ====================================
# Combine train and validation
df_full_train = pd.concat([df_train, df_val])
df_full_train = df_full_train.reset_index(drop=True)
y_full_train = np.concatenate([y_train, y_val])

# Train on combined data
X_full_train = prepare_X(df_full_train)
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=0.001)

# ====================================
# 10. FINAL EVALUATION ON TEST SET
# ====================================
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
final_score = rmse(y_test, y_pred)

print(f"\nFinal Test RMSE: {final_score:.4f}")

# ====================================
# 11. PREDICTION ON SINGLE EXAMPLE
# ====================================
car = df_test.iloc[20].to_dict()
df_small = pd.DataFrame([car])
X_small = prepare_X(df_small)

y_pred = w0 + X_small.dot(w)
predicted_price = np.expm1(y_pred[0])
actual_price = np.expm1(y_test[20])

print(f"\nPrediction Example:")
print(f"Predicted price: ${predicted_price:,.2f}")
print(f"Actual price: ${actual_price:,.2f}")

Tuning regularization parameter:
r=0.00000, RMSE=53.7037
r=0.00001, RMSE=0.4608
r=0.00010, RMSE=0.4608
r=0.00100, RMSE=0.4608
r=0.10000, RMSE=0.4609
r=1.00000, RMSE=0.4616
r=10.00000, RMSE=0.4726

Final Test RMSE: 0.4601

Prediction Example:
Predicted price: $41,459.34
Actual price: $35,000.00
