# 00 — Setup & Dataset Sanity Check

**Goal:** Make sure your environment works, load the dataset, and confirm shapes and basic stats.


In [2]:
# Environment smoke test: if every import below succeeds, the core libraries are
# installed. The versions are printed so the run can be reproduced later.
import sys
import numpy as np
import pandas as pd
import sklearn
import matplotlib

print(
    f"Python {sys.version}",
    f"NumPy {np.__version__}",
    f"Pandas {pd.__version__}",
    f"scikit-learn {sklearn.__version__}",
    f"Matplotlib {matplotlib.__version__}",
    sep="\n",
)


Python 3.13.3 (main, Apr  8 2025, 13:54:08) [Clang 17.0.0 (clang-1700.0.13.3)]
NumPy 2.3.3
Pandas 2.3.3
scikit-learn 1.7.2
Matplotlib 3.10.6


In [3]:
import warnings
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_california_housing, make_regression
from sklearn.model_selection import train_test_split

def load_regression_data(random_state=42):
    """Load a regression dataset and return ``(X, y, feature_names)``.

    California Housing is tried first. If fetching it (or unpacking its frame)
    raises any ``Exception`` -- for example when working offline -- a warning is
    emitted and a synthetic ``make_regression`` problem with 5000 samples and
    8 features is generated instead.

    Parameters
    ----------
    random_state : int, default 42
        Seed passed to ``make_regression``. It is only used on the synthetic
        fallback path.

    Returns
    -------
    X
        Feature values, one column per entry of ``feature_names``.
    y
        Target values.
    feature_names : list of str
        The dataset's column names, or ``"x0"``, ``"x1"``, ... for the
        synthetic fallback.
    """
    target_col = "MedHouseVal"
    try:
        housing = fetch_california_housing(as_frame=True)
        frame = housing.frame.copy()
        # Separate the features once and derive both the matrix and the names from them.
        features = frame.drop(columns=[target_col])
        X = features.values
        y = frame[target_col].values
        feature_names = list(features.columns)
    except Exception as e:
        warnings.warn(f"California Housing fetch failed: {e}. Falling back to synthetic make_regression.")
        X, y = make_regression(
            n_samples=5000,
            n_features=8,
            n_informative=6,
            noise=8.5,
            random_state=random_state,
        )
        feature_names = [f"x{i}" for i in range(X.shape[1])]
    return X, y, feature_names

def train_val_test_split(X, y, random_state=42, val_size=0.2, test_size=0.2):
    """Split ``X`` and ``y`` into train, validation and test partitions.

    The split is done in two passes of ``train_test_split``: the combined
    validation+test hold-out is carved off first, then that hold-out is divided
    into validation and test. With the defaults this is a 60/20/20 split.

    Parameters
    ----------
    X, y
        Features and targets, passed straight to ``train_test_split``.
    random_state : int, default 42
        Seed used for both passes.
    val_size : float, default 0.2
        Fraction of the full dataset assigned to the validation partition.
    test_size : float, default 0.2
        Fraction of the full dataset assigned to the test partition.

    Returns
    -------
    tuple
        ``(X_train, y_train), (X_val, y_val), (X_test, y_test)``.

    Raises
    ------
    ValueError
        If ``val_size`` or ``test_size`` is not positive, or if together they
        leave nothing for training.
    """
    holdout = val_size + test_size
    if not (val_size > 0 and test_size > 0 and holdout < 1):
        raise ValueError(
            "val_size and test_size must be positive fractions summing to less than 1; "
            f"got val_size={val_size}, test_size={test_size}"
        )
    # Pass 1: train vs. everything held out (0.4 with the defaults).
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout, random_state=random_state
    )
    # Pass 2: the test share is expressed relative to the hold-out (0.5 with the defaults).
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=test_size / holdout, random_state=random_state
    )
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

def rmse(y_true, y_pred):
    """Root-mean-squared error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions. Lists and other array-likes are accepted and
        converted to float arrays.

    Returns
    -------
    float
        ``sqrt(mean((y_true - y_pred) ** 2))``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # An (n,) target against an (n, 1) prediction would broadcast to an (n, n)
    # matrix and silently give a wrong score. When both hold the same number of
    # elements, line the prediction up with the target elementwise instead.
    if actual.shape != predicted.shape and actual.size == predicted.size:
        predicted = predicted.reshape(actual.shape)
    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

def mae(y_true, y_pred):
    """Mean absolute error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions. Lists and other array-likes are accepted and
        converted to float arrays.

    Returns
    -------
    float
        ``mean(abs(y_true - y_pred))``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # An (n,) target against an (n, 1) prediction would broadcast to an (n, n)
    # matrix and silently give a wrong score. When both hold the same number of
    # elements, line the prediction up with the target elementwise instead.
    if actual.shape != predicted.shape and actual.size == predicted.size:
        predicted = predicted.reshape(actual.shape)
    return float(np.mean(np.abs(actual - predicted)))

def r2(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions. Lists and other array-likes are accepted and
        converted to float arrays.

    Returns
    -------
    float
        The R^2 score. When ``y_true`` is constant (``SS_tot == 0``) the ratio
        is undefined; the function then returns 1.0 for a perfect prediction
        and 0.0 otherwise, instead of ``nan``/``-inf``.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # An (n,) target against an (n, 1) prediction would broadcast to an (n, n)
    # matrix and silently give a wrong score. When both hold the same number of
    # elements, line the prediction up with the target elementwise instead.
    if actual.shape != predicted.shape and actual.size == predicted.size:
        predicted = predicted.reshape(actual.shape)
    ss_res = np.sum((actual - predicted) ** 2)
    ss_tot = np.sum((actual - np.mean(actual)) ** 2)
    # Constant targets: avoid dividing by zero. Empty input is left to fall
    # through to the plain formula, as before.
    if ss_tot == 0 and actual.size > 0:
        return 1.0 if ss_res == 0 else 0.0
    return float(1 - ss_res / ss_tot)


In [5]:
# Fetch the dataset, then carve out the train / validation / test partitions.
X, y, feature_names = load_regression_data()
(X_train, y_train), (X_val, y_val), (X_test, y_test) = train_val_test_split(X, y)

print("Shapes:", X_train.shape, X_val.shape, X_test.shape)
print("Target stats:", y_train.mean(), y_train.std())

# Preview the first five training rows: named feature columns plus the target.
train_df = pd.DataFrame(X_train, columns=feature_names).assign(target=y_train)
print(train_df.head())

Shapes: (12384, 8) (4128, 8) (4128, 8)
Target stats: 2.0633966957364342 1.1518306473785487
   MedInc  HouseAge   AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  5.8735      35.0   5.811639   1.056662      1521.0  2.329250     34.11   
1  1.4688       8.0  10.000000   1.916667        63.0  2.625000     33.32   
2  2.1603      28.0   4.808173   0.995460      2008.0  2.279228     38.74   
3  4.7404      43.0   5.855140   1.009346       967.0  2.259346     37.58   
4  3.2617      10.0   3.929142   1.051896      2032.0  2.027944     37.45   

   Longitude   target  
0    -118.63  4.48100  
1    -115.98  0.53800  
2    -120.78  1.11300  
3    -122.37  5.00001  
4    -121.92  2.52200  


In [None]:
# Histogram of the training target.
# Matplotlib is used directly, one chart per cell, with no hardcoded colors.
import matplotlib.pyplot as plt

plt.figure()
plt.hist(y_train, bins=30)
plt.title("Target distribution (train)")
plt.xlabel("y")
plt.ylabel("count")
# Render explicitly so the cell displays the figure instead of echoing the
# Text(...) repr returned by the last pyplot call.
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scatter one feature against the target. The feature is the column with the
# largest standard deviation; std is scale-dependent, so this favors features
# measured in large units.
feat_idx = int(X_train.std(axis=0).argmax())
feat_name = f"x{feat_idx}" if not feature_names else feature_names[feat_idx]

plt.figure()
plt.scatter(X_train[:, feat_idx], y_train, alpha=0.5)
plt.xlabel(feat_name)
plt.ylabel("y")
plt.title(f"Feature {feat_name} vs Target (train)")
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
