# Intro to ML: Preprocessing Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn import datasets
# Seed NumPy's global random generator so any NumPy-based randomness in this
# notebook is repeatable from run to run.
np.random.seed(42)

## What is Preprocessing?
Preprocessing means **cleaning and transforming raw data** so that a model can use it effectively.

Common reasons:
- Features have **different units/scales** (e.g., dollars vs. years).
- Some columns are **categorical** (words/labels) and need numbers.
- Reducing **noise/dimensionality** can make models faster and sometimes more accurate.

### Q: Quick Checks
a. If one feature is in **meters** and another in **millimeters**, why might a distance-based model struggle?

b. Why can’t we feed the strings `"cat"`, `"dog"`, and `"fish"` directly into a regression model?

### A:
YOUR ANSWER HERE

## Scaling
We change the **range** or **spread** of numeric features so that none dominates just because of units.

| Method | What it does | Typical range | When to use |
|--|--|--|--|
| StandardScaler | Subtract mean, divide by std (z-score) | mean≈0, std≈1 | Good for distance/gradient-based models (KNN, SVM, logistic regression, neural networks) |
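| MinMaxScaler | Linearly rescales each feature to a fixed interval | usually [0,1] | Good when you need bounded features while preserving the shape of the original distribution (note: outliers squeeze the remaining values into a narrow band) |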


### Q: When is scaling necessary?
Name two model types that often benefit from feature scaling and explain why.

### A:
YOUR ANSWER HERE

In [None]:
# Demo: the same KNN classifier, trained once on raw features and once on
# standardized features, to show how much a badly scaled column can matter.
X, y = datasets.make_classification(
    n_samples=600,
    n_features=3,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)
# Blow feature 0 up by a factor of 1000 so it dominates the distances.
X[:, 0] *= 1000

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Baseline: KNN sees the raw, badly scaled features.
knn_plain = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
acc_plain = knn_plain.score(X_test, y_test)
print(f"Accuracy WITHOUT scaling: {acc_plain:.3f}")

# Same classifier behind a StandardScaler; the pipeline learns the scaling
# statistics from the training split only.
knn_scaled = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=5)),
    ]
).fit(X_train, y_train)
acc_scaled = knn_scaled.score(X_test, y_test)
print(f"Accuracy WITH    scaling: {acc_scaled:.3f}")

In [None]:
# Visual: distribution of the inflated feature (column 0) BEFORE any scaling
fig, ax = plt.subplots()
ax.hist(X[:, 0], bins=50)
ax.set_title("Feature 0 BEFORE scaling")
ax.set_xlabel("Value")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Visual: distribution of the same feature AFTER StandardScaler.
# The scaler is fitted on the full X here purely for plotting; no model is
# evaluated on these values.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

fig, ax = plt.subplots()
ax.hist(X_scaled[:, 0], bins=50)
ax.set_title("Feature 0 AFTER StandardScaler")
ax.set_xlabel("Value (z-score)")
ax.set_ylabel("Count")
plt.show()

## Encoding Categorical Variables
We must convert categories (words/labels) into numbers—**carefully**.

| Encoder | Use when | Example |
|--|--|--|
| OneHotEncoder | No natural order (nominal) | colors: red/green/blue |
| OrdinalEncoder | Clear order (ordinal) | spiciness: low < medium < high |

**Pitfall:** Don’t assign numbers to unordered labels (e.g., dog=1, cat=2, fish=3). Models might think `fish > cat`!

### Q: Nominal or Ordinal?
a. `shirt_size` with values `S, M, L`

b. `zip_code`

### A:
YOUR ANSWER HERE

In [None]:
# Toy dataset for encoding: one row per snack, written as
# (snack, spiciness, price_dollars, yummy_label) records.
raw = pd.DataFrame(
    [
        ("apple", "low", 1.0, 1),
        ("banana", "medium", 1.2, 1),
        ("chips", "high", 2.5, 0),
        ("carrot", "low", 0.9, 1),
        ("chips", "medium", 2.7, 0),
        ("apple", "high", 1.1, 1),
    ],
    columns=["snack", "spiciness", "price_dollars", "yummy_label"],
)
raw

In [None]:
# Encode each column according to its type, then fit a small classifier:
#   snack         -> one-hot (nominal, no order)
#   spiciness     -> ordinal (low < medium < high)
#   price_dollars -> passed through unchanged
y = raw["yummy_label"]
X = raw.drop(columns=["yummy_label"])

nominal_cols = ["snack"]
ordinal_cols = ["spiciness"]
# One inner list per ordinal column, ordered from lowest to highest level.
ordinal_order = [["low", "medium", "high"]]
num_cols = ["price_dollars"]

preprocess = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(handle_unknown="ignore"), nominal_cols),
        ("ord", OrdinalEncoder(categories=ordinal_order), ordinal_cols),
        ("pass", "passthrough", num_cols),
    ]
)

clf = Pipeline(
    steps=[
        ("prep", preprocess),
        ("logit", LogisticRegression(max_iter=500)),
    ]
)
clf.fit(X, y)
print("Training accuracy:", clf.score(X, y))

# Rebuild the output column names in the same order the ColumnTransformer
# emits them: one-hot columns first, then the ordinal column, then the numeric.
ohe = clf.named_steps["prep"].named_transformers_["ohe"]
encoded_names = [
    *ohe.get_feature_names_out(nominal_cols),
    *ordinal_cols,
    *num_cols,
]
print("Encoded feature names:", encoded_names)

## Principal Component Analysis (PCA)
PCA **compresses** many features into a smaller number of new features (principal components) that capture the most variation.

Why use it?
- Easier **visualization** in 2D/3D
- Can speed up models
- May reduce noise

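Note: Always **scale** features before PCA. PCA looks for the directions of largest variance, so a feature measured in large units would otherwise dominate the components.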

### Q: When might losing detail (via compression) be acceptable? When might it be risky?

### A:
YOUR ANSWER HERE

In [None]:
# Project the 4-feature Iris dataset onto its first two principal components
iris = datasets.load_iris()
X, y = iris.data, iris.target
names = iris.target_names

# Standardize first, then reduce to 2 components.
pipe_pca = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=2)),
    ]
)
X_2d = pipe_pca.fit_transform(X)
expl = pipe_pca.named_steps["pca"].explained_variance_ratio_
print("Explained variance ratio:", expl)

# One scatter series per class so each species gets its own legend entry.
fig, ax = plt.subplots(figsize=(6, 4))
for cls in np.unique(y):
    pts = X_2d[y == cls]
    ax.scatter(pts[:, 0], pts[:, 1], label=names[cls])
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Iris in 2D via PCA")
ax.legend()
plt.show()

In [None]:
# Same train/test split, same classifier: does squeezing the features down to
# 2 principal components cost any accuracy?
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Reference model: scaling + logistic regression on all features.
base = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logit", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)
acc_base = base.score(X_test, y_test)

# Identical model with a 2-component PCA step inserted after scaling.
with_pca = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("pca", PCA(n_components=2)),
        ("logit", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)
acc_pca = with_pca.score(X_test, y_test)

print(f"Accuracy without PCA: {acc_base:.3f}")
print(f"Accuracy with    PCA: {acc_pca:.3f}")

## Practice: Build a Clean Pipeline
Fill in the code below to create a training/testing split, **fit transforms on train only**, and evaluate accuracy.


### A:
Please uncomment and edit the skeleton as needed.

In [None]:
# Practice skeleton: uncomment the lines below and fill in the `[...]`
# placeholders. The trailing `pass` only keeps this cell runnable while
# everything above it is still commented out.
#
# def build_and_eval_pipeline(df: pd.DataFrame):
#     """
#     INPUT
#       df: a DataFrame that includes numeric, categorical, and a target column 'label'
#     RETURNS
#       trained pipeline, accuracy on a held-out test set
#     """
#     # 1) Split into X/y (features vs. the 'label' target)
#     # X = df.drop(columns=["label"])
#     # y = df["label"]
#
#     # 2) Identify column types
#     #    (ordinal_order needs one inner list per ordinal column, lowest level first)
#     # nominal_cols = [...]
#     # ordinal_cols = [...]; ordinal_order = [[...]]
#     # num_cols = [...]
#
#     # 3) Preprocess (one transformer per column type)
#     # preprocess = ColumnTransformer([
#     #     ("ohe", OneHotEncoder(handle_unknown="ignore"), nominal_cols),
#     #     ("ord", OrdinalEncoder(categories=ordinal_order), ordinal_cols),
#     #     ("num", StandardScaler(), num_cols)
#     # ])
#
#     # 4) Pipeline + split
#     #    The pipeline is fitted on the training split only, so the transforms
#     #    never see the test rows.
#     # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
#     # pipe = Pipeline([("prep", preprocess), ("model", LogisticRegression(max_iter=1000))])
#     # pipe.fit(X_train, y_train)
#     # acc = pipe.score(X_test, y_test)
#     # return pipe, acc
pass

## Exit Ticket
1. Why do KNN and logistic regression often benefit from scaling?
2. When is `OrdinalEncoder` preferable to `OneHotEncoder`?
3. Name one risk of using PCA.
4. What is **data leakage**, and how do we avoid it when scaling/encoding?

### A:
YOUR ANSWER HERE