# Day 04 — Feature engineering

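This notebook engineers degree-2 polynomial features from the numeric columns (`age`, `fare`) of the OpenML Titanic dataset, then compares a baseline logistic-regression model with one trained on the enhanced features, using accuracy on a held-out test split.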


In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler


In [None]:
# Fetch the Titanic dataset (OpenML "titanic", version 1) as a pandas frame.
data = fetch_openml("titanic", version=1, as_frame=True)

# Keep the target plus four predictors, then discard every row that has a
# missing value in any of those columns.
df = data.frame.loc[:, ["survived", "pclass", "sex", "age", "fare"]].dropna()

# Separate the target column from the predictors.
y = df["survived"]
X = df.drop(columns="survived")

# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# class proportions similar in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Column groups used by the preprocessing steps.
numeric_features = ["age", "fare"]
categorical_features = ["pclass", "sex"]

# Baseline preprocessing: standardize the numeric columns and one-hot encode
# the categorical ones. With handle_unknown="ignore", a category that was not
# seen during fit is encoded as all zeros instead of raising an error.
baseline_preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Preprocessing followed by a logistic-regression classifier.
baseline_model = Pipeline(
    [
        ("preprocess", baseline_preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training split and predict the held-out split; the final
# expression shows the test accuracy as the cell output.
baseline_preds = baseline_model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, baseline_preds)


In [None]:
# Enhanced preprocessing: the categorical columns are one-hot encoded, while
# the numeric columns are standardized first and then expanded with degree-2
# polynomial terms (squares and the pairwise product, no constant column).
# Because scaling happens before the expansion, the added terms are built
# from standardized values and are not themselves re-scaled.
feature_preprocess = ColumnTransformer(
    [
        (
            "num",
            Pipeline(
                steps=[
                    ("scale", StandardScaler()),
                    ("poly", PolynomialFeatures(degree=2, include_bias=False)),
                ]
            ),
            numeric_features,
        ),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Enhanced preprocessing followed by a logistic-regression classifier.
feature_model = Pipeline(
    [
        ("preprocess", feature_preprocess),
        ("model", LogisticRegression(max_iter=1000)),
    ]
)

# Fit on the training split and predict the held-out split; the final
# expression shows the test accuracy as the cell output.
feature_preds = feature_model.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, feature_preds)
