# Feature Engineering

Create better features for better models.

## What is Feature Engineering?
- Transform raw data
- Create new features
- Improve model performance
- Domain knowledge + creativity

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    LabelEncoder,
    OneHotEncoder
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Select seaborn's 'whitegrid' style once, right after the imports.
sns.set_style('whitegrid')

## 1. Handling Missing Values

In [None]:
# Toy frame with gaps in every column: two missing ages, one missing
# salary and one missing city.
df = pd.DataFrame(dict(
    Age=[25, 30, np.nan, 35, 40, np.nan],
    Salary=[50000, 60000, 55000, np.nan, 70000, 65000],
    City=['NYC', 'LA', 'NYC', None, 'LA', 'NYC'],
))

# Show the raw frame first, then the number of missing entries per column.
print("Original Data:", df, sep="\n")
print(f"\nMissing values:\n{df.isnull().sum()}")

In [None]:
# Strategy 1: Fill with mean/median
# Work on a copy so `df` keeps its missing values for comparison.
df_filled = df.copy()

# Assign the filled column back rather than calling
# `df_filled['Age'].fillna(..., inplace=True)`. That form fills a temporary
# Series: pandas 2.x warns about chained assignment, and with Copy-on-Write
# enabled the frame itself is left untouched.
df_filled['Age'] = df_filled['Age'].fillna(
    df_filled['Age'].median()
)
df_filled['Salary'] = df_filled['Salary'].fillna(
    df_filled['Salary'].mean()
)

print("After filling numeric columns:")
print(df_filled)

In [None]:
# Strategy 2: Fill with mode (categorical)
# `.mode()` returns a Series (there can be ties); `[0]` takes its first entry.
# The result is assigned back to the column: calling
# `fillna(..., inplace=True)` on `df_filled['City']` would only fill a
# temporary Series, which warns in pandas 2.x and leaves the frame unchanged
# when Copy-on-Write is enabled.
df_filled['City'] = df_filled['City'].fillna(
    df_filled['City'].mode()[0]
)

print("After filling categorical:")
print(df_filled)
# Total count of remaining missing cells across the whole frame should be 0.
print(f"\nNo missing values: "
      f"{df_filled.isnull().sum().sum() == 0}")

## 2. Encoding Categorical Variables

In [None]:
# Small product table: two text columns to encode plus a numeric price.
df_cat = pd.DataFrame(dict(
    Color=['Red', 'Blue', 'Green', 'Red', 'Blue'],
    Size=['S', 'M', 'L', 'M', 'S'],
    Price=[10, 15, 20, 12, 11],
))

print("Original:", df_cat, sep="\n")

In [None]:
# Label encoding for an ordered category: each size label is replaced by
# the integer given in the lookup table (S -> 0, M -> 1, L -> 2).
size_mapping = dict(S=0, M=1, L=2)
df_cat['Size_Encoded'] = df_cat['Size'].map(size_mapping)

# Show the label next to its code.
print("Label Encoding:", df_cat[['Size', 'Size_Encoded']], sep="\n")

In [None]:
# One-hot encoding for an unordered category: the 'Color' column is replaced
# by one indicator column per distinct value, each named 'Color_<value>'.
df_encoded = pd.get_dummies(df_cat, prefix='Color', columns=['Color'])

print("One-Hot Encoding:", df_encoded, sep="\n")

## 3. Feature Scaling

In [None]:
# Three numeric columns whose magnitudes differ by orders of magnitude.
df_scale = pd.DataFrame(dict(
    Age=[25, 30, 35, 40, 45],
    Salary=[50000, 60000, 70000, 80000, 90000],
    Years_Exp=[2, 5, 8, 12, 15],
))

print("Original:", df_scale, sep="\n")
# Only the min and max rows of the summary are needed to see the ranges.
print("\nRanges:", df_scale.describe().loc[['min', 'max']], sep="\n")

In [None]:
# StandardScaler (mean=0, std=1)
scaler_std = StandardScaler()
# fit_transform returns a plain array, so re-attach the column labels and
# the original row index to keep the result aligned with `df_scale`.
df_standardized = pd.DataFrame(
    scaler_std.fit_transform(df_scale),
    columns=df_scale.columns,
    index=df_scale.index
)

print("StandardScaler:")
print(df_standardized)
print(f"\nMean: {df_standardized.mean().values}")
# StandardScaler divides by the population standard deviation (ddof=0),
# while pandas' .std() defaults to the sample estimate (ddof=1). With the
# default, this check would print ~1.118 for five rows instead of 1.0.
print(f"Std: {df_standardized.std(ddof=0).values}")

In [None]:
# Min-max scaling: rescale every column of `df_scale` onto the 0-1 range.
scaler_mm = MinMaxScaler()
# The scaler hands back a bare array; wrap it so the column names survive.
df_normalized = pd.DataFrame(
    data=scaler_mm.fit_transform(df_scale),
    columns=df_scale.columns,
)

print("MinMaxScaler:", df_normalized, sep="\n")
# Per-column extremes of the scaled data.
print(f"\nMin: {df_normalized.min().values}")
print(f"Max: {df_normalized.max().values}")

## 4. Creating New Features

In [None]:
# Rectangle dimensions with a price, used as the base for derived columns.
df_new = pd.DataFrame(dict(
    Length=[10, 15, 20, 25],
    Width=[5, 7, 10, 12],
    Price=[100, 150, 200, 250],
))

print("Original features:", df_new, sep="\n")

In [None]:
# Derive three columns from the raw dimensions and price.
# Area: length times width.
df_new['Area'] = df_new['Length'].mul(df_new['Width'])
# Perimeter: twice the sum of the two sides.
df_new['Perimeter'] = df_new[['Length', 'Width']].sum(axis=1) * 2
# Price per unit of area; relies on the 'Area' column created just above.
df_new['Price_per_Area'] = df_new['Price'].div(df_new['Area'])

print("With new features:", df_new, sep="\n")

## 5. Binning/Discretization

In [None]:
# One-column frame of ages to discretise.
df_age = pd.Series([18, 25, 35, 45, 55, 65, 75], name='Age').to_frame()

# Bin edges and the label for each interval between consecutive edges.
# pd.cut closes intervals on the right by default, so an age of exactly 25
# lands in the first bin.
bins = [0, 25, 40, 60, 100]
labels = ['Young', 'Adult', 'Middle-aged', 'Senior']

df_age['Age_Group'] = pd.cut(x=df_age['Age'], bins=bins, labels=labels)

print("Age binning:", df_age, sep="\n")

## 6. Date/Time Features

In [None]:
# Five consecutive daily timestamps starting on 2023-01-01.
df_date = pd.DataFrame({'Date': pd.date_range('2023-01-01', periods=5)})

# Pull calendar parts out of the timestamp through the `.dt` accessor.
# `assign` evaluates the keyword arguments in order, so IsWeekend can read
# the DayOfWeek column created just before it.
df_date = df_date.assign(
    Year=lambda d: d['Date'].dt.year,
    Month=lambda d: d['Date'].dt.month,
    Day=lambda d: d['Date'].dt.day,
    DayOfWeek=lambda d: d['Date'].dt.dayofweek,
    Quarter=lambda d: d['Date'].dt.quarter,
    # Day-of-week values 5 and above are flagged as weekend (1), others 0.
    IsWeekend=lambda d: (d['DayOfWeek'] >= 5).astype(int),
)

print("Date features:", df_date, sep="\n")

## 7. Polynomial Features

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Three samples with two features each, kept both as a raw array (`X`)
# and as a labelled frame (`df_poly`).
X = np.array([
    [2, 3],
    [3, 4],
    [4, 5],
])
df_poly = pd.DataFrame(data=X, columns=['X1', 'X2'])

print("Original:", df_poly, sep="\n")

In [None]:
# Create polynomial features
# degree=2 adds the squares and the pairwise product of the inputs;
# include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2,
                          include_bias=False)
X_poly = poly.fit_transform(X)

# `poly` was fitted on a bare array, so without input names it would label
# the outputs x0, x1, ... -- off by one against this notebook's X1/X2.
# Passing the real column names yields X1, X2, X1^2, X1 X2, X2^2 instead.
df_poly_features = pd.DataFrame(
    X_poly,
    columns=poly.get_feature_names_out(df_poly.columns)
)

print("Polynomial features:")
print(df_poly_features)

## 8. Real-World Example

In [None]:
# Fetch the Titanic CSV straight from GitHub (needs network access).
url = (
    'https://raw.githubusercontent.com/'
    'datasciencedojo/datasets/master/titanic.csv'
)
df_titanic = pd.read_csv(url)

# Report (rows, columns), then let the notebook render the first rows.
print(f"Shape: {df_titanic.shape}")
df_titanic.head()

In [None]:
# Feature engineering pipeline
# Work on a copy so `df_titanic` stays as loaded.
df_fe = df_titanic.copy()

# 1. Handle missing values
# Assign the filled columns back rather than calling
# `df_fe[col].fillna(..., inplace=True)`. That form fills a temporary
# Series: pandas 2.x warns about chained assignment, and with Copy-on-Write
# enabled the gaps would remain in `df_fe` (and turn into NaN age groups
# in step 4).
df_fe['Age'] = df_fe['Age'].fillna(
    df_fe['Age'].median()
)
df_fe['Embarked'] = df_fe['Embarked'].fillna(
    df_fe['Embarked'].mode()[0]
)

# 2. Create new features
# The +1 presumably counts the passenger themself, so a FamilySize of 1
# is what marks a passenger as alone below.
df_fe['FamilySize'] = (
    df_fe['SibSp'] + df_fe['Parch'] + 1
)
df_fe['IsAlone'] = (
    df_fe['FamilySize'] == 1
).astype(int)

# 3. Extract title from name
# Captures the first run of letters that follows a space and ends with a
# period. The pattern is a raw string so `\.` reaches the regex engine
# without Python flagging it as an invalid string escape.
df_fe['Title'] = df_fe['Name'].str.extract(
    r' ([A-Za-z]+)\.',
    expand=False
)

# 4. Age groups
# Intervals are right-closed: (0, 12], (12, 18], (18, 35], (35, 60], (60, 100].
df_fe['AgeGroup'] = pd.cut(
    df_fe['Age'],
    bins=[0, 12, 18, 35, 60, 100],
    labels=['Child', 'Teen', 'Adult',
            'Middle', 'Senior']
)

# 5. Fare per person
# FamilySize is at least 1 whenever SibSp and Parch are non-negative,
# so the division cannot hit zero in that case.
df_fe['FarePerPerson'] = (
    df_fe['Fare'] / df_fe['FamilySize']
)

print("Engineered features:")
print(df_fe[[
    'Age', 'FamilySize', 'IsAlone',
    'Title', 'AgeGroup', 'FarePerPerson'
]].head())

In [None]:
# One-hot encode the remaining categorical columns. drop_first=True omits
# the first level of each variable, so k categories become k-1 columns.
# Note: this rebinds `df_fe`, so the cell expects the un-encoded frame as
# input and cannot be re-run on its own output.
df_fe = pd.get_dummies(df_fe, columns=['Sex', 'Embarked', 'AgeGroup'], drop_first=True)

# Column-count difference against the raw frame, after engineering + encoding.
print(f"Final shape: {df_fe.shape}")
print(f"New features created: "
      f"{df_fe.shape[1] - df_titanic.shape[1]}")

## Practice Exercises

### Exercise 1
Create interaction features (product of 
two features) for a dataset.

In [None]:
# Your code here


### Exercise 2
Extract hour, day, and month from 
timestamp data.

In [None]:
# Your code here


## Key Takeaways

✅ **Missing Values** - Fill strategically  
✅ **Encoding** - Label vs One-Hot  
✅ **Scaling** - StandardScaler vs MinMaxScaler  
✅ **New Features** - Domain knowledge  
✅ **Binning** - Group continuous values  
✅ **Date Features** - Extract temporal info  
✅ **Polynomial Features** - Add squared and interaction terms  

**Next:** [README](README.md) →