## Lecture 1: Introduction to Feature Engineering

Key Learning Objectives:
1. Understand what feature engineering is and why it matters
2. Learn to identify opportunities for feature engineering
3. See the impact of feature engineering on model performance

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import stats

# Set plotting style for every figure in the notebook: matplotlib's bundled
# "seaborn-v0_8-whitegrid" style sheet plus seaborn's "husl" colour palette.
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

In [None]:
# Read the raw loan applications into the shared working frame
df = pd.read_csv('loan_applications.csv')

# Print a compact overview: row count, column names, and the first rows
print(
    "Dataset Overview:",
    f"Number of samples: {df.shape[0]}",
    f"Original features: {df.columns.tolist()}",
    "\nSample data:",
    df.head(),
    sep="\n",
)

###  Creating Simple Features
-------------------------
Let's create our first engineered feature: Debt-to-Income Ratio

In [None]:
# Debt-to-income ratio: twelve monthly payments divided by income.
# A zero income produces +/-inf (or NaN for 0/0); those are turned into NaN
# and then filled with the mean of the remaining ratios.
df['debt_to_income'] = (
    df['monthly_payment']
    .mul(12)
    .div(df['income'])
    .replace([np.inf, -np.inf], np.nan)
    .pipe(lambda ratio: ratio.fillna(ratio.mean()))
)

In [None]:
# Visualize the new feature: distribution (left) and box plot (right), both
# split by default status
fig, (ax_hist, ax_box) = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(data=df, x='debt_to_income', hue='default', multiple="dodge",
             bins=30, ax=ax_hist)
ax_hist.set_title('Debt-to-Income Ratio Distribution\nby Default Status')
ax_hist.set_xlabel('Debt-to-Income Ratio')

sns.boxplot(data=df, x='default', y='debt_to_income', ax=ax_box)
ax_box.set_title('Debt-to-Income Ratio vs Default')
ax_box.set_xlabel('Default Status')
ax_box.set_ylabel('Debt-to-Income Ratio')

fig.tight_layout()
plt.show()

In [None]:
def evaluate_features(X, y, feature_set_name=""):
    """
    Train a logistic regression on a feature set and report its test ROC-AUC.

    The data is split 80/20 (random_state=42), missing values are filled with
    the *training* column means, features are standardized with a scaler fitted
    on the training split, and the model is scored on the held-out split.

    Parameters:
    X (pandas.DataFrame): Numeric feature columns (may contain NaN)
    y (pandas.Series): Binary target aligned with X
    feature_set_name (str): Label used in the printed report

    Returns:
    float: ROC-AUC on the 20% test split
    """
    # Split first so no statistic computed below can see the test rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Handle missing values with training-set means only; imputing before the
    # split would leak test-set information into the training data
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale the features (fit on train, apply to both)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and evaluate model
    model = LogisticRegression(random_state=42)
    model.fit(X_train_scaled, y_train)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]  # P(class 1)
    auc_score = roc_auc_score(y_test, y_pred_proba)

    print(f"\nModel Performance with {feature_set_name}:")
    print(f"ROC-AUC Score: {auc_score:.4f}")

    return auc_score

### PART 3: Evaluating Feature Impact
--------------------------
Compare model performance with and without engineered features.

In [None]:
# Baseline: evaluate the raw columns only
original_features = ['loan_amount', 'income']
original_score = evaluate_features(
    X=df[original_features],
    y=df['default'],
    feature_set_name="Original Features",
)

In [None]:
# Same evaluation with the engineered debt-to-income column appended
engineered_features = [*original_features, 'debt_to_income']
engineered_score = evaluate_features(
    X=df[engineered_features],
    y=df['default'],
    feature_set_name="Original + Engineered Features",
)

In [None]:
# Visualize performance comparison
scores = [original_score, engineered_score]

plt.figure(figsize=(8, 5))
plt.bar(['Original Features', 'With Engineered Features'], scores)
plt.title('Model Performance Comparison')
plt.ylabel('ROC-AUC Score')
# ROC-AUC ranges from 0 to 1, with 0.5 meaning chance level. Start the axis at
# 0.5 to magnify differences, but fall back to 0 when a model scores below
# chance so its bar is not clipped out of view.
y_floor = 0.5 if min(scores) >= 0.5 else 0.0
plt.ylim(y_floor, 1.0)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


## Lecture 2: Understanding the Dataset

Key Learning Objectives:
1. Learn how to analyze features for engineering opportunities
2. Understand relationships between features
3. Identify patterns that suggest useful feature transformations

In [None]:
# Load the dataset


In [None]:
# Categorize features by type. The column names are the ones this notebook
# reads from loan_applications.csv; adjust if the file's schema differs.
numerical_features = ['income', 'loan_amount', 'monthly_payment',
                      'credit_score', 'employment_length', 'existing_loans']
categorical_features = ['education', 'occupation', 'city', 'gender']
temporal_features = ['application_date']
target = 'default'

In [None]:
# Analyze distributions of numerical features (first six, on a 2x3 grid)
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(numerical_features[:6], start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.histplot(df[feature], kde=True, ax=ax)
    ax.set_title(f'{feature} Distribution')
    # Annotate each panel with the sample skewness (NaNs excluded)
    skewness = stats.skew(df[feature].dropna())
    ax.text(0.7, 0.9, f'Skewness: {skewness:.2f}', transform=ax.transAxes)
fig.tight_layout()
plt.show()

In [None]:
# Box plots of the first six numerical features, split by default status
fig = plt.figure(figsize=(15, 10))
for position, feature in enumerate(numerical_features[:6], start=1):
    ax = fig.add_subplot(2, 3, position)
    sns.boxplot(x='default', y=feature, data=df, ax=ax)
    ax.set_title(f'{feature} by Default Status')
fig.tight_layout()
plt.show()

In [None]:
# For each categorical feature: print its level counts, then chart the mean
# of `default` (i.e. the default rate) per level
for feature in categorical_features:
    print(f"\n{feature} value counts:")
    print(df[feature].value_counts())

    # Calculate default rate by category
    default_rates = df.groupby(feature)['default'].mean()

    fig, ax = plt.subplots(figsize=(10, 5))
    default_rates.plot(kind='bar', ax=ax)
    ax.set_title(f'Default Rate by {feature}')
    ax.set_ylabel('Default Rate')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

In [None]:
# Create correlation matrix for numerical features.
# Every numeric column of df is included; DataFrame.corr computes pairwise
# Pearson correlations and skips NaNs pair by pair.
correlation_matrix = df.select_dtypes(include='number').corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Feature Correlations')
plt.tight_layout()
plt.show()

# No figure is opened here for plots drawn in later cells: with the inline
# backend a figure is rendered when its cell finishes, so it would only show
# up as an empty canvas under the heatmap.

In [None]:
# Income vs Credit Score by Default
# The cell builds its own figure: pyplot's "current figure" does not carry
# over between notebook cells, so a subplot grid cannot span several cells.
fig, ax = plt.subplots(figsize=(7, 5))
for status, label in ((0, 'Non-Default'), (1, 'Default')):
    subset = df.loc[df['default'] == status]
    ax.scatter(subset['income'], subset['credit_score'], alpha=0.5, label=label)
ax.set_xlabel('Income')
ax.set_ylabel('Credit Score')
ax.set_title('Income vs Credit Score')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Loan Amount vs Income by Education
# Self-contained figure (a subplot slot opened in another cell is not
# available here). Missing education values are skipped: comparing against
# NaN never matches, so they would only add an empty legend entry.
fig, ax = plt.subplots(figsize=(8, 5))
for education in df['education'].dropna().unique():
    mask = df['education'] == education
    ax.scatter(df.loc[mask, 'income'],
               df.loc[mask, 'loan_amount'],
               alpha=0.5, label=education)
ax.set_xlabel('Income')
ax.set_ylabel('Loan Amount')
ax.set_title('Loan Amount vs Income by Education')
# Legend sits outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

fig.tight_layout()
plt.show()

In [None]:
# Convert application_date to datetime (raises on unparseable values rather
# than silently producing NaT)
df['application_date'] = pd.to_datetime(df['application_date'])

# Analyze temporal patterns: calendar month (1-12) and year of each application
df['month'] = df['application_date'].dt.month
df['year'] = df['application_date'].dt.year

In [None]:
# Default rate (mean of `default`) for every (year, month) pair, as a line chart
monthly_default_rate = df.groupby(['year', 'month'])['default'].mean()

fig, ax = plt.subplots(figsize=(12, 5))
monthly_default_rate.plot(ax=ax)
ax.set_title('Default Rate Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Default Rate')
fig.tight_layout()
plt.show()

Feature Engineering Opportunities Identified:
----------------------------------------
1. Numerical Features:
   - Income and loan_amount show right skewness → Log transformation
   - Credit score has outliers → Binning or normalization
   - Employment length has missing values → Imputation needed

2. Categorical Features:
   - Education shows clear relationship with default → Ordinal encoding
   - City has high cardinality → Need dimensionality reduction
   - Occupation has meaningful groups → Potential for grouping

3. Temporal Features:
   - Monthly patterns in default rate → Create cyclical features
   - Application recency might matter → Create time-based features

4. Potential Interactions:
   - Income and education → Create income-education interaction
   - Loan amount and income → Create loan-to-income ratio
   - Credit score and income → Create risk segments

## Lecture 3: Data Cleaning and Missing Value Handling


Key Learning Objectives:
1. Learn how to handle missing values appropriately
2. Understand different imputation strategies
3. Implement data cleaning techniques
4. Validate cleaning results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset again from disk. This rebinds `df`, so any columns added
# to it in earlier cells are no longer present from here on.
df = pd.read_csv('loan_applications.csv')

In [None]:
# Analyze missing values: absolute count and percentage of rows per column
missing_values = df.isnull().sum()
missing_percentages = df.isnull().mean() * 100

In [None]:
# Report only the columns that actually contain missing values
print("Missing Values Analysis:")
print("-----------------------")
columns_with_gaps = missing_percentages.loc[missing_percentages.gt(0)]
for column, percentage in columns_with_gaps.items():
    print(f"{column}: {percentage:.2f}%")

In [None]:
# Heatmap of the null mask: one row per record, one column per feature,
# highlighting where values are missing
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.isnull(), yticklabels=False, cbar=True, cmap='viridis', ax=ax)
ax.set_title('Missing Value Patterns')
fig.tight_layout()
plt.show()

In [None]:
# Create a copy of the dataframe for cleaning, so `df` keeps the raw values
# for the before/after comparisons further down
df_cleaned = df.copy()

In [None]:
# 1. Handle employment_length missing values
# Strategy: Use median for different education levels
# This cell prints the per-level medians that the strategy relies on.
# TODO(exercise): apply them to the cleaned copy of the data.
print("\nMedian employment length by education level:")
print(df.groupby('education')['employment_length'].median())

In [None]:
# 2. Handle income missing values
# Strategy: Use a more sophisticated imputation based on education and occupation

# REMOVE missing values - rows
# NOTE(review): the line above reads like an instruction to drop rows, which
# conflicts with the imputation strategy stated before it — confirm intent.
# The code below only prints the first few group medians; it changes nothing.
print("\nMedian income by education and occupation:")
print(df.groupby(['education', 'occupation'])['income'].median().head())



In [None]:
# 3. Handle credit_score missing values
# Strategy: Use a simple imputer with median strategy


In [None]:
# 4. Handle existing_loans missing values
# Strategy: Fill with 0 (assume no existing loans if not specified)


In [None]:
# Count the nulls still present in the cleaned frame and list the columns
# where that count is non-zero
remaining_missing = df_cleaned.isna().sum()
print("\nRemaining missing values after imputation:")
print(remaining_missing.loc[remaining_missing.gt(0)])

In [None]:
# Validate imputation results: overlay the density of each key feature
# before (raw df, NaNs dropped) and after (df_cleaned) on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

features_to_validate = ['employment_length', 'income', 'credit_score', 'existing_loans']
for ax, feature in zip(axes.flat, features_to_validate):
    original_values = df[feature].dropna()
    imputed_values = df_cleaned[feature]

    sns.kdeplot(data=original_values, ax=ax, label='Original', alpha=0.5)
    sns.kdeplot(data=imputed_values, ax=ax, label='After Imputation', alpha=0.5)

    ax.set_title(f'{feature} Distribution Comparison')
    ax.legend()

fig.tight_layout()
plt.show()

## Lecture 4: Outlier Detection with the IQR Method

![](https://miro.medium.com/v2/resize:fit:1200/1*0MPDTLn8KoLApoFvI0P2vQ.png)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_and_remove_outliers_iqr(df, column, whis=1.5, show_plots=True):
    """
    Analyze, visualize, and cap outliers in a dataframe column using the IQR method.

    A value counts as an outlier when it lies below Q1 - whis * IQR or above
    Q3 + whis * IQR, with IQR = Q3 - Q1. Outliers are capped at the nearest
    bound rather than dropped, so the number of rows does not change.

    Parameters:
    df (pandas.DataFrame): Input dataframe (left unmodified)
    column (str): Name of the numeric column to process
    whis (float): Multiplier applied to the IQR to build the bounds (default 1.5)
    show_plots (bool): Whether to draw the before/after figure (default True)

    Returns:
    pandas.DataFrame: Copy of the dataframe with the column's outliers capped
    """
    # Create a copy of the dataframe
    df_clean = df.copy()
    values = df[column]

    # Calculate Q1, Q3, and IQR
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    # Calculate bounds
    lower_bound = Q1 - whis * IQR
    upper_bound = Q3 + whis * IQR

    # Print statistics before capping
    print(f"\n=== Analysis for {column} ===")
    print("\nBefore outlier removal:")
    print(f"Count: {values.count()}")
    print(f"Mean: {values.mean():.2f}")
    print(f"Median: {values.median():.2f}")
    print(f"Std: {values.std():.2f}")
    print(f"Min: {values.min():.2f}")
    print(f"Max: {values.max():.2f}")

    # Identify outliers (NaNs compare False on both sides, so they are not counted)
    outliers = df[(values < lower_bound) | (values > upper_bound)]
    # Guard the percentage against an empty dataframe
    outlier_pct = (len(outliers) / len(df)) * 100 if len(df) else 0.0
    print(f"\nNumber of outliers detected: {len(outliers)}")
    print(f"Outliers percentage: {outlier_pct:.2f}%")

    # Cap the outliers at the bounds. clip() upcasts an integer column when
    # the bounds are fractional instead of writing floats into int storage.
    df_clean[column] = df_clean[column].clip(lower=lower_bound, upper=upper_bound)

    if show_plots:
        plt.figure(figsize=(15, 5))

        # Before capping: boxplot
        plt.subplot(131)
        sns.boxplot(y=values)
        plt.title('Before Removal\nBoxplot')

        # Before capping: distribution with both bounds marked
        plt.subplot(132)
        sns.histplot(values, kde=True)
        plt.axvline(lower_bound, color='r', linestyle='--', label='Lower bound')
        plt.axvline(upper_bound, color='r', linestyle='--', label='Upper bound')
        plt.title('Before Removal\nDistribution')
        plt.legend()

        # After capping: distribution
        plt.subplot(133)
        sns.histplot(df_clean[column], kde=True)
        plt.title('After Removal\nDistribution')

        plt.tight_layout()
        plt.show()

    # Print statistics after capping
    capped = df_clean[column]
    print("\nAfter outlier removal:")
    print(f"Count: {capped.count()}")
    print(f"Mean: {capped.mean():.2f}")
    print(f"Median: {capped.median():.2f}")
    print(f"Std: {capped.std():.2f}")
    print(f"Min: {capped.min():.2f}")
    print(f"Max: {capped.max():.2f}")

    return df_clean

In [None]:
# Example usage: process the listed columns one after another. Each call
# receives the frame returned by the previous call, so the changes accumulate
# in df_cleaned while df itself is only copied.
# Note: this rebinds numerical_features and df_cleaned.
numerical_features = ['income', 'loan_amount', 'monthly_payment']
df_cleaned = df.copy()

for feature in numerical_features:
    df_cleaned = analyze_and_remove_outliers_iqr(df_cleaned, feature)



## Lecture 5: Feature Transformations


Key Learning Objectives:
1. Learn when and why to transform features
2. Understand different scaling techniques
3. Apply basic mathematical transformations
4. Validate transformation results

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler
)

In [None]:
# Load the dataset again from disk; `df` is rebound to the raw file contents
df = pd.read_csv('loan_applications.csv')

### PART 1: Scaling Transformations
--------------------------
Apply different scaling techniques to numerical features.

In [None]:
# Select numerical features for scaling
numerical_features = ['income', 'loan_amount', 'monthly_payment',
                     'credit_score', 'employment_length']
# Work on a copy: transformed columns are added to df_transformed, df stays raw
df_transformed = df.copy()

### 1. StandardScaler (z-score normalization)

![](https://i0.wp.com/cdn-images-1.medium.com/max/370/1*Nlgc_wq2b-VfdawWX9MLWA.png?ssl=1)

In [None]:
# Standardize each feature to zero mean and unit variance: z = (x - mean) / std.
# Results are stored next to the originals with a "_standard" suffix.
scaler = StandardScaler()
scaled_standard = scaler.fit_transform(df[numerical_features])
for i, feature in enumerate(numerical_features):
    df_transformed[f'{feature}_standard'] = scaled_standard[:, i]

### 2. MinMaxScaler (to 0-1 range)

![](https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRfalbkrBP7E2nthrr667MhjxO1SOsVDLOTXw&s)


In [None]:
# Rescale each feature to the [0, 1] range: (x - min) / (max - min).
# Results are stored next to the originals with a "_minmax" suffix.
minmax = MinMaxScaler()
scaled_minmax = minmax.fit_transform(df[numerical_features])
for i, feature in enumerate(numerical_features):
    df_transformed[f'{feature}_minmax'] = scaled_minmax[:, i]

### 3. RobustScaler (using quartiles)

![](https://media.geeksforgeeks.org/wp-content/uploads/20230428205714/for4.png)

In [None]:
# Center each feature on its median and scale by its interquartile range,
# which makes the result less sensitive to outliers.
# Results are stored next to the originals with a "_robust" suffix.
robust = RobustScaler()
scaled_robust = robust.fit_transform(df[numerical_features])
for i, feature in enumerate(numerical_features):
    df_transformed[f'{feature}_robust'] = scaled_robust[:, i]

### PART 2: Mathematical Transformations
-------------------------------
Apply basic mathematical transformations to handle skewness
and non-linear relationships.

### 1. Logarithmic transformation

![](https://theailearner.com/wp-content/uploads/2019/01/log-1.png)

In [None]:
# log(1 + x) of each monetary column, stored with a "_log" suffix
for feature in ('income', 'loan_amount', 'monthly_payment'):
    log_column = f'{feature}_log'
    df_transformed[log_column] = df[feature].pipe(np.log1p)

### 2. Square root transformation

In [None]:
# Square root of each column, stored with a "_sqrt" suffix
for feature in ('credit_score', 'employment_length'):
    sqrt_column = f'{feature}_sqrt'
    df_transformed[sqrt_column] = df[feature].pipe(np.sqrt)

### 3. Power transformation


In [None]:
# Element-wise square of each column, stored with a "_squared" suffix
for feature in ('income', 'loan_amount'):
    squared_column = f'{feature}_squared'
    df_transformed[squared_column] = df[feature].pipe(np.square)

### PART 3: Basic Ratio Features
-----------------------
Create simple ratio features from numerical variables.

### 1. Basic financial ratios


In [None]:
# Simple financial ratios between the monetary columns
df_transformed['loan_to_income'] = df['loan_amount'] / df['income']
df_transformed['payment_to_income'] = df['monthly_payment'] / df['income']
df_transformed['payment_to_loan'] = df['monthly_payment'] / df['loan_amount']

# A zero denominator yields +/-inf; store NaN instead so the box plots and
# summary statistics are not distorted by infinite values
ratio_columns = ['loan_to_income', 'payment_to_income', 'payment_to_loan']
df_transformed[ratio_columns] = df_transformed[ratio_columns].replace(
    [np.inf, -np.inf], np.nan
)

In [None]:
# One box plot per ratio feature, split by default status
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle('Ratio Features by Default Status')

ratios = ['loan_to_income', 'payment_to_income', 'payment_to_loan']
for ax, ratio in zip(axes, ratios):
    sns.boxplot(x='default', y=ratio, data=df_transformed, ax=ax)
    ax.set_title(ratio)

fig.tight_layout()
plt.show()

# Lecture 6: Advanced Feature Transformations


Key Learning Objectives:
1. Learn advanced transformation techniques
2. Understand when to use each transformation
3. Handle complex financial relationships
4. Validate transformation effectiveness

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import (
    PowerTransformer,
    QuantileTransformer,
    FunctionTransformer
)

In [None]:
# Load the dataset again from disk; `df` is rebound to the raw file contents
df = pd.read_csv('loan_applications.csv')


## PART 1: Advanced Distribution Transformations
---------------------------------------
Apply transformations to handle complex distributions in
financial data.

In [None]:
# Select features for transformation
financial_features = ['income', 'loan_amount', 'monthly_payment', 'credit_score']
# Work on a copy: transformed columns are added to df_transformed, df stays raw
df_transformed = df.copy()

In [None]:
# 1. Yeo-Johnson transformation (handles negative values)
pt_yj = PowerTransformer(method='yeo-johnson')
transformed_yj = pt_yj.fit_transform(df[financial_features])

# Output columns follow the order of financial_features; store each one
# under "<feature>_yeojohnson"
for feature, transformed_column in zip(financial_features, transformed_yj.T):
    df_transformed[f'{feature}_yeojohnson'] = transformed_column

In [None]:
# 2. Quantile transformation (uniform distribution)
# random_state pins the transformer's internal random subsampling so the
# notebook gives the same output on every run.
qt_uniform = QuantileTransformer(output_distribution='uniform', random_state=42)
transformed_uniform = qt_uniform.fit_transform(df[financial_features])

# Output columns follow the order of financial_features
for i, feature in enumerate(financial_features):
    df_transformed[f'{feature}_uniform'] = transformed_uniform[:, i]


In [None]:
# 3. Quantile transformation (normal distribution)
# random_state pins the transformer's internal random subsampling so the
# notebook gives the same output on every run.
qt_normal = QuantileTransformer(output_distribution='normal', random_state=42)
transformed_normal = qt_normal.fit_transform(df[financial_features])

# Output columns follow the order of financial_features
for i, feature in enumerate(financial_features):
    df_transformed[f'{feature}_normal'] = transformed_normal[:, i]

In [None]:
# Visualize transformations: one row per feature, one column per variant
fig, axes = plt.subplots(4, 4, figsize=(20, 20))
fig.suptitle('Distribution Transformations Comparison')

# (title prefix, column suffix); the empty suffix selects the raw column in df
variants = [
    ('Original', ''),
    ('Yeo-Johnson', '_yeojohnson'),
    ('Uniform', '_uniform'),
    ('Normal', '_normal'),
]

for row, feature in enumerate(financial_features):
    for col, (label, suffix) in enumerate(variants):
        source = df_transformed if suffix else df
        ax = axes[row, col]
        sns.histplot(source[f'{feature}{suffix}'], ax=ax)
        ax.set_title(f'{label} {feature}')

fig.tight_layout()
plt.show()

## PART 2: Domain-Specific (Financial Domain) Transformations
----------------------------------
Apply transformations specific to financial metrics.

In [None]:
# 1. Risk-adjusted features
def risk_adjust(x, risk_factor):
    """Return x scaled by the share left after removing risk_factor: x * (1 - risk_factor)."""
    retained_share = 1 - risk_factor
    return x * retained_share

risk_features = ['income', 'loan_amount']
# TODO(exercise): risk_factor is left unset. risk_adjust multiplies by
# (1 - risk_factor), so presumably this should be a value (or per-row Series)
# between 0 and 1 — confirm what it is derived from.
risk_factor = None

for feature in risk_features:
    # TODO(exercise): placeholder — this currently fills the column with None.
    # Presumably intended: risk_adjust(df[feature], risk_factor) — confirm.
    df_transformed[f'{feature}_risk_adj'] = None

In [None]:
# 2. Smoothed temporal features
def exponential_smooth(x, alpha=0.3):
    """Return the exponentially weighted moving average of x as a pandas Series.

    x is wrapped in a Series first; alpha is the smoothing factor handed to
    Series.ewm (larger values weight recent observations more heavily).
    """
    series = pd.Series(x)
    weighted_window = series.ewm(alpha=alpha)
    return weighted_window.mean()

temporal_features = ['monthly_payment']
for feature in temporal_features:
    # TODO(exercise): placeholder — this currently fills the column with None.
    # Presumably intended: exponential_smooth(df[feature]). NOTE(review):
    # smoothing runs across rows, so the row order matters — confirm the frame
    # is sorted (e.g. by application date) before applying it.
    df_transformed[f'{feature}_smoothed'] = None

In [None]:
# 3. Bounded transformations
def sigmoid_transform(x):
    """Logistic sigmoid 1 / (1 + e^-x), applied element-wise; output lies between 0 and 1."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def tanh_transform(x):
    """Hyperbolic tangent of x, applied element-wise; output lies between -1 and 1."""
    squashed = np.tanh(x)
    return squashed

# Apply bounded transformations to normalized features.
# Each feature is z-scored first so the inputs sit around 0; raw monetary
# values would push sigmoid/tanh straight to their limits.
for feature in financial_features:
    # NOTE: a constant column has std 0 and therefore yields NaN here
    normalized = (df[feature] - df[feature].mean()) / df[feature].std()
    df_transformed[f'{feature}_sigmoid'] = sigmoid_transform(normalized)
    df_transformed[f'{feature}_tanh'] = tanh_transform(normalized)

# Lecture 7: Categorical Encoding

Key Learning Objectives:
1. Understand different encoding techniques
2. Learn when to use each encoding method
3. Handle high-cardinality categories
4. Implement target-based encoding

![](https://miro.medium.com/v2/resize:fit:1400/1*ggtP4a5YaRx6l09KQaYOnw.png)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    OrdinalEncoder,
    TargetEncoder
)

In [None]:
# Load the dataset again from disk; `df` is rebound to the raw file contents
df = pd.read_csv('loan_applications.csv')

In [None]:
# Identify categorical columns
categorical_features = ['education', 'occupation', 'city', 'gender']
# Work on a copy: encoded columns are added to df_encoded, df stays raw
df_encoded = df.copy()

In [None]:

# 1. Label Encoding (for ordinal categories)
education_order = ['High School', 'Bachelor', 'Master', 'PhD']
# LabelEncoder numbers classes in sorted (alphabetical) order and would ignore
# education_order, so OrdinalEncoder is used with the explicit ranking:
# 'High School' -> 0 ... 'PhD' -> 3. Values not listed in education_order are
# encoded as -1 instead of raising.
le = OrdinalEncoder(
    categories=[education_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
df_encoded['education_label'] = le.fit_transform(df[['education']]).ravel()

In [None]:
# Show the raw education level next to its encoded value for a visual check
df_encoded[['education','education_label']]

In [None]:
# Initialize the OneHotEncoder.
# sparse_output=False returns a dense array that pd.DataFrame can wrap
# directly; handle_unknown='ignore' encodes a category that was not seen
# during fit as all zeros instead of raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Define nominal features
nominal_features = ['occupation', 'city']

# Fit and transform the data
encoded_nominal = ohe.fit_transform(df[nominal_features])

# Get feature names ("<column>_<category>")
encoded_feature_names = ohe.get_feature_names_out(nominal_features)

# Create a new dataframe with encoded features. It is kept under its own name
# so that df_encoded (the working copy that collects all encodings) is not
# overwritten by a frame holding only the one-hot columns.
df_onehot = pd.DataFrame(
    encoded_nominal,
    columns=encoded_feature_names,
    index=df.index
)

# If you want to add these columns to your original dataframe:
df_new = pd.concat([df.drop(columns=nominal_features), df_onehot], axis=1)

In [None]:
# Display the frame in which the nominal columns are replaced by one-hot columns
df_new

In [None]:
# 3. Binary Encoding (for gender)
# TODO(exercise): placeholder — the column is currently filled with None.
# The gender labels used in the CSV are not visible in this notebook; confirm
# them before mapping to 0/1.
df_encoded['gender_binary'] = None


## PART 2: Handling High Cardinality
---------------------------
Deal with categorical variables that have many unique values.

In [None]:
# 1. Frequency-based encoding: replace each category by how many rows carry it
for feature in ('occupation', 'city'):
    freq_column = f'{feature}_freq'
    value_counts = df[feature].value_counts()
    print(freq_column)
    df_encoded[freq_column] = df[feature].map(value_counts)

In [None]:
# Inspect the frequency-encoded city column
df_encoded['city_freq']

In [None]:
# 2. Top-K encoding with "Other" category
def top_k_encoding(series, k=10):
    """
    Keep the k most frequent categories and replace every other one with "Other".

    Parameters:
    series (pandas.Series): Categorical values to encode
    k (int): Number of most frequent categories to keep (default 10)

    Returns:
    pandas.Series: Same index as the input; rare categories become "Other",
    missing values stay missing
    """
    # value_counts() is sorted by descending frequency and excludes NaN
    top_categories = series.value_counts().index[:k]
    # Missing values are preserved rather than being relabelled "Other"
    keep = series.isin(top_categories) | series.isna()
    return series.where(keep, 'Other')

# Apply top-k encoding with the default k to both high-cardinality columns
df_encoded['city_top_k'] = top_k_encoding(df['city'])
df_encoded['occupation_top_k'] = top_k_encoding(df['occupation'])

In [None]:
# Inspect the top-k encoded city column
df_encoded['city_top_k']

# Lecture 8: Feature Interactions

Key Learning Objectives:
1. Understand different types of feature interactions
2. Create meaningful financial interaction features
3. Combine categorical and numerical features
4. Evaluate interaction effectiveness

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from scipy.stats import chi2_contingency


# Load the dataset. Note: this reads the raw loan_applications.csv again, not
# an encoded version; `df` is rebound to the raw file contents.
df = pd.read_csv('loan_applications.csv')

In [None]:
# Select numerical features for interactions
numerical_features = ['income', 'loan_amount', 'monthly_payment',
                     'credit_score', 'employment_length']
# Work on a copy: interaction columns are added to df_interactions, df stays raw
df_interactions = df.copy()

In [None]:
# 1. Basic Financial Ratios
df_interactions['debt_service_ratio'] = df['monthly_payment'] / df['income']
df_interactions['loan_to_income'] = df['loan_amount'] / df['income']
df_interactions['payment_to_loan'] = df['monthly_payment'] / df['loan_amount']

# A zero denominator yields +/-inf; store NaN instead so later statistics and
# models are not fed infinite values
ratio_columns = ['debt_service_ratio', 'loan_to_income', 'payment_to_loan']
df_interactions[ratio_columns] = df_interactions[ratio_columns].replace(
    [np.inf, -np.inf], np.nan
)

In [None]:
# 2. Polynomial Interactions
# include_bias=False drops the constant column, so the output starts with the
# input features themselves followed by the generated higher-degree terms.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = ['loan_amount']
# NOTE(review): PolynomialFeatures rejects NaN — this assumes the selected
# columns have no missing values; confirm or impute first.
poly_transformed = poly.fit_transform(df[poly_features])
poly_names = poly.get_feature_names_out(poly_features)

In [None]:
# Inspect the names of the generated polynomial terms
poly_names

In [None]:
# Copy only the generated terms into df_interactions: output columns whose
# position is below len(poly_features) are the original inputs and are skipped
n_original = len(poly_features)
for i, name in enumerate(poly_names):
    if i < n_original:
        continue
    df_interactions[f'poly_{name}'] = poly_transformed[:, i]

# 3. Risk-Weighted Features
# TODO(exercise): placeholders — both columns are currently filled with None.
# The weighting formula is not defined anywhere in this notebook; confirm it
# before filling these in.
df_interactions['risk_weighted_income'] = None
df_interactions['risk_weighted_loan'] =  None

## PART 2: Categorical-Numerical Interactions
------------------------------------
Create interactions between categorical and numerical features.

In [None]:
# 1. Education-Income Interaction
# Income relative to the mean income of the applicant's education level
# (1.0 means exactly average for that level)
education_income_mean = df['income'].groupby(df['education']).transform('mean')
df_interactions['relative_income'] = df['income'].div(education_income_mean)

In [None]:
# 2. Occupation-Loan Interaction
# Loan amount relative to the mean loan amount within the applicant's
# occupation (1.0 means exactly average for that occupation)
occupation_loan_mean = df['loan_amount'].groupby(df['occupation']).transform('mean')
df_interactions['relative_loan'] = df['loan_amount'].div(occupation_loan_mean)