# Data Preprocessing Tutorial
This notebook demonstrates various data preprocessing techniques essential for machine learning.

## Contents
1. Data Loading and Exploration
2. Feature Scaling
3. Feature Engineering
4. Handling Missing Values
5. Outlier Detection

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

## 1. Data Loading and Exploration

In [None]:
# Load dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the pinned scikit-learn version still provides it.
data = load_boston()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['PRICE'] = data.target  # keep the target next to the features in one frame

# Basic exploration
print('Dataset Shape:', df.shape)
# The '\n' escapes must stay inside the literals: a raw line break inside a
# single-quoted string is a SyntaxError.
print('\nFirst few rows:\n', df.head())
print('\nBasic statistics:\n', df.describe())

# Check for missing values
print('\nMissing values:\n', df.isnull().sum())

## 2. Feature Scaling

In [None]:
def standardize(X):
    """Rescale ``X`` to zero mean and unit standard deviation (z-score).

    Parameters
    ----------
    X : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric data. Whatever ``X.mean()`` and ``X.std()`` return for the
        container is used, so a DataFrame is standardized column by column.

    Returns
    -------
    Same container type as ``X``
        ``(X - X.mean()) / X.std()``. Where the standard deviation is exactly
        zero (constant data) the result is all zeros instead of the NaN that
        0/0 would produce.
    """
    scale = X.std()
    # A zero spread means every centered value is already 0; dividing by 1
    # keeps those zeros instead of turning the whole column into NaN.
    if isinstance(scale, pd.Series):
        scale = scale.replace(0, 1)
    elif scale == 0:
        scale = 1
    return (X - X.mean()) / scale

def min_max_scale(X):
    """Linearly rescale ``X`` so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    X : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric data. Whatever ``X.min()`` and ``X.max()`` return for the
        container is used, so a DataFrame is scaled column by column.

    Returns
    -------
    Same container type as ``X``
        ``(X - X.min()) / (X.max() - X.min())``. Where the range is exactly
        zero (constant data) the result is all zeros instead of the NaN that
        0/0 would produce.
    """
    lowest = X.min()
    span = X.max() - lowest
    # A zero range means every shifted value is already 0; dividing by 1
    # keeps those zeros instead of turning the whole column into NaN.
    if isinstance(span, pd.Series):
        span = span.replace(0, 1)
    elif span == 0:
        span = 1
    return (X - lowest) / span

# Build one scaled copy of the frame per method; each column (features and
# PRICE alike) is transformed independently of the others.
df_standardized = df.apply(standardize)
df_min_max = df.apply(min_max_scale)

# Visualize the effect of scaling: one boxplot panel per version of the data
plot_columns = ['RM', 'LSTAT', 'PRICE']
panels = [
    (df, 'Original Data'),
    (df_standardized, 'Standardized Data'),
    (df_min_max, 'Min-Max Scaled Data'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (frame, title) in zip(axes, panels):
    sns.boxplot(data=frame[plot_columns], ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

In [None]:
# Create polynomial features
def create_polynomial_features(df, columns, degree=2):
    """Return a copy of ``df`` extended with second-order terms of ``columns``.

    For every column ``c`` in ``columns`` a ``{c}_squared`` column is added,
    and for every unordered pair of distinct columns ``(a, b)`` (in the order
    they appear in ``columns``) an ``{a}_{b}_interaction`` column holding their
    element-wise product is added.

    A column is not paired with itself as an "interaction": that product is
    identical to the squared term and would only add a duplicate, perfectly
    collinear feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    columns : sequence of str
        Names of the columns to combine. Each must exist in ``df``.
    degree : int, default 2
        Accepted for interface compatibility but not used: only second-order
        terms are generated.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the generated columns appended.
    """
    poly_df = df.copy()

    for position, col1 in enumerate(columns):
        # Self-pair: the square of the column.
        poly_df[f'{col1}_squared'] = df[col1] ** 2

        # Pair with every later column so each unordered pair appears once.
        for col2 in columns[position + 1:]:
            poly_df[f'{col1}_{col2}_interaction'] = df[col1] * df[col2]

    return poly_df

# Create interaction terms for selected features
selected_features = ['RM', 'LSTAT', 'DIS']
df_poly = create_polynomial_features(df, selected_features)

print('Original features:', df.columns.tolist())
# Columns present in df_poly but absent from df are the generated ones.
# The '\n' escape must stay inside the literal: a raw line break inside a
# single-quoted string is a SyntaxError.
print('\nNew features:', [col for col in df_poly.columns if col not in df.columns])

## 4. Handling Missing Values

In [None]:
# Create some artificial missing values
# Seeded generator so the same rows are blanked on every fresh run.
rng = np.random.default_rng(42)
df_missing = df.copy()
# replace=False guarantees exactly 50 / 30 distinct rows become NaN; sampling
# with replacement (the default) can draw the same row more than once.
df_missing.loc[rng.choice(df.index, size=50, replace=False), 'RM'] = np.nan
df_missing.loc[rng.choice(df.index, size=30, replace=False), 'LSTAT'] = np.nan

# Different strategies for handling missing values
def handle_missing_values(df, strategy='mean'):
    """Return a new frame with missing values imputed or dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that may contain NaN values; it is not modified.
    strategy : {'mean', 'median', 'drop'}, default 'mean'
        ``'mean'`` / ``'median'`` fill each numeric column's gaps with that
        column's mean / median; non-numeric columns are left untouched.
        ``'drop'`` removes every row that contains at least one missing value.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``strategy`` is not one of the supported names (instead of
        silently returning ``None``).
    """
    if strategy == 'mean':
        # numeric_only avoids a TypeError when the frame has non-numeric columns
        return df.fillna(df.mean(numeric_only=True))
    if strategy == 'median':
        return df.fillna(df.median(numeric_only=True))
    if strategy == 'drop':
        return df.dropna()
    raise ValueError(
        f"Unknown strategy {strategy!r}; expected 'mean', 'median' or 'drop'"
    )
    
# Run every supported strategy against the same frame with missing values
df_mean, df_median, df_dropped = (
    handle_missing_values(df_missing, strategy)
    for strategy in ('mean', 'median', 'drop')
)

# Dropping is the only strategy that changes the row count
print('Original shape:', df_missing.shape)
print('Shape after dropping NA:', df_dropped.shape)

## 5. Outlier Detection

In [None]:
def detect_outliers(df, columns, n_std=3):
    """Find rows lying more than ``n_std`` standard deviations from the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect.
    columns : iterable of str
        Columns of ``df`` to check, each one independently.
    n_std : number, default 3
        Half-width of the accepted band, in standard deviations of the column.

    Returns
    -------
    dict
        Maps each column name to the list of index labels whose value is
        strictly below ``mean - n_std * std`` or strictly above
        ``mean + n_std * std``.
    """
    def _flagged_labels(series):
        # Band limits are derived from the column's own mean and std.
        center = series.mean()
        spread = series.std()
        lower = center - n_std * spread
        upper = center + n_std * spread
        outside = (series < lower) | (series > upper)
        return series[outside].index.tolist()

    return {name: _flagged_labels(df[name]) for name in columns}

# Detect outliers in selected columns
outlier_columns = ['RM', 'LSTAT', 'PRICE']
outliers = detect_outliers(df, outlier_columns)

# Visualize outliers: one boxplot panel per inspected column
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, column in zip(axes, outlier_columns):
    sns.boxplot(y=df[column], ax=ax)
    ax.set_title(f'Outliers in {column}')

plt.tight_layout()
plt.show()

# Summarize how many rows were flagged per column
for column, indices in outliers.items():
    print(f'Number of outliers in {column}: {len(indices)}')