# Data Preprocessing and Feature Engineering
## Telecom Customer Churn Dataset

This notebook handles:
- Data cleaning and preprocessing
- Feature engineering
- Creating derived features
- Preparing data for modeling

**Objectives:**
- Handle missing values and data types
- Create new informative features
- Encode categorical variables
- Scale numerical features
- Prepare train-test splits

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import sys
import warnings

# Suppress all warnings so the notebook output stays readable.
# NOTE(review): this blanket filter also hides deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore')

# Add src to path (presumably where the project modules imported by later
# cells, `data_prep` and `features`, live — confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
if '../src' not in sys.path:
    sys.path.append('../src')

print("✅ Libraries imported successfully!")

## 1. Load and Clean Data

In [None]:
# Bring in the project's data helpers; `clean_data` is imported here and
# called by a later cell.
from data_prep import load_data, clean_data

# Read the raw churn file through the project loader
# (presumably returns a pandas DataFrame — `.shape` and `.head()` are used below).
raw_data_path = '../data/raw/telco_churn.csv'
df = load_data(raw_data_path)

# Report the shape of the data exactly as loaded
print(f"\nOriginal shape: {df.shape}")

# Leave the first rows as the cell's displayed output
df.head()

In [None]:
# Run the project's cleaning step on the loaded frame and bind the result
# to a new name.
df_clean = clean_data(df)

# Report the shape after cleaning
print(f"\nCleaned shape: {df_clean.shape}")

# Show the dtype of every column under a heading
print("\n📊 Data Types After Cleaning:", df_clean.dtypes, sep="\n")

In [None]:
# Check whether any nulls remain in the cleaned frame
print("\n🔍 Missing Values Check:")

# Number of null entries per column
missing = df_clean.isnull().sum()

if missing.sum() != 0:
    # Show only the columns that still contain nulls, with their counts
    print(missing[missing > 0])
else:
    print("✅ No missing values!")

## 2. Feature Engineering

In [None]:
# Feature-engineering helpers from the project's `features` module; the
# charge and service helpers are called in later cells.
from features import (
    create_tenure_groups,
    create_charge_features,
    create_service_features,
)

# First engineered frame: the cleaned data passed through the tenure-group helper
# (expected to provide a 'TenureGroup' column, which is read just below).
df_featured = create_tenure_groups(df_clean)

# Number of rows per tenure group, ordered by group label
tenure_group_counts = df_featured['TenureGroup'].value_counts().sort_index()
print("\n📊 Tenure Groups:", tenure_group_counts, sep="\n")

In [None]:
# Two side-by-side bar charts: group sizes (left) and churn rate per group (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
count_ax, churn_ax = axes

# Left panel — how many customers fall into each tenure group
tenure_counts = df_featured['TenureGroup'].value_counts().sort_index()
tenure_counts.plot(kind='bar', ax=count_ax, color='skyblue')
count_ax.set_title('Distribution of Tenure Groups', fontsize=14, fontweight='bold')
count_ax.set(xlabel='Tenure Group', ylabel='Count')
count_ax.tick_params(axis='x', rotation=45)

# Right panel — percentage of rows labelled 'Yes' in the Churn column, per group
tenure_churn = df_featured.groupby('TenureGroup')['Churn'].apply(
    lambda x: (x == 'Yes').sum() / len(x) * 100
)
tenure_churn.plot(kind='bar', ax=churn_ax, color='coral')
churn_ax.set_title('Churn Rate by Tenure Group', fontsize=14, fontweight='bold')
churn_ax.set(xlabel='Tenure Group', ylabel='Churn Rate (%)')
churn_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Extend the engineered frame with the charge-related features.
# NOTE(review): the frame is rebound to its own name, so re-running this cell
# feeds already-processed data back into the helper — confirm that is safe.
df_featured = create_charge_features(df_featured)

# Original charge columns next to the two columns expected from the helper
charge_columns = ['MonthlyCharges', 'TotalCharges', 'AvgMonthlyCharges', 'ChargeRatio']
charge_preview = df_featured[charge_columns].head(10)
print("\n📊 New Charge Features:", charge_preview, sep="\n")

In [None]:
# Visualize charge features: one boxplot per engineered charge column,
# split by the Churn label.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# (column to plot, panel title, y-axis label) for each panel, left to right
charge_panels = [
    ('AvgMonthlyCharges', 'Average Monthly Charges by Churn', 'Avg Monthly Charges ($)'),
    ('ChargeRatio', 'Charge Ratio by Churn', 'Charge Ratio'),
]

for ax, (column, title, ylabel) in zip(axes, charge_panels):
    df_featured.boxplot(column=column, by='Churn', ax=ax,
                        patch_artist=True, medianprops=dict(color='red', linewidth=2))
    # Overwrite the per-axes title pandas derives from the column name
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Churn')
    ax.set_ylabel(ylabel)
    # Relabel the two box positions explicitly.
    # NOTE(review): assumes Churn has exactly two groups that sort as 'No', 'Yes'.
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['No', 'Yes'])

# DataFrame.boxplot(by=...) also sets a figure-level "Boxplot grouped by Churn"
# suptitle, which tight_layout does not make room for and which collides with
# the panel titles above — clear it.
fig.suptitle('')

plt.tight_layout()
plt.show()

In [None]:
# Extend the engineered frame with the service-bundle features.
# NOTE(review): rebinding the frame to its own name makes this cell depend on
# being run exactly once after the charge-feature cell — confirm re-runs are safe.
df_featured = create_service_features(df_featured)

# Number of customers per TotalServices value, ordered by that value
total_services_counts = df_featured['TotalServices'].value_counts().sort_index()
print("\n📊 Total Services Distribution:", total_services_counts, sep="\n")

In [None]:
# Two side-by-side bar charts: customers per service count (left) and
# churn rate per service count (right)
fig, axes = plt.subplots(1, 2, figsize=(16, 5))
count_ax, churn_ax = axes

# Left panel — how many customers have each TotalServices value
services_counts = df_featured['TotalServices'].value_counts().sort_index()
services_counts.plot(kind='bar', ax=count_ax, color='lightgreen')
count_ax.set_title('Distribution of Total Services', fontsize=14, fontweight='bold')
count_ax.set(xlabel='Number of Services', ylabel='Count')

# Right panel — percentage of rows labelled 'Yes' in the Churn column, per service count
service_churn = df_featured.groupby('TotalServices')['Churn'].apply(
    lambda x: (x == 'Yes').sum() / len(x) * 100
)
service_churn.plot(kind='bar', ax=churn_ax, color='salmon')
churn_ax.set_title('Churn Rate by Total Services', fontsize=14, fontweight='bold')
churn_ax.set(xlabel='Number of Services', ylabel='Churn Rate (%)')

plt.tight_layout()
plt.show()

## 3. Feature Selection and Preparation

In [None]:
# Split the engineered frame into model inputs, target, and feature-name lists
from features import prepare_features_for_modeling

# The helper returns five values, unpacked here in the order it yields them
prepared = prepare_features_for_modeling(df_featured)
X, y, feature_names, numerical_features, categorical_features = prepared

# How many features of each kind were identified
print(
    f"\n📊 Features Summary:",
    f"Total features: {len(feature_names)}",
    f"Numerical features: {len(numerical_features)}",
    f"Categorical features: {len(categorical_features)}",
    sep="\n",
)

# Class balance of the target
print(f"\nTarget distribution:", y.value_counts(), sep="\n")

In [None]:
# Print both feature lists as numbered entries, each under its own heading
feature_listings = (
    ("\n🔢 Numerical Features:", numerical_features),
    (f"\n📝 Categorical Features:", categorical_features),
)

for heading, listed_features in feature_listings:
    print(heading)
    # Numbering restarts at 1 for each list
    for i, feat in enumerate(listed_features, start=1):
        print(f"  {i}. {feat}")

## 4. Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. `stratify=y` asks scikit-learn to keep
# the class proportions of y in both parts; `random_state=42` pins the shuffle
# so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Size of each part, absolute and as a share of all rows
print(
    f"📊 Data Split:",
    f"Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X)*100:.1f}%)",
    f"Test set: {X_test.shape[0]} samples ({X_test.shape[0]/len(X)*100:.1f}%)",
    sep="\n",
)

# Class counts and churn rate in the training part.
# NOTE(review): `.mean()` as a churn rate assumes y is numeric 0/1 — confirm
# against prepare_features_for_modeling.
print(
    f"\n📊 Target Distribution in Training Set:",
    pd.Series(y_train).value_counts(),
    f"\nChurn rate in training: {y_train.mean()*100:.2f}%",
    sep="\n",
)

# Same report for the held-out test part
print(
    f"\n📊 Target Distribution in Test Set:",
    pd.Series(y_test).value_counts(),
    f"Churn rate in test: {y_test.mean()*100:.2f}%",
    sep="\n",
)

## 5. Create Preprocessing Pipeline

In [None]:
# Build the (still unfitted) preprocessing object from the two feature lists
from features import get_preprocessor

preprocessor = get_preprocessor(numerical_features, categorical_features)

print("\n✅ Preprocessing pipeline created!")

# NOTE(review): the step descriptions below are fixed text, not read from the
# preprocessor object — confirm they match what get_preprocessor builds.
pipeline_description = (
    "\nPipeline steps:",
    "1. Numerical features: StandardScaler",
    "2. Categorical features: OneHotEncoder",
)
for description_line in pipeline_description:
    print(description_line)

In [None]:
# Fit the preprocessor on the training rows only, then apply the already
# fitted transformation to the test rows.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Shapes of the transformed matrices
print(
    f"\n📊 Transformed Data Shape:",
    f"Training: {X_train_transformed.shape}",
    f"Test: {X_test_transformed.shape}",
    sep="\n",
)

# Column count after preprocessing
print(
    f"\n✅ Features after preprocessing: {X_train_transformed.shape[1]}",
    "   (One-hot encoding expanded categorical features)",
    sep="\n",
)

In [None]:
# Get feature names after preprocessing
def preview_feature_names(feature_names, limit=15):
    """Build the printable preview lines for a collection of feature names.

    Args:
        feature_names: Iterable of feature names.
        limit: Maximum number of names to list individually.

    Returns:
        A list of strings: one numbered line for each of the first ``limit``
        names, followed by a "... and N more features" line only when some
        names were left out (so a short list never reports a zero or
        negative remainder).
    """
    names = list(feature_names)
    shown = names[:limit]
    lines = [f"  {i}. {feat}" for i, feat in enumerate(shown, 1)]
    remaining = len(names) - len(shown)
    if remaining > 0:
        lines.append(f"  ... and {remaining} more features")
    return lines


try:
    # Numerical features keep their original names
    num_features = numerical_features

    # Expanded names from the categorical encoder.
    # NOTE(review): assumes get_preprocessor registers the categorical
    # transformer under the name 'cat' and that it exposes
    # get_feature_names_out — confirm in features.py.
    cat_encoder = preprocessor.named_transformers_['cat']
    cat_features = cat_encoder.get_feature_names_out(categorical_features)

    # Combine all feature names.
    # NOTE(review): numeric-then-categorical order is assumed to match the
    # column order of the transformed matrix — confirm.
    all_feature_names = list(num_features) + list(cat_features)

    print(f"\n📋 Total features after encoding: {len(all_feature_names)}")
    print("\n🔢 Sample of encoded features:")
    for line in preview_feature_names(all_feature_names):
        print(line)

except Exception as e:
    # Best-effort display only: report the problem and let the notebook continue
    print(f"⚠️ Could not extract feature names: {str(e)}")

## 6. Save Preprocessed Data

In [None]:
# Save the engineered dataset, the train-test splits, and the fitted preprocessor
from pathlib import Path

import joblib

# Save the engineered dataset
output_path = '../data/processed/telco_churn_engineered.csv'

# All three artifacts below go into the same directory; create it first so the
# writes do not fail on a fresh checkout where it does not exist yet.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

df_featured.to_csv(output_path, index=False)
print(f"✅ Engineered dataset saved to: {output_path}")

# Save train-test splits together with the feature lists that describe their columns
splits_data = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'numerical_features': numerical_features,
    'categorical_features': categorical_features
}

joblib.dump(splits_data, '../data/processed/train_test_splits.joblib')
print(f"✅ Train-test splits saved!")

# Save preprocessor (already fitted on the training data by an earlier cell)
joblib.dump(preprocessor, '../data/processed/preprocessor.joblib')
print(f"✅ Preprocessor saved!")

## 7. Summary

### ✅ Preprocessing Complete!

**What we accomplished:**
1. ✓ Loaded and cleaned the raw data
2. ✓ Created tenure groups for better segmentation
3. ✓ Engineered charge-related features
4. ✓ Created service bundle features
5. ✓ Identified numerical and categorical features
6. ✓ Split data into train and test sets (80/20)
7. ✓ Created preprocessing pipeline with scaling and encoding
8. ✓ Saved all preprocessed data and artifacts

**Key Statistics** (Markdown cells cannot interpolate variables; run the final code cell below for the computed sample and feature counts):
- Total samples
- Training samples
- Test samples
- Original features
- Engineered features
- Features after encoding

**Next Steps:**
📌 Proceed to: **03_Modeling_and_Evaluation.ipynb** to train and evaluate ML models

In [None]:
# Closing report: dataset size, split sizes, and feature counts, framed by
# a 60-character rule above and below
banner = "=" * 60

print(banner)
print("✅ PREPROCESSING AND FEATURE ENGINEERING COMPLETE!")
print(banner)

# Engineered dataset as a whole; churn rate is the share of rows labelled 'Yes'
print(
    f"\n📊 Final Dataset:",
    f"  - Total samples: {len(df_featured):,}",
    f"  - Features: {len(df_featured.columns)}",
    f"  - Churn rate: {(df_featured['Churn'] == 'Yes').mean()*100:.2f}%",
    sep="\n",
)

# Row counts of the two parts of the split
print(
    f"\n📊 Train-Test Split:",
    f"  - Training: {len(X_train):,} samples",
    f"  - Test: {len(X_test):,} samples",
    sep="\n",
)

# Feature counts before encoding and column count after it
print(
    f"\n📊 Feature Engineering:",
    f"  - Numerical features: {len(numerical_features)}",
    f"  - Categorical features: {len(categorical_features)}",
    f"  - Total after encoding: {X_train_transformed.shape[1]}",
    sep="\n",
)

print("\n✅ Ready for model training!")
print(banner)