# In [None]:
# Customer Churn Prediction - Feature Engineering

# 2_feature_engineering.ipynb

# Import libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load dataset
df = pd.read_csv('data/telco_churn.csv')

# Encode categorical variables
# Columns whose string labels are replaced by integer codes.
categorical_features = [
    'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
    'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
    'PaperlessBilling', 'PaymentMethod',
]
# One encoder instance is re-fitted per column, so after this step it only
# remembers the classes of the last column it was fitted on.
le = LabelEncoder()

df = df.assign(**{
    name: le.fit_transform(df[name]) for name in categorical_features
})

# Handle missing TotalCharges
# Coerce unconditionally: entries that cannot be parsed as numbers become NaN,
# and a column that is already numeric passes through unchanged. This also
# covers the case where the column was read as numeric but still contains NaN.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that is chained assignment, which pandas warns about and
# which does not update `df` when Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

# Create tenure bucket feature
def tenure_bucket(tenure):
    """Map a tenure value in months to a coarse bucket label.

    Args:
        tenure: Tenure in months on the raw (unscaled) scale.

    Returns:
        '0-12 months' when tenure <= 12, '13-48 months' when tenure <= 48,
        and '49+ months' otherwise. A NaN tenure fails both comparisons and
        therefore also falls through to '49+ months'.
    """
    if tenure <= 12:
        return '0-12 months'
    elif tenure <= 48:
        return '13-48 months'
    else:
        return '49+ months'


# New feature
# Bucket the raw tenure BEFORE scaling. The thresholds (12, 48) are expressed
# in months; once StandardScaler has centred the column at 0 with unit
# variance, virtually every value is <= 12 and the feature becomes constant.
df['tenure_bucket'] = df['tenure'].apply(tenure_bucket)
# LabelEncoder numbers classes in sorted label order, which for these three
# labels matches the bucket order ('0-12' < '13-48' < '49+').
df['tenure_bucket'] = le.fit_transform(df['tenure_bucket'])

# Feature scaling
# Done last so that features derived from raw values (above) are unaffected.
scaler = StandardScaler()
scaled_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[scaled_features] = scaler.fit_transform(df[scaled_features])

# Save preprocessed data
# The row index is left out of the file; only the feature columns are written.
df.to_csv(path_or_buf='data/processed_telco_churn.csv', index=False)