In [10]:
# ===========================================
# 📦 Step 1: Import Libraries
# ===========================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from scipy import stats

# ===========================================
# 📂 Step 2: Load Dataset
# ===========================================
# Read "Churn.csv" (path is relative to the current working directory) into
# `df`, then print a confirmation line followed by the first five rows.
df = pd.read_csv("Churn.csv")
print("‚úÖ Dataset Loaded Successfully\n", df.head(), sep="\n")

# ===========================================
# 🔍 Step 3: Basic Info & Missing Values
# ===========================================
print("\n--- Dataset Info ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

print("\n--- Missing Values ---")
# Per-column count of null cells.
print(df.isnull().sum())

# ===========================================
# 🧹 Step 4: Handle Missing Values
# ===========================================
# Parse TotalCharges as numbers; with errors='coerce', entries that cannot be
# parsed become NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Replace the NaNs in TotalCharges with the column median. The fitted
# `imputer` is kept at module level so the same median can be reused later.
imputer = SimpleImputer(strategy='median')
imputer.fit(df[['TotalCharges']])
df['TotalCharges'] = imputer.transform(df[['TotalCharges']])

# Report the per-column null counts again to confirm the imputation.
remaining_nulls = df.isnull().sum()
print("\nMissing Values After Imputation:\n", remaining_nulls)

# ===========================================
# 🚫 Step 5: Outlier Detection & Removal
# ===========================================
# Columns screened for outliers.
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Absolute z-score of every value in the screened columns.
z_scores = np.abs(stats.zscore(df[numeric_cols]))

# A row survives only when all of its screened values have |z| < 3.
rows_within_limit = (z_scores < 3).all(axis=1)
df = df.loc[rows_within_limit]
print(f"\n‚úÖ Outliers Removed ‚Äî New Shape: {df.shape}")

# ===========================================
# 🔤 Step 6: Encoding Categorical Columns
# ===========================================
# customerID is dropped before encoding (treated as not useful for modelling).
df = df.drop(columns='customerID')

# Encode the target with its own encoder. LabelEncoder assigns integer codes
# in sorted class order (per the original note: Yes=1, No=0).
# `le` stays bound to the *target* encoder for the rest of the script, so
# le.inverse_transform() maps predictions back to the original Churn labels.
# Previously the same object was re-fit on every binary feature, which
# silently overwrote the target's class mapping.
le = LabelEncoder()
df['Churn'] = le.fit_transform(df['Churn'])

# Label-encode every remaining object column that has exactly two distinct
# values. Each column gets its own fitted encoder, kept in `feature_encoders`
# so the mapping for any column can be inspected or inverted later.
binary_cols = [
    col for col in df.columns
    if df[col].dtype == 'object' and df[col].nunique() == 2
]
feature_encoders = {}
for col in binary_cols:
    feature_encoders[col] = LabelEncoder()
    df[col] = feature_encoders[col].fit_transform(df[col])

# One-hot encode whatever categorical columns are left; drop_first=True drops
# one level per feature to avoid perfectly collinear dummy columns.
df = pd.get_dummies(df, drop_first=True)
print("\n‚úÖ Encoding Completed ‚Äî New Shape:", df.shape)

# ===========================================
# 🧩 Step 7: Feature Engineering (on raw, unscaled values)
# ===========================================
# Engineered features are built BEFORE scaling. Computing them from
# standardised columns (as was done previously) makes them meaningless:
# standardised tenure is centred on 0, so `tenure + 1` can be arbitrarily close
# to zero and the ratio below blows up.

# Total number of services a customer uses: row-wise sum over the indicator
# columns whose name contains 'Yes' or 'InternetService'.
# NOTE(review): this selection also picks up every InternetService dummy; if
# one of them encodes "no internet service" it is counted as a service.
# Confirm against the actual dummy column names.
service_cols = [col for col in df.columns if 'Yes' in col or 'InternetService' in col]
if service_cols:
    df['TotalServices'] = df[service_cols].sum(axis=1)

# Average charge per month of tenure; the +1 keeps the denominator non-zero
# for tenure == 0. Any NaN result is replaced with 0.
df['AvgMonthlyCharge'] = df['TotalCharges'] / (df['tenure'] + 1)
df['AvgMonthlyCharge'] = df['AvgMonthlyCharge'].fillna(0)

print("\n‚úÖ New Features Created Successfully")

# ===========================================
# 📈 Step 8: Feature Scaling
# ===========================================
# Standardise every numeric feature, but never the target: scaling 'Churn'
# would turn the 0/1 class labels into non-integer floats.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in this script, so test-set statistics leak into the
# features. For a leak-free evaluation, fit the scaler on the training split
# only and apply it to the test split with scaler.transform().
scaler = StandardScaler()
num_cols = df.select_dtypes(include=np.number).columns.drop('Churn', errors='ignore')
df[num_cols] = scaler.fit_transform(df[num_cols])
print("\n‚úÖ Feature Scaling Done")

# ===========================================
# 🧠 Step 9: Train/Test Split
# ===========================================
# Target vector and feature matrix (every column except 'Churn').
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for testing. The seed is fixed for reproducibility
# and the split is stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("\n‚úÖ Data Split Successful")
print("Train Shape:", X_train.shape, " | Test Shape:", X_test.shape)

# ===========================================
# ✅ Final Check
# ===========================================
# Unconditional completion banner; it performs no validation of its own.
print("\nüéØ Data Cleaning + Feature Engineering Done Successfully ‚Äî No Errors Found!")


‚úÖ Dataset Loaded Successfully

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV Str