In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
import joblib

# Seed NumPy's global random number generator so NumPy-based randomness is
# repeatable between runs. The estimators and the train/test split below pass
# their own random_state=42 explicitly.
np.random.seed(42)

# Load the Telco Customer Churn CSV and turn it into an all-numeric frame
def load_and_preprocess_data(file_path='WA_Fn-UseC_-Telco-Customer-Churn.csv'):
    """Read the churn CSV and return a cleaned, label-encoded DataFrame.

    Steps, in order:
      1. ``TotalCharges`` is coerced to numeric; values that cannot be parsed
         become NaN and are then replaced by the column median.
      2. The ``customerID`` column is dropped.
      3. Every remaining object-dtype column is replaced by integer codes
         from a ``LabelEncoder`` (re-fitted per column, so the fitted
         mappings are not kept).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The preprocessed DataFrame, or ``None`` if any step raised; in that
        case the error is printed instead of propagated.
    """
    try:
        raw = pd.read_csv(file_path)

        # Non-numeric TotalCharges entries turn into NaN, then get the median
        total_charges = pd.to_numeric(raw['TotalCharges'], errors='coerce')
        total_charges = total_charges.fillna(total_charges.median())

        # customerID is an identifier, not a predictive feature
        cleaned = raw.assign(TotalCharges=total_charges).drop('customerID', axis=1)

        # Replace each text column by integer codes
        encoder = LabelEncoder()
        for column in cleaned.select_dtypes(include=['object']).columns:
            cleaned[column] = encoder.fit_transform(cleaned[column])

        return cleaned

    except Exception as e:
        print(f"Error loading data: {e}")
        return None

# Feature engineering
def engineer_features(df):
    """Return a copy of ``df`` with three derived feature columns added.

    Added columns:
      * ``TenureGroup``: integer bucket of ``tenure`` using the right-closed
        bins (-1, 12], (12, 24], (24, 36], (36, 48], (48, 60], (60, inf),
        numbered 1..6. A missing tenure, or one below the lowest bin, gets 0.
      * ``MonthlyContractInteraction``: ``MonthlyCharges * Contract``.
      * ``ServiceUsageScore``: row-wise sum of the nine service columns.

    NOTE(review): the service columns and ``Contract`` are used as-is. When
    they are label-encoded upstream (as ``load_and_preprocess_data`` does),
    the score is a sum of category codes rather than a count of active
    services -- confirm this is intended.

    Args:
        df: DataFrame containing ``tenure``, ``MonthlyCharges``, ``Contract``
            and the nine service columns, all numeric.

    Returns:
        A new DataFrame; the input frame is left untouched.

    Raises:
        KeyError: If a required column is missing.
    """
    # Work on a copy so the caller's frame is not modified as a side effect
    df = df.copy()

    # Bucket tenure; the -1 lower edge keeps tenure == 0 inside the first bin
    tenure_bins = pd.cut(df['tenure'],
                         bins=[-1, 12, 24, 36, 48, 60, np.inf],
                         labels=[1, 2, 3, 4, 5, 6],
                         include_lowest=True)

    # Category codes are 0-based (-1 for "no bin"), so shift to 1..6 (0 = none)
    df['TenureGroup'] = tenure_bins.cat.codes + 1

    # Interaction feature: MonthlyCharges * Contract
    df['MonthlyContractInteraction'] = df['MonthlyCharges'] * df['Contract']

    # Service usage score: sum over the service-related columns
    service_cols = ['PhoneService', 'MultipleLines', 'InternetService',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                    'TechSupport', 'StreamingTV', 'StreamingMovies']
    df['ServiceUsageScore'] = df[service_cols].sum(axis=1)

    return df

# Fit both classifiers on a hold-out split and score them
def train_and_evaluate_models(X, y):
    """Train Logistic Regression and XGBoost and evaluate both on a test split.

    The data is split 80/20 with ``random_state=42``. A ``StandardScaler`` is
    fitted on the training part only and applied to both parts; both models
    are trained and scored on the scaled arrays.

    Args:
        X: Feature table.
        y: Target labels aligned with ``X``.

    Returns:
        A tuple ``(results, xgb_model, X_train, X_test, y_test, scaler)``:
        ``results`` maps each model name to a dict with the keys
        ``'confusion_matrix'``, ``'classification_report'`` (as a dict) and
        ``'roc_auc'``; ``X_train`` / ``X_test`` are the *unscaled* splits;
        ``scaler`` is the fitted ``StandardScaler``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then reuse it for the test split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    lr_model = LogisticRegression(random_state=42, max_iter=1000)
    xgb_model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
    estimators = {'Logistic Regression': lr_model, 'XGBoost': xgb_model}

    # Train every model before any of them is scored
    for estimator in estimators.values():
        estimator.fit(X_train_scaled, y_train)

    def score(estimator):
        """Collect the hold-out metrics for one fitted estimator."""
        predicted = estimator.predict(X_test_scaled)
        matrix = confusion_matrix(y_test, predicted)
        report = classification_report(y_test, predicted, output_dict=True)
        # ROC-AUC is computed from the predicted probability of class 1
        positive_prob = estimator.predict_proba(X_test_scaled)[:, 1]
        return {
            'confusion_matrix': matrix,
            'classification_report': report,
            'roc_auc': roc_auc_score(y_test, positive_prob),
        }

    results = {label: score(estimator) for label, estimator in estimators.items()}

    return results, xgb_model, X_train, X_test, y_test, scaler

# Write the XGBoost diagnostic plots to PNG files
def visualize_results(results, xgb_model, X_train):
    """Save the XGBoost confusion matrix and top-10 feature importances.

    Two files are written to the current working directory:
    ``confusion_matrix.png`` and ``feature_importance.png``. Each figure is
    closed after it has been saved.

    Args:
        results: Mapping that contains ``results['XGBoost']['confusion_matrix']``.
        xgb_model: Fitted model exposing ``feature_importances_``.
        X_train: Training features; its ``columns`` label the importances, so
            they must be in the order the model was trained on.
    """
    def save_and_close(path):
        """Write the current figure to ``path`` and release it."""
        plt.savefig(path)
        plt.close()

    # Figure 1: confusion matrix heatmap with integer cell annotations
    plt.figure(figsize=(8, 6))
    xgb_matrix = results['XGBoost']['confusion_matrix']
    sns.heatmap(xgb_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title('XGBoost Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    save_and_close('confusion_matrix.png')

    # Figure 2: ten largest importances, sorted ascending so the largest
    # bar ends up at the top of the horizontal bar chart
    plt.figure(figsize=(10, 6))
    importances = pd.Series(xgb_model.feature_importances_, index=X_train.columns)
    top_ten = importances.nlargest(10).sort_values()
    top_ten.plot(kind='barh', color='teal')
    plt.title('Top 10 Feature Importance')
    plt.xlabel('Importance Score')
    save_and_close('feature_importance.png')

# Generate risk segmentation and save predictions
def generate_risk_segmentation(xgb_model, X_test, scaler):
    """Split test rows into Low/Medium/High churn-risk tertiles and save them.

    The churn probability is the second column of
    ``xgb_model.predict_proba`` evaluated on the scaled features. Rows are cut
    into three equal-frequency groups. The per-row result is written to
    ``churn_predictions.csv`` in the current working directory with the
    columns ``CustomerIndex`` (taken from ``X_test.index``),
    ``ChurnProbability`` and ``RiskLevel``.

    Args:
        xgb_model: Fitted classifier exposing ``predict_proba``.
        X_test: Unscaled test features; must expose ``.index``.
        scaler: Fitted transformer exposing ``transform``.

    Returns:
        A Series with the number of rows per risk level.

    Raises:
        ValueError: Propagated from ``pd.qcut`` if three groups still cannot
            be formed after tie-breaking (for example with a single row).
    """
    tier_labels = ['Low', 'Medium', 'High']

    X_test_scaled = scaler.transform(X_test)
    churn_probs = xgb_model.predict_proba(X_test_scaled)[:, 1]

    try:
        risk_levels = pd.qcut(churn_probs, q=3, labels=tier_labels)
    except ValueError:
        # Heavily tied probabilities make two tertile edges coincide, which
        # qcut rejects ("Bin edges must be unique"). Ranking with
        # method='first' breaks ties by row order, so the edges are distinct
        # again. This path is only taken when the direct cut fails, so
        # results that worked before are unchanged.
        ranks = pd.Series(churn_probs).rank(method='first').to_numpy()
        risk_levels = pd.qcut(ranks, q=3, labels=tier_labels)

    output_df = pd.DataFrame({
        'CustomerIndex': X_test.index,
        'ChurnProbability': churn_probs,
        'RiskLevel': risk_levels
    })
    output_df.to_csv('churn_predictions.csv', index=False)
    return pd.Series(risk_levels).value_counts()

# Main execution
def main():
    """Run the churn pipeline end to end inside Google Colab.

    Prompts for a dataset upload, preprocesses it, engineers features, trains
    and evaluates both models, prints the metrics and the risk segmentation,
    saves the XGBoost model, the scaler, the plots and the predictions to the
    working directory, and finally triggers a browser download for each file.

    Returns early (after printing a message) if nothing was uploaded or the
    dataset could not be loaded.
    """
    # Upload dataset in Colab
    print("Please upload the dataset file 'WA_Fn-UseC_-Telco-Customer-Churn.csv'")
    uploaded = files.upload()

    # A cancelled upload dialog yields no entries; stop instead of failing
    # with an IndexError on the first-key lookup
    if not uploaded:
        print("No file was uploaded; aborting.")
        return

    # The mapping is keyed by the uploaded file names; use the first one
    file_path = next(iter(uploaded))

    # Load and preprocess data
    df = load_and_preprocess_data(file_path)
    if df is None:
        return

    # Engineer features
    df = engineer_features(df)

    # Prepare features and target
    X = df.drop('Churn', axis=1)
    y = df['Churn']

    # Train and evaluate models
    results, xgb_model, X_train, X_test, y_test, scaler = train_and_evaluate_models(X, y)

    # Print evaluation metrics
    for name, result in results.items():
        print(f"\n{name} Results:")
        print(f"ROC-AUC Score: {result['roc_auc']:.3f}")
        print("\nClassification Report:")
        print(pd.DataFrame(result['classification_report']).transpose())

    # Generate and print risk segmentation (also writes churn_predictions.csv)
    risk_segmentation = generate_risk_segmentation(xgb_model, X_test, scaler)
    print("\nChurn Risk Segmentation:")
    print(risk_segmentation)

    # Persist the model and scaler, then write the plots
    joblib.dump(xgb_model, 'xgboost_churn_model.pkl')
    joblib.dump(scaler, 'scaler.pkl')
    visualize_results(results, xgb_model, X_train)

    print("\nDownloading output files...")
    for artifact in ('confusion_matrix.png',
                     'feature_importance.png',
                     'churn_predictions.csv',
                     'xgboost_churn_model.pkl',
                     'scaler.pkl'):
        files.download(artifact)

    print("\nAll outputs generated and downloaded successfully!")

# Run the pipeline only when this code is executed directly (not on import)
if __name__ == "__main__":
    main()

Please upload the dataset file 'WA_Fn-UseC_-Telco-Customer-Churn.csv'


Saving WA_Fn-UseC_-Telco-Customer-Churn.csv to WA_Fn-UseC_-Telco-Customer-Churn (1).csv

Logistic Regression Results:
ROC-AUC Score: 0.863

Classification Report:
              precision    recall  f1-score     support
0              0.858059  0.904440  0.880639  1036.00000
1              0.687697  0.584450  0.631884   373.00000
accuracy       0.819730  0.819730  0.819730     0.81973
macro avg      0.772878  0.744445  0.756262  1409.00000
weighted avg   0.812959  0.819730  0.814787  1409.00000

XGBoost Results:
ROC-AUC Score: 0.842

Classification Report:
              precision    recall  f1-score      support
0              0.841198  0.894788  0.867166  1036.000000
1              0.644951  0.530831  0.582353   373.000000
accuracy       0.798439  0.798439  0.798439     0.798439
macro avg      0.743074  0.712809  0.724759  1409.000000
weighted avg   0.789246  0.798439  0.791768  1409.000000

Churn Risk Segmentation:
Low       470
High      470
Medium    469
Name: count, dtype: int64

Downloading output files...

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


All outputs generated and downloaded successfully!
