In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Customer Churn Model Training\n",
    "\n",
    "This notebook focuses on building and evaluating machine learning models to predict customer churn. We'll use the engineered features from our previous notebook to train various models and identify the most effective approach.\n",
    "\n",
    "## Modeling Objectives\n",
    "\n",
    "1. **Data Preparation**: Prepare the feature-engineered data for modeling\n",
    "2. **Baseline Models**: Train simple models to establish a performance baseline\n",
    "3. **Advanced Models**: Implement more sophisticated algorithms with hyperparameter tuning\n",
    "4. **Model Evaluation**: Compare models using appropriate metrics\n",
    "5. **Feature Importance**: Identify the most important predictors of churn\n",
    "6. **Model Interpretation**: Understand how the model makes predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Import libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.metrics import (\n",
    "    accuracy_score, precision_score, recall_score, f1_score,\n",
    "    roc_auc_score, confusion_matrix, classification_report,\n",
    "    precision_recall_curve, roc_curve, average_precision_score\n",
    ")\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "import pickle\n",
    "import os\n",
    "\n",
    "# Set plot style.\n",
    "# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the old\n",
    "# names were removed afterwards, so use whichever name this installation provides\n",
    "# (falling back to matplotlib's default style if neither exists).\n",
    "_whitegrid_styles = ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')\n",
    "plt.style.use(next((s for s in _whitegrid_styles if s in plt.style.available), 'default'))\n",
    "sns.set_palette('colorblind')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Import our custom modules\n",
    "import sys\n",
    "import os\n",
    "sys.path.append('..')\n",
    "from src.model_trainer import (\n",
    "    split_data, train_logistic_regression, train_random_forest,\n",
    "    plot_feature_importance, save_model\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load and Prepare Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Load the engineered dataset\n",
    "# (the path is relative to the notebook's working directory)\n",
    "df = pd.read_csv('../data/telco_churn_engineered.csv')\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "\n",
    "# Check a sample of the data\n",
    "# (as the last expression of the cell, the first five rows are rendered as its output)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Build the modeling inputs: target vector and one-hot encoded feature matrix\n",
    "\n",
    "# Separate the target and drop the non-predictor columns\n",
    "y = df['Churn']\n",
    "X = df.drop(columns=['Churn', 'customerID'])\n",
    "\n",
    "# Columns still stored as strings have not been encoded yet\n",
    "cat_cols = X.select_dtypes(include='object').columns\n",
    "print(f\"Categorical columns: {len(cat_cols)}\")\n",
    "print(cat_cols.tolist())\n",
    "\n",
    "# Expand each remaining categorical column into dummies, dropping the first level of each\n",
    "X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)\n",
    "print(f\"Encoded feature shape: {X_encoded.shape}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Split the data into training and testing sets\n",
    "X_train, X_test, y_train, y_test = split_data(X_encoded, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Save column information for future prediction.\n",
    "# The output directory may not exist yet on a fresh checkout, so create it before writing.\n",
    "os.makedirs('../models', exist_ok=True)\n",
    "pd.DataFrame(columns=X_train.columns).to_csv('../models/X_train_columns.csv', index=False)\n",
    "print(\"Column information saved for future prediction\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Baseline Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Function to evaluate model performance\n",
    "def evaluate_model(model, X_train, X_test, y_train, y_test, model_name=\"Model\"):\n",
    "    \"\"\"Score a fitted binary classifier on the train and test splits.\n",
    "\n",
    "    Prints accuracy, ROC AUC and F1 for both splits plus the test-set\n",
    "    classification report, draws the test-set confusion matrix as a\n",
    "    heatmap, and returns every computed metric.\n",
    "\n",
    "    Args:\n",
    "        model: Fitted estimator exposing predict() and predict_proba().\n",
    "        X_train, X_test: Features of the training / testing split.\n",
    "        y_train, y_test: True labels of the training / testing split.\n",
    "        model_name: Label used in the printout, the plot title and the result.\n",
    "\n",
    "    Returns:\n",
    "        dict holding 'model_name', the train_/test_ accuracy, precision,\n",
    "        recall, f1 and auc values, and the test-set 'confusion_matrix'.\n",
    "    \"\"\"\n",
    "    splits = {'train': (X_train, y_train), 'test': (X_test, y_test)}\n",
    "\n",
    "    # Hard label predictions, then positive-class probabilities (column 1) for ROC AUC\n",
    "    predicted = {split: model.predict(features) for split, (features, _) in splits.items()}\n",
    "    positive_prob = {split: model.predict_proba(features)[:, 1] for split, (features, _) in splits.items()}\n",
    "\n",
    "    # Label-based metrics first, probability-based AUC last; this fixes the key order of the result\n",
    "    label_metrics = (\n",
    "        ('accuracy', accuracy_score),\n",
    "        ('precision', precision_score),\n",
    "        ('recall', recall_score),\n",
    "        ('f1', f1_score),\n",
    "    )\n",
    "    scores = {}\n",
    "    for metric, scorer in label_metrics:\n",
    "        for split, (_, labels) in splits.items():\n",
    "            scores[f'{split}_{metric}'] = scorer(labels, predicted[split])\n",
    "    for split, (_, labels) in splits.items():\n",
    "        scores[f'{split}_auc'] = roc_auc_score(labels, positive_prob[split])\n",
    "\n",
    "    # Console summary\n",
    "    print(f\"\\n{model_name} Performance:\")\n",
    "    for heading, metric in (('Accuracy', 'accuracy'), ('AUC', 'auc'), ('F1 Score', 'f1')):\n",
    "        print(f\"Training {heading}: {scores['train_' + metric]:.4f}\")\n",
    "        print(f\"Testing {heading}: {scores['test_' + metric]:.4f}\")\n",
    "\n",
    "    # Test-set confusion matrix and per-class report\n",
    "    cm = confusion_matrix(y_test, predicted['test'])\n",
    "    print(f\"\\nClassification Report (Test Set):\")\n",
    "    print(classification_report(y_test, predicted['test']))\n",
    "\n",
    "    # Heatmap of the confusion matrix (rows: actual class, columns: predicted class)\n",
    "    class_labels = ['Not Churn', 'Churn']\n",
    "    plt.figure(figsize=(8, 6))\n",
    "    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',\n",
    "                xticklabels=class_labels, yticklabels=class_labels)\n",
    "    plt.title(f\"{model_name} Confusion Matrix\")\n",
    "    plt.ylabel('Actual')\n",
    "    plt.xlabel('Predicted')\n",
    "    plt.show()\n",
    "\n",
    "    return {'model_name': model_name, **scores, 'confusion_matrix': cm}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Train Logistic Regression baseline\n",
    "# (train_logistic_regression is imported from src.model_trainer; its second return value is discarded here)\n",
    "lr_model, _ = train_logistic_regression(X_train, y_train, X_test, y_test)\n",
    "\n",
    "# Evaluate logistic regression\n",
    "# (prints the metrics, plots the confusion matrix and keeps the metrics dict for the model comparison)\n",
    "lr_metrics = evaluate_model(lr_model, X_train, X_test, y_train, y_test, \"Logistic Regression\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Logistic Regression with L1 regularization (feature selection)\n",
    "lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=42)\n",
    "lr_l1.fit(X_train, y_train)\n",
    "lr_l1_metrics = evaluate_model(lr_l1, X_train, X_test, y_train, y_test, \"Logistic Regression (L1)\")\n",
    "\n",
    "# Analyze logistic regression coefficients\n",
    "coef = pd.DataFrame({\n",
    "    'Feature': X_train.columns,\n",
    "    'Coefficient': lr_l1.coef_[0]\n",
    "})\n",
    "\n",
    "# Show non-zero coefficients (selected features)\n",
    "non_zero_coef = coef[coef['Coefficient'] != 0].sort_values('Coefficient', ascending=False)\n",
    "print(f\"\\nNumber of features selected by L1 regularization: {len(non_zero_coef)}\")\n",
    "\n",
    "# Visualize the 15 most positive and the 15 most negative coefficients.\n",
    "# With fewer than 30 selected features head() and tail() overlap, so drop the\n",
    "# repeated rows to keep every feature in the plotted frame exactly once.\n",
    "top_coef = non_zero_coef.head(15)\n",
    "bottom_coef = non_zero_coef.tail(15)\n",
    "coef_to_plot = pd.concat([top_coef, bottom_coef])\n",
    "coef_to_plot = coef_to_plot[~coef_to_plot.index.duplicated()]\n",
    "\n",
    "# Sort for visualization\n",
    "coef_to_plot = coef_to_plot.sort_values('Coefficient')\n",
    "\n",
    "# Plot: red bars are negative coefficients, green bars are positive ones\n",
    "plt.figure(figsize=(12, 8))\n",
    "colors = ['red' if x < 0 else 'green' for x in coef_to_plot['Coefficient']]\n",
    "plt.barh(coef_to_plot['Feature'], coef_to_plot['Coefficient'], color=colors)\n",
    "plt.title('Top Logistic Regression Coefficients (L1 Regularization)')\n",
    "plt.xlabel('Coefficient Value')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Train Random Forest baseline\n",
    "rf_model, _ = train_random_forest(X_train, y_train, X_test, y_test)\n",
    "\n",
    "# Evaluate Random Forest\n",
    "rf_metrics = evaluate_model(rf_model, X_train, X_test, y_train, y_test, \"Random Forest\")\n",
    "\n",
    "# Plot feature importance.\n",
    "# Bind the returned plot handle to its own name: assigning it to `plt` would\n",
    "# overwrite the global matplotlib.pyplot alias that every later cell relies on.\n",
    "rf_importance_plot, feature_importances = plot_feature_importance(rf_model, X_train, n_features=20)\n",
    "rf_importance_plot.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Advanced Models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Gradient Boosting Classifier with default hyperparameters and a fixed seed\n",
    "gb_model = GradientBoostingClassifier(random_state=42)\n",
    "gb_model.fit(X_train, y_train)\n",
    "gb_metrics = evaluate_model(gb_model, X_train, X_test, y_train, y_test, \"Gradient Boosting\")\n",
    "\n",
    "# Rank the features by the fitted model's importances\n",
    "gb_feature_importances = pd.DataFrame(\n",
    "    {'Feature': X_train.columns, 'Importance': gb_model.feature_importances_}\n",
    ").sort_values(by='Importance', ascending=False)\n",
    "\n",
    "# Bar chart of the 20 highest-ranked features\n",
    "top_20_gb = gb_feature_importances.head(20)\n",
    "plt.figure(figsize=(12, 8))\n",
    "sns.barplot(data=top_20_gb, x='Importance', y='Feature')\n",
    "plt.title('Top 20 Features (Gradient Boosting)')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Hyperparameter tuning for Gradient Boosting\n",
    "param_grid = {\n",
    "    'n_estimators': [100, 200],\n",
    "    'learning_rate': [0.05, 0.1],\n",
    "    'max_depth': [3, 5],\n",
    "    'min_samples_split': [2, 5],\n",
    "    'min_samples_leaf': [1, 2]\n",
    "}\n",
    "\n",
    "# Exhaustive search: 32 parameter combinations x 5 folds = 160 cross-validation fits,\n",
    "# which makes this the slowest cell of the notebook. n_jobs=-1 spreads the fits over\n",
    "# all CPU cores; every fit is seeded, so the selected parameters do not change.\n",
    "# Shrink param_grid for faster execution.\n",
    "gb_grid = GridSearchCV(GradientBoostingClassifier(random_state=42), \n",
    "                      param_grid, cv=5, scoring='roc_auc', n_jobs=-1)\n",
    "gb_grid.fit(X_train, y_train)\n",
    "\n",
    "print(f\"Best parameters: {gb_grid.best_params_}\")\n",
    "print(f\"Best CV score: {gb_grid.best_score_:.4f}\")\n",
    "\n",
    "# Evaluate tuned model (best_estimator_ is refit on the full training set)\n",
    "gb_tuned = gb_grid.best_estimator_\n",
    "gb_tuned_metrics = evaluate_model(gb_tuned, X_train, X_test, y_train, y_test, \"Tuned Gradient Boosting\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Model Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Collect metrics for all models\n",
    "models_metrics = [lr_metrics, lr_l1_metrics, rf_metrics, gb_metrics, gb_tuned_metrics]\n",
    "models_names = [entry['model_name'] for entry in models_metrics]\n",
    "\n",
    "# Test-set metrics to compare and the column titles used for display\n",
    "comparison_metrics = ['test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_auc']\n",
    "display_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC']\n",
    "\n",
    "# One row per model, one column per metric\n",
    "comparison_df = pd.DataFrame(\n",
    "    [[entry[key] for key in comparison_metrics] for entry in models_metrics],\n",
    "    index=models_names,\n",
    "    columns=display_names,\n",
    ")\n",
    "\n",
    "# Display comparison\n",
    "print(\"Model Performance Comparison (Test Set):\")\n",
    "comparison_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Visualize model comparison.\n",
    "# DataFrame.plot creates its own figure, so a preceding plt.figure() call would\n",
    "# only leave an extra empty figure in the cell output.\n",
    "ax = comparison_df.plot(kind='bar', figsize=(12, 8))\n",
    "ax.set_title('Model Performance Comparison')\n",
    "ax.set_ylabel('Score')\n",
    "ax.set_ylim(0, 1)\n",
    "plt.xticks(rotation=45)\n",
    "ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# ROC curve comparison\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# Models to include in comparison: (fitted estimator, legend label, line color)\n",
    "models = [\n",
    "    (lr_model, \"Logistic Regression\", 'blue'),\n",
    "    (rf_model, \"Random Forest\", 'green'),\n",
    "    (gb_tuned, \"Tuned Gradient Boosting\", 'red')\n",
    "]\n",
    "\n",
    "# One curve per model, drawn from its positive-class probabilities on the test set\n",
    "for clf, label, line_color in models:\n",
    "    test_prob = clf.predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_test, test_prob)\n",
    "    auc_value = roc_auc_score(y_test, test_prob)\n",
    "    ax.plot(fpr, tpr, color=line_color, label=f'{label} (AUC = {auc_value:.3f})')\n",
    "\n",
    "# Diagonal reference line (random classifier)\n",
    "ax.plot([0, 1], [0, 1], color='navy', linestyle='--')\n",
    "\n",
    "ax.set_xlabel('False Positive Rate')\n",
    "ax.set_ylabel('True Positive Rate')\n",
    "ax.set_title('ROC Curve Comparison')\n",
    "ax.legend(loc='lower right')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Precision-Recall curve comparison (reuses the `models` list built for the ROC plot)\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "\n",
    "# One curve per model, drawn from its positive-class probabilities on the test set\n",
    "for clf, label, line_color in models:\n",
    "    test_prob = clf.predict_proba(X_test)[:, 1]\n",
    "    precision, recall, _ = precision_recall_curve(y_test, test_prob)\n",
    "    ap = average_precision_score(y_test, test_prob)\n",
    "    ax.plot(recall, precision, color=line_color, label=f'{label} (AP = {ap:.3f})')\n",
    "\n",
    "ax.set_xlabel('Recall')\n",
    "ax.set_ylabel('Precision')\n",
    "ax.set_title('Precision-Recall Curve Comparison')\n",
    "ax.legend(loc='lower left')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Feature Importance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Feature importance comparison between Random Forest and Gradient Boosting\n",
    "# Extract feature importances\n",
    "rf_importance = pd.DataFrame({\n",
    "    'Feature': X_train.columns,\n",
    "    'RF_Importance': rf_model.feature_importances_\n",
    "})\n",
    "\n",
    "gb_importance = pd.DataFrame({\n",
    "    'Feature': X_train.columns,\n",
    "    'GB_Importance': gb_tuned.feature_importances_\n",
    "})\n",
    "\n",
    "# Merge importances\n",
    "feature_comparison = pd.merge(rf_importance, gb_importance, on='Feature')\n",
    "\n",
    "# Add logistic regression coefficients\n",
    "# NOTE(review): coefficient magnitudes are only comparable across features if the\n",
    "# inputs were standardized - confirm what train_logistic_regression does.\n",
    "lr_coef = pd.DataFrame({\n",
    "    'Feature': X_train.columns,\n",
    "    'LR_Coefficient': np.abs(lr_model.coef_[0])  # Take absolute value for comparison\n",
    "})\n",
    "\n",
    "feature_comparison = pd.merge(feature_comparison, lr_coef, on='Feature')\n",
    "\n",
    "# Normalize coefficients to make them comparable.\n",
    "# Tree-based importances sum to 1 across features, so scale the absolute coefficients\n",
    "# to sum to 1 as well; dividing by the maximum instead would put them on a 0-1\n",
    "# per-feature scale and let logistic regression dominate the average below.\n",
    "feature_comparison['LR_Importance'] = feature_comparison['LR_Coefficient'] / feature_comparison['LR_Coefficient'].sum()\n",
    "\n",
    "# Calculate average importance\n",
    "feature_comparison['Avg_Importance'] = (\n",
    "    feature_comparison['RF_Importance'] + \n",
    "    feature_comparison['GB_Importance'] + \n",
    "    feature_comparison['LR_Importance']\n",
    ") / 3\n",
    "\n",
    "# Sort by average importance\n",
    "feature_comparison = feature_comparison.sort_values('Avg_Importance', ascending=False)\n",
    "\n",
    "# Display top features\n",
    "print(\"Top 15 Features by Average Importance:\")\n",
    "feature_comparison[['Feature', 'RF_Importance', 'GB_Importance', 'LR_Importance', 'Avg_Importance']].head(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Visualize top feature importance comparison\n",
    "top_features = feature_comparison.head(15)['Feature'].tolist()\n",
    "\n",
    "# Build the long-format frame with a single concat instead of growing a DataFrame\n",
    "# inside a loop (which also avoids concatenating onto an empty frame).\n",
    "importance_df = pd.concat([\n",
    "    pd.DataFrame({\n",
    "        'Feature': X_train.columns,\n",
    "        'Importance': model.feature_importances_,\n",
    "        'Model': name\n",
    "    })\n",
    "    for model, name in [(rf_model, 'Random Forest'), (gb_tuned, 'Gradient Boosting')]\n",
    "])\n",
    "\n",
    "# Filter for top features\n",
    "top_importance = importance_df[importance_df['Feature'].isin(top_features)]\n",
    "\n",
    "# Create barplot; `order` lists the features by descending average importance so the\n",
    "# chart follows the ranking instead of the column order of X_train\n",
    "plt.figure(figsize=(12, 10))\n",
    "sns.barplot(x='Importance', y='Feature', hue='Model', data=top_importance, order=top_features)\n",
    "plt.title('Top 15 Feature Importance Comparison')\n",
    "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Final Model Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Based on evaluation metrics, select the best model\n",
    "# (In this case, Gradient Boosting typically performs best)\n",
    "# NOTE(review): the choice is hard-coded rather than derived from comparison_df -\n",
    "# confirm it still matches the metrics shown in the comparison table\n",
    "best_model = gb_tuned\n",
    "\n",
    "# Print best model parameters\n",
    "print(\"Best Model: Tuned Gradient Boosting\")\n",
    "print(f\"Parameters: {best_model.get_params()}\")\n",
    "\n",
    "# Save the best model\n",
    "# (the output directory is created first so that saving works on a fresh checkout;\n",
    "# save_model is imported from src.model_trainer)\n",
    "os.makedirs('../models', exist_ok=True)\n",
    "save_model(best_model, '../models/churn_model.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Analyze misclassifications\n",
    "y_pred = best_model.predict(X_test)\n",
    "y_prob = best_model.predict_proba(X_test)[:, 1]\n",
    "\n",
    "# Create dataframe with actual and predicted values.\n",
    "# Use the raw label values so the frame gets a fresh 0..n-1 index: keeping the\n",
    "# original index of y_test would misalign the rows in the column-wise concat\n",
    "# below, which matches rows by index label.\n",
    "misclass_df = pd.DataFrame({\n",
    "    'actual': np.asarray(y_test),\n",
    "    'predicted': y_pred,\n",
    "    'probability': y_prob\n",
    "})\n",
    "\n",
    "# Add original features (index reset so both frames line up row by row)\n",
    "misclass_df = pd.concat([X_test.reset_index(drop=True), misclass_df], axis=1)\n",
    "\n",
    "# False positives (predicted churn but didn't)\n",
    "false_positives = misclass_df[(misclass_df['actual'] == 0) & (misclass_df['predicted'] == 1)]\n",
    "print(f\"Number of False Positives: {len(false_positives)}\")\n",
    "\n",
    "# False negatives (didn't predict churn but did)\n",
    "false_negatives = misclass_df[(misclass_df['actual'] == 1) & (misclass_df['predicted'] == 0)]\n",
    "print(f\"Number of False Negatives: {len(false_negatives)}\")\n",
    "\n",
    "# Analyze false negatives (most costly error type)\n",
    "print(\"\\nFalse Negative Analysis:\")\n",
    "fn_numerical = false_negatives.select_dtypes(include=['int64', 'float64'])\n",
    "fn_means = fn_numerical.mean()\n",
    "\n",
    "# Compare to overall population means.\n",
    "# A zero population mean produces +/-inf; turn those into NaN so that dropna()\n",
    "# removes them together with the columns present on one side only\n",
    "# (actual / predicted / probability).\n",
    "overall_means = X_test.select_dtypes(include=['int64', 'float64']).mean()\n",
    "percent_diff = (\n",
    "    ((fn_means - overall_means) / overall_means * 100)\n",
    "    .replace([np.inf, -np.inf], np.nan)\n",
    "    .dropna()\n",
    ")\n",
    "\n",
    "# Show features with large differences (more than 20% in either direction)\n",
    "significant_diffs = percent_diff[abs(percent_diff) > 20].sort_values(ascending=False)\n",
    "print(\"\\nFeatures where False Negatives differ from population:\")\n",
    "for feature, diff in significant_diffs.items():\n",
    "    direction = \"higher\" if diff > 0 else \"lower\"\n",
    "    print(f\"{feature}: {abs(diff):.1f}% {direction}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Threshold Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Threshold optimization\n",
    "# Instead of using default 0.5 threshold, we can optimize it for specific business objectives\n",
    "# NOTE: the threshold is tuned on the test set, so the metrics reported at the optimum\n",
    "# are optimistic; use a separate validation split for an unbiased estimate.\n",
    "\n",
    "# Get probabilities\n",
    "y_prob = best_model.predict_proba(X_test)[:, 1]\n",
    "\n",
    "# Calculate metrics across different thresholds.\n",
    "# Rounding removes float noise from arange (e.g. 0.30000000000000004) so the\n",
    "# threshold that is reported and saved is a clean two-decimal value.\n",
    "thresholds = np.round(np.arange(0.1, 0.9, 0.05), 2)\n",
    "threshold_metrics = []\n",
    "\n",
    "for threshold in thresholds:\n",
    "    y_pred_threshold = (y_prob >= threshold).astype(int)\n",
    "    \n",
    "    # Calculate metrics; zero_division=0 scores a threshold that predicts no\n",
    "    # positives as 0 instead of emitting an undefined-metric warning\n",
    "    accuracy = accuracy_score(y_test, y_pred_threshold)\n",
    "    precision = precision_score(y_test, y_pred_threshold, zero_division=0)\n",
    "    recall = recall_score(y_test, y_pred_threshold, zero_division=0)\n",
    "    f1 = f1_score(y_test, y_pred_threshold, zero_division=0)\n",
    "    \n",
    "    # Confusion matrix; explicit labels keep it 2x2 even if one class is absent,\n",
    "    # so the four-way unpacking below cannot fail\n",
    "    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_threshold, labels=[0, 1]).ravel()\n",
    "    \n",
    "    # Assuming cost of false negative is 5x cost of false positive\n",
    "    # This is a business assumption - adjust based on your specific situation\n",
    "    cost = (fn * 5) + fp\n",
    "    \n",
    "    threshold_metrics.append({\n",
    "        'threshold': threshold,\n",
    "        'accuracy': accuracy,\n",
    "        'precision': precision,\n",
    "        'recall': recall,\n",
    "        'f1': f1,\n",
    "        'tp': tp,\n",
    "        'fp': fp,\n",
    "        'tn': tn,\n",
    "        'fn': fn,\n",
    "        'cost': cost\n",
    "    })\n",
    "\n",
    "# Convert to DataFrame\n",
    "threshold_df = pd.DataFrame(threshold_metrics)\n",
    "\n",
    "# Plot metrics vs threshold\n",
    "plt.figure(figsize=(12, 8))\n",
    "plt.plot(threshold_df['threshold'], threshold_df['accuracy'], label='Accuracy')\n",
    "plt.plot(threshold_df['threshold'], threshold_df['precision'], label='Precision')\n",
    "plt.plot(threshold_df['threshold'], threshold_df['recall'], label='Recall')\n",
    "plt.plot(threshold_df['threshold'], threshold_df['f1'], label='F1 Score')\n",
    "plt.xlabel('Threshold')\n",
    "plt.ylabel('Score')\n",
    "plt.title('Metrics vs. Classification Threshold')\n",
    "plt.grid(True)\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "# Plot cost vs threshold\n",
    "plt.figure(figsize=(10, 6))\n",
    "plt.plot(threshold_df['threshold'], threshold_df['cost'])\n",
    "plt.xlabel('Threshold')\n",
    "plt.ylabel('Cost (5×FN + FP)')\n",
    "plt.title('Error Cost vs. Threshold')\n",
    "plt.grid(True)\n",
    "plt.show()\n",
    "\n",
    "# Find optimal threshold for minimizing cost (first minimum if several thresholds tie)\n",
    "optimal_idx = threshold_df['cost'].idxmin()\n",
    "optimal_threshold = threshold_df.loc[optimal_idx, 'threshold']\n",
    "print(f\"Optimal threshold for minimizing cost: {optimal_threshold:.2f}\")\n",
    "print(\"Metrics at optimal threshold:\")\n",
    "print(f\"Accuracy: {threshold_df.loc[optimal_idx, 'accuracy']:.4f}\")\n",
    "print(f\"Precision: {threshold_df.loc[optimal_idx, 'precision']:.4f}\")\n",
    "print(f\"Recall: {threshold_df.loc[optimal_idx, 'recall']:.4f}\")\n",
    "print(f\"F1 Score: {threshold_df.loc[optimal_idx, 'f1']:.4f}\")\n",
    "print(f\"False Negatives: {threshold_df.loc[optimal_idx, 'fn']}\")\n",
    "print(f\"False Positives: {threshold_df.loc[optimal_idx, 'fp']}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": None,
   "metadata": {},
   "source": [
    "# Save the optimal threshold with the model information\n",
    "# (bundle: fitted model, tuned decision threshold, the 20 highest averaged\n",
    "# feature importances, and the training column order)\n",
    "model_info = {\n",
    "    'model': best_model,\n",
    "    'optimal_threshold': optimal_threshold,\n",
    "    'feature_importance': feature_comparison[['Feature', 'Avg_Importance']].head(20).to_dict(),\n",
    "    'columns': X_train.columns.tolist()\n",
    "}\n",
    "\n",
    "# NOTE: the target directory ../models must already exist (the model-selection cell creates it)\n",
    "with open('../models/model_info.pkl', 'wb') as f:\n",
    "    pickle.dump(model_info, f)\n",
    "\n",
    "print(\"Model information saved with optimal threshold\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Summary\n",
    "\n",
    "In this notebook, we've developed and evaluated several machine learning models to predict customer churn. Here's a summary of our findings:\n",
    "\n",
    "### Model Performance\n",
    "\n",
    "1. **Gradient Boosting** emerged as the best performing model with the highest AUC and F1 scores after hyperparameter tuning.\n",
    "2. **Random Forest** also performed well and provided valuable insights through feature importance.\n",
    "3. **Logistic Regression** offered good interpretability and highlighted key linear relationships.\n",
    "\n",
    "### Key Predictors of Churn\n",
    "\n",
    "The most important features across models were:\n",
    "\n",
    "1. **Contract-Related Features**: `ContractRiskFactor`, `Contract_Month-to-month`, and contract-related variables consistently ranked as top predictors.\n",
    "2. **Tenure**: Tenure-related features like `tenure` and `TenureGroup` were strong negative predictors of churn.\n",
    "3. **Service Usage**: Features like `TotalServices`, `FiberNoProtection`, and specific service flags were important.\n",
    "4. **Composite Features**: The engineered `CompositeRiskScore` captured multiple risk factors effectively.\n",
    "5. **Customer Value**: Features like `MonthlyCharges` and the interaction between high value and short contracts were significant.\n",
    "\n",
    "### Threshold Optimization\n",
    "\n",
    "We optimized the classification threshold to minimize the cost of misclassifications, considering that false negatives (failing to predict customers who will churn) are more costly than false positives. The optimal threshold was determined to be around 0.3 (instead of the default 0.5), which improves recall at the expense of some precision.\n",
    "\n",
    "### Next Steps\n",
    "\n",
    "1. **Business Analysis**: Apply the model to segment customers by churn risk and develop targeted retention strategies.\n",
    "2. **Deployment**: Implement the model in a production system for ongoing churn prediction.\n",
    "3. **Monitoring**: Establish metrics to monitor model performance over time and retrain as needed.\n",
    "4. **Feedback Loop**: Incorporate the effectiveness of retention actions back into the model to improve future predictions."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

{'cells': [{'cell_type': 'markdown',
   'metadata': {},
   'source': ['# Customer Churn Model Training\n',
    '\n',
    "This notebook focuses on building and evaluating machine learning models to predict customer churn. We'll use the engineered features from our previous notebook to train various models and identify the most effective approach.\n",
    '\n',
    '## Modeling Objectives\n',
    '\n',
    '1. **Data Preparation**: Prepare the feature-engineered data for modeling\n',
    '2. **Baseline Models**: Train simple models to establish a performance baseline\n',
    '3. **Advanced Models**: Implement more sophisticated algorithms with hyperparameter tuning\n',
    '4. **Model Evaluation**: Compare models using appropriate metrics\n',
    '5. **Feature Importance**: Identify the most important predictors of churn\n',
    '6. **Model Interpretation**: Understand how the model makes predictions']},
  {'cell_type': 'code',
   'execution_count': None,
   'metadata': {},
   'source':