<a href="https://colab.research.google.com/github/tanmaymaind/desktop-tutorial/blob/main/tele_custmer_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Fetch the latest published copy of the Telco churn dataset through kagglehub
# and remember where it was stored; `path` is reported below for reference.
path = kagglehub.dataset_download("blastchar/telco-customer-churn")

print(f"Path to dataset files: {path}")

In [None]:
# pandas provides the DataFrame used throughout the notebook; os is used to
# locate the CSV file on disk.
import pandas as pd
import os

# File name of the CSV shipped in the "blastchar/telco-customer-churn" dataset.
csv_name = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
# Manually uploaded copy that earlier versions of this notebook read from.
legacy_file_path = '/content/WA_Fn-UseC_-Telco-Customer-Churn[1].csv'

# Prefer the file kagglehub downloaded: the download cell stores its directory
# in `path`. If that cell was not run (or the file is not there), fall back to
# the legacy upload location so existing setups keep working.
download_dir = globals().get('path')
downloaded_csv = None
if isinstance(download_dir, (str, os.PathLike)):
    downloaded_csv = os.path.join(download_dir, csv_name)

if downloaded_csv is not None and os.path.isfile(downloaded_csv):
    file_path = downloaded_csv
else:
    file_path = legacy_file_path

# Load the dataset into a pandas DataFrame
df = pd.read_csv(file_path)

# Display the first 5 rows of the DataFrame to get a quick look at the data
print("First 5 rows of the dataset:")
print(df.head())

# Get a concise summary of the DataFrame, including data types and missing values
# (df.info() prints directly, so it is not wrapped in print()).
print("\nDataset information:")
df.info()

# Get descriptive statistics for numerical columns
print("\nDescriptive statistics:")
print(df.describe())

# Check for the number of missing values in each column
print("\nMissing values per column:")
print(df.isnull().sum())

In [None]:
# Parse 'TotalCharges' as numbers. With errors='coerce', any entry that cannot
# be parsed is turned into NaN instead of raising.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Report how many entries ended up missing after the conversion
print(
    "Missing values in 'TotalCharges' after conversion:",
    df['TotalCharges'].isna().sum(),
    sep="\n",
)

# Report the resulting column dtype
print(
    "\nData type of 'TotalCharges' after conversion:",
    df['TotalCharges'].dtype,
    sep="\n",
)

In [None]:
# Display rows where 'TotalCharges' is NaN to understand the context
missing_mask = df['TotalCharges'].isnull()
print("\nRows where TotalCharges is NaN:")
print(df.loc[missing_mask, ['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges']])

# Count how many missing values we have
missing_total_charges_count = missing_mask.sum()

if missing_total_charges_count > 0:
    # Split the missing rows into two groups so each gets the right treatment:
    #  * tenure == 0: the customer has not been billed yet, so the total is 0;
    #  * any other tenure: no logical value is known, so use the median.
    # (Checking only whether *some* missing row has tenure 0 and then filling
    # every NaN with 0 would wrongly zero out rows from the second group.)
    zero_tenure_missing = missing_mask & (df['tenure'] == 0)
    other_missing = missing_mask & ~zero_tenure_missing

    # Median of the observed values only, taken before any imputation so the
    # zeros inserted below cannot shift it.
    median_total_charges = df.loc[~missing_mask, 'TotalCharges'].median()

    if zero_tenure_missing.any():
        print("\nMissing TotalCharges are for customers with 0 tenure. Imputing with 0.")
        df.loc[zero_tenure_missing, 'TotalCharges'] = 0

    if other_missing.any():
        print("\nMissing TotalCharges found for tenures other than 0, or tenure 0 check is inconclusive.")
        print("Imputing with the median TotalCharge.")
        df.loc[other_missing, 'TotalCharges'] = median_total_charges

# Verify that there are no more missing values in 'TotalCharges'
print("\nMissing values in 'TotalCharges' after handling:")
print(df['TotalCharges'].isnull().sum())

In [None]:
# Check the unique values in the 'Churn' column
print("\nUnique values in 'Churn' column before conversion:")
print(df['Churn'].unique())

# Convert 'Churn' column to numerical (0 or 1)
churn_mapping = {'Yes': 1, 'No': 0}

# Only map while the column still holds the text labels. Mapping an already
# numeric column would look up 0/1 in the dict, find nothing, and silently
# replace the whole target with NaN (e.g. when this cell is run twice).
if not pd.api.types.is_numeric_dtype(df['Churn']):
    churn_encoded = df['Churn'].map(churn_mapping)

    # Any value outside the mapping becomes NaN; fail loudly instead of
    # carrying a partly missing target into the modeling steps.
    unexpected_labels = df.loc[churn_encoded.isnull(), 'Churn'].unique()
    if len(unexpected_labels) > 0:
        raise ValueError(
            f"Unexpected values in 'Churn' column: {list(unexpected_labels)}; "
            f"expected only {list(churn_mapping)}."
        )

    df['Churn'] = churn_encoded

# Verify the conversion
print("\nUnique values in 'Churn' column after conversion:")
print(df['Churn'].unique())
print("\nData type of 'Churn' column after conversion:")
print(df['Churn'].dtype)

# Let's look at the info again to see the updated Dtypes
print("\nDataset information after conversions:")
df.info()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Configure seaborn with the "whitegrid" style so the plots drawn afterwards
# share a consistent look.
sns.set(style="whitegrid")

In [None]:
# Share of customers in each 'Churn' class, expressed in percent
churn_distribution = 100 * df['Churn'].value_counts(normalize=True)
print("Churn Distribution (%):\n", churn_distribution)

# Bar chart of the raw class counts
churn_fig, churn_ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Churn', data=df, ax=churn_ax)
churn_ax.set_title('Distribution of Customer Churn (0 = No, 1 = Yes)')
churn_ax.set_xlabel('Churn')
churn_ax.set_ylabel('Number of Customers')
# Replace the bare 0/1 tick labels with more descriptive ones
churn_ax.set_xticks([0, 1])
churn_ax.set_xticklabels(['No (0)', 'Yes (1)'])
plt.show()

In [None]:
numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# (Churn value, legend label, bar colour) for the per-class histograms
churn_groups = (
    (0, 'Churn: No', 'green'),
    (1, 'Churn: Yes', 'red'),
)

# One figure per numeric column: the overall histogram on the left and the
# density histograms split by churn class on the right.
for feature in numerical_features:
    fig, (ax_overall, ax_by_churn) = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: all customers together
    sns.histplot(df[feature], kde=True, color='skyblue', ax=ax_overall)
    ax_overall.set_title(f'Distribution of {feature}')
    ax_overall.set_xlabel(feature)
    ax_overall.set_ylabel('Frequency')

    # Right panel: one density histogram per churn class, drawn on shared axes
    for churn_value, legend_label, bar_colour in churn_groups:
        sns.histplot(
            df[df['Churn'] == churn_value][feature],
            label=legend_label,
            kde=True,
            color=bar_colour,
            stat="density",
            common_norm=False,
            ax=ax_by_churn,
        )
    ax_by_churn.set_title(f'Distribution of {feature} by Churn')
    ax_by_churn.set_xlabel(feature)
    ax_by_churn.set_ylabel('Density')
    ax_by_churn.legend()

    # Keep titles and axis labels from overlapping between the two panels
    plt.tight_layout()
    plt.show()

In [None]:
# Identify categorical columns (excluding customerID and already numeric/target columns)
categorical_features = df.select_dtypes(include='object').columns.tolist()
# We can also manually list them if preferred, or if some 'object' columns are not truly categorical
# For now, let's exclude 'customerID' from this general categorical analysis
if 'customerID' in categorical_features:
    categorical_features.remove('customerID')

print(f"\nCategorical features to analyze: {categorical_features}")

# Readable names for the encoded churn classes shown in the legend
churn_legend_names = {'0': 'No', '1': 'Yes'}

for feature in categorical_features:
    plt.figure(figsize=(10, 5)) # Adjust figsize as needed, some features have many categories
    ax = sns.countplot(x=feature, hue='Churn', data=df, palette=['green', 'red'])
    plt.title(f'Churn Counts by {feature}')
    plt.xlabel(feature)
    plt.ylabel('Number of Customers')
    plt.xticks(rotation=45, ha='right') # Rotate x-axis labels if they overlap

    # Rebuild the legend from the hue handles seaborn registered on the axes.
    # Passing only `labels=` lets matplotlib pair the labels with the first
    # artists on the axes (individual bars), so the swatch colours may not
    # correspond to the churn classes. Supplying the handles keeps each label
    # attached to its own colour.
    handles, hue_levels = ax.get_legend_handles_labels()
    if handles:
        ax.legend(
            handles=handles,
            labels=[churn_legend_names.get(level, level) for level in hue_levels],
            title='Churn',
        )

    plt.tight_layout()
    plt.show()

In [None]:
# Build the modeling frame without the customerID identifier column.
# drop() returns a new DataFrame, so the original df stays intact for reference.
df_processed = df.drop(columns='customerID')

print("Columns after dropping customerID:", df_processed.columns, sep="\n")
print(f"\nShape of DataFrame after dropping customerID: {df_processed.shape}")

In [None]:
# Every column still stored as object dtype is treated as categorical
categorical_cols_to_encode = list(df_processed.select_dtypes(include='object').columns)

print(f"\nCategorical columns to be one-hot encoded: {categorical_cols_to_encode}")

# One-hot encode those columns with pandas; drop_first=True omits the first
# level of each column from the generated indicator columns.
df_processed = pd.get_dummies(
    df_processed,
    columns=categorical_cols_to_encode,
    drop_first=True,
)

print("\nShape of DataFrame after one-hot encoding:", df_processed.shape, sep="\n")

print(
    "\nFirst 5 rows of the processed DataFrame (showing new encoded columns):",
    df_processed.head(),
    sep="\n",
)

# info() writes its report directly, so only the heading goes through print()
print("\nDataset information of the processed DataFrame:")
df_processed.info()

In [None]:
# Target vector: the 'Churn' column on its own
y = df_processed['Churn']

# Feature matrix: every remaining column
X = df_processed.drop(columns=['Churn'])

print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
#  * random_state fixes the shuffle so the split is reproducible
#  * stratify=y keeps the churn class proportions the same in both parts
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the size of each part
for _heading, _part in (
    ("\nShape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(_heading, _part.shape)

# Report the class balance before and after splitting
for _heading, _target in (
    ("\nChurn distribution in original data:\n", y),
    ("Churn distribution in y_train:\n", y_train),
    ("Churn distribution in y_test:\n", y_test),
):
    print(_heading, _target.value_counts(normalize=True))

In [None]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns that need scaling
# 'SeniorCitizen' is already 0/1, and one-hot encoded columns are 0/1.
cols_to_scale = ['tenure', 'MonthlyCharges', 'TotalCharges']

scaler = StandardScaler()

# Work on explicit copies: X_train / X_test were produced by selecting rows
# from X, and assigning columns into such a selection can raise pandas'
# SettingWithCopyWarning. Copying first makes the write target unambiguous.
# NOTE: this cell overwrites the columns in place, so re-running it without
# re-running the split would scale already-scaled values.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training data for these columns and transform them
X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])

# Use the SAME fitted scaler to transform the test data for these columns
# (fitting again on the test set would leak test statistics into preprocessing)
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

print("\nFirst 5 rows of X_train after scaling (showing only scaled columns for brevity):")
print(X_train[cols_to_scale].head())

print("\nFirst 5 rows of X_test after scaling (showing only scaled columns for brevity):")
print(X_test[cols_to_scale].head())

In [None]:
# Import necessary models and metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

# Create a dictionary to store the results of each model
# (keyed by model name; each value maps metric name -> score)
model_results = {}

# --- 1. Logistic Regression ---
print("--- Logistic Regression ---")
log_reg = LogisticRegression(solver='liblinear', random_state=42) # liblinear is good for small datasets
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
y_pred_proba_log_reg = log_reg.predict_proba(X_test)[:, 1] # Probabilities for ROC AUC

# Evaluate on the held-out test set
accuracy_lr = accuracy_score(y_test, y_pred_log_reg)
precision_lr = precision_score(y_test, y_pred_log_reg)
recall_lr = recall_score(y_test, y_pred_log_reg)
f1_lr = f1_score(y_test, y_pred_log_reg)
# ROC AUC is computed from the predicted probabilities, not the hard labels
roc_auc_lr = roc_auc_score(y_test, y_pred_proba_log_reg)
model_results['Logistic Regression'] = {'Accuracy': accuracy_lr, 'Precision': precision_lr, 'Recall': recall_lr, 'F1-Score': f1_lr, 'ROC AUC': roc_auc_lr}

print(f"Accuracy: {accuracy_lr:.4f}")
print(f"Precision: {precision_lr:.4f}")
print(f"Recall: {recall_lr:.4f}") # How many of the actual positives our model captured
print(f"F1-Score: {f1_lr:.4f}")
print(f"ROC AUC Score: {roc_auc_lr:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))


# --- 2. Decision Tree Classifier ---
print("\n--- Decision Tree Classifier ---")
dt_clf = DecisionTreeClassifier(random_state=42) # Using default parameters for now
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)
# Call predict_proba and keep the second column, matching the Logistic
# Regression step above. Assigning the bare attribute would store the bound
# method itself instead of an array of probabilities.
y_pred_proba_dt = dt_clf.predict_proba(X_test)[:, 1]

In [None]:
# Make sure these are imported
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report

# Start from an empty results mapping; running this cell again clears any
# metrics stored earlier.
model_results = dict()

In [None]:
# --- 1. Logistic Regression ---
print("--- Logistic Regression ---")

# Train the classifier, then predict hard labels and churn probabilities
log_reg = LogisticRegression(solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
y_pred_proba_log_reg = log_reg.predict_proba(X_test)[:, 1]

# Label-based metrics are computed in a fixed order and unpacked by position
accuracy_lr, precision_lr, recall_lr, f1_lr = (
    metric(y_test, y_pred_log_reg)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# ROC AUC uses the predicted probabilities rather than the hard labels
roc_auc_lr = roc_auc_score(y_test, y_pred_proba_log_reg)

model_results['Logistic Regression'] = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC'),
    (accuracy_lr, precision_lr, recall_lr, f1_lr, roc_auc_lr),
))

# Summary lines, one metric per line with four decimals
print("\n".join([
    f"Accuracy: {accuracy_lr:.4f}",
    f"Precision: {precision_lr:.4f}",
    f"Recall: {recall_lr:.4f}",
    f"F1-Score: {f1_lr:.4f}",
    f"ROC AUC Score: {roc_auc_lr:.4f}",
]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_reg))
print("Classification Report:\n", classification_report(y_test, y_pred_log_reg))

In [None]:
# --- 2. Decision Tree Classifier ---
print("\n--- Decision Tree Classifier ---")

# Train a tree with default settings (fixed seed) and predict on the test set
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)
y_pred_proba_dt = dt_clf.predict_proba(X_test)[:, 1]

# Compute the metrics straight into the shared results mapping ...
model_results['Decision Tree'] = {
    'Accuracy': accuracy_score(y_test, y_pred_dt),
    'Precision': precision_score(y_test, y_pred_dt),
    'Recall': recall_score(y_test, y_pred_dt),
    'F1-Score': f1_score(y_test, y_pred_dt),
    'ROC AUC': roc_auc_score(y_test, y_pred_proba_dt),
}
# ... and expose them under individual names as well (dicts keep insertion order)
accuracy_dt, precision_dt, recall_dt, f1_dt, roc_auc_dt = model_results['Decision Tree'].values()

for _heading, _score in zip(
    ("Accuracy", "Precision", "Recall", "F1-Score", "ROC AUC Score"),
    (accuracy_dt, precision_dt, recall_dt, f1_dt, roc_auc_dt),
):
    print(f"{_heading}: {_score:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("Classification Report:\n", classification_report(y_test, y_pred_dt))

In [None]:
# --- 3. Random Forest Classifier ---
print("\n--- Random Forest Classifier ---")
# Requires X_train, y_train, X_test, y_test from the data splitting step.

# 100 trees, fixed seed, trained using all available processors (n_jobs=-1)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
y_pred_proba_rf = rf_clf.predict_proba(X_test)[:, 1]  # churn probability, used for ROC AUC

# Evaluate on the held-out test set
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
roc_auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

# Create the results mapping if no earlier cell in this session defined it
try:
    model_results
except NameError:
    model_results = {}

model_results['Random Forest'] = dict(zip(
    ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC'),
    (accuracy_rf, precision_rf, recall_rf, f1_rf, roc_auc_rf),
))

print("\n".join([
    f"Accuracy: {accuracy_rf:.4f}",
    f"Precision: {precision_rf:.4f}",
    f"Recall: {recall_rf:.4f}",
    f"F1-Score: {f1_rf:.4f}",
    f"ROC AUC Score: {roc_auc_rf:.4f}",
]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

In [None]:
# --- Display all model results ---
print("\n--- Model Comparison ---")

# Metrics recorded from an earlier run of this notebook. They serve only as a
# fallback when no model has been evaluated in the current session.
recorded_results = {
    'Logistic Regression': {'Accuracy': 0.8041, 'Precision': 0.6541, 'Recall': 0.5561, 'F1-Score': 0.6012, 'ROC AUC': 0.8425},
    'Decision Tree': {'Accuracy': 0.7253, 'Precision': 0.4825, 'Recall': 0.4786, 'F1-Score': 0.4805, 'ROC AUC': 0.6460},
    'Random Forest': {'Accuracy': 0.7850, 'Precision': 0.6187, 'Recall': 0.4947, 'F1-Score': 0.5498, 'ROC AUC': 0.8248}
}

# Prefer the metrics computed in this session so the table always reflects the
# models that were actually trained. Hard-coded numbers go stale as soon as the
# data, the split, or a model setting changes.
session_results = globals().get('model_results')
if session_results:
    model_results_data = dict(session_results)
else:
    print("model_results is empty; showing metrics recorded from an earlier run.")
    model_results_data = recorded_results

# One row per model, one column per metric, best ROC AUC first
results_df = pd.DataFrame(model_results_data).T # Transpose for better readability
print(results_df.sort_values(by='ROC AUC', ascending=False))

In [None]:
# Needs the trained Random Forest (rf_clf) and X_train (for the column names)
# from the earlier cells; bail out with a hint when either one is missing.

if 'rf_clf' not in globals() or 'X_train' not in globals():
    print("Please make sure you have trained the Random Forest model ('rf_clf') and have 'X_train' available.")
else:
    # Importance scores reported by the fitted forest
    importances = rf_clf.feature_importances_

    # Label each score with its column name (the names after preprocessing,
    # including the one-hot encoded columns)
    feature_names = X_train.columns
    feature_importance_series = pd.Series(importances, index=feature_names)

    # Highest importance first
    sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

    print("Feature Importances from Random Forest:\n")
    print(sorted_feature_importances)

    # Horizontal bar chart of the N highest-ranked features
    N = 15
    top_features = sorted_feature_importances.head(N)
    plt.figure(figsize=(10, 8))
    sns.barplot(x=top_features.values, y=top_features.index)
    plt.xlabel("Importance Score")
    plt.ylabel("Features")
    plt.title(f"Top {N} Most Important Features (Random Forest)")
    plt.tight_layout()
    plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# --- Hyperparameter Tuning for Random Forest ---
print("--- Starting Hyperparameter Tuning for Random Forest ---")
print("This might take a few minutes depending on the grid size and your system...\n")

# Search space. Deliberately small to keep the runtime manageable; a more
# exhaustive search would list more values per parameter.
param_grid_rf = dict(
    n_estimators=[100, 200],           # number of trees
    max_depth=[None, 10, 20],          # maximum depth of each tree
    min_samples_split=[2, 5],          # minimum samples required to split a node
    min_samples_leaf=[1, 2],           # minimum samples required at a leaf
    max_features=['sqrt', 'log2'],     # features considered at every split
)

# Base estimator handed to the grid search
rf_clf_for_tuning = RandomForestClassifier(random_state=42, n_jobs=-1)

# Grid search configuration:
#  * scoring='roc_auc' ranks candidates by ROC AUC
#  * cv=3 uses 3-fold cross-validation (more folds cost more time)
#  * verbose=1 prints progress while it runs
#  * n_jobs=-1 uses all available CPU cores
grid_search_rf = GridSearchCV(
    estimator=rf_clf_for_tuning,
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# The expensive step: fit every candidate on the training data
grid_search_rf.fit(X_train, y_train)

# Report the winning parameter combination
print("\nBest Hyperparameters found by GridSearchCV:")
print(grid_search_rf.best_params_)

# Model carrying the best parameters
best_rf_model = grid_search_rf.best_estimator_

print("\n--- Evaluating Tuned Random Forest Model ---")
# Predict labels and churn probabilities on the test set with the tuned model
y_pred_best_rf = best_rf_model.predict(X_test)
y_pred_proba_best_rf = best_rf_model.predict_proba(X_test)[:, 1]

# Score the tuned model
accuracy_best_rf = accuracy_score(y_test, y_pred_best_rf)
precision_best_rf = precision_score(y_test, y_pred_best_rf)
recall_best_rf = recall_score(y_test, y_pred_best_rf)
f1_best_rf = f1_score(y_test, y_pred_best_rf)
roc_auc_best_rf = roc_auc_score(y_test, y_pred_proba_best_rf)

# Names of the metrics, in the order used for storing and comparing them
rf_metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC AUC']

# Create the results mapping if no earlier cell in this session defined it
try:
    model_results
except NameError:
    model_results = {}
model_results['Random Forest (Tuned)'] = dict(zip(
    rf_metric_names,
    (accuracy_best_rf, precision_best_rf, recall_best_rf, f1_best_rf, roc_auc_best_rf),
))

print("\n".join([
    f"Accuracy: {accuracy_best_rf:.4f}",
    f"Precision: {precision_best_rf:.4f}",
    f"Recall: {recall_best_rf:.4f}",
    f"F1-Score: {f1_best_rf:.4f}",
    f"ROC AUC Score: {roc_auc_best_rf:.4f}",
]))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_best_rf))

# --- Compare with the original Random Forest ---
print("\n--- Comparison with Original Random Forest ---")
# The untuned metrics may be absent (empty dict) if that cell was not run;
# each missing metric is then shown as 'N/A'.
original_rf_metrics = model_results.get('Random Forest', {})
tuned_rf_metrics = model_results['Random Forest (Tuned)']

comparison_data = {
    'Metric': list(rf_metric_names),
    'Original RF': [original_rf_metrics.get(name, 'N/A') for name in rf_metric_names],
    'Tuned RF': [tuned_rf_metrics.get(name, 'N/A') for name in rf_metric_names],
}
comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)