<a href="https://colab.research.google.com/github/ujwaldeepkadiyam/Data_Science_and_Machine_Learning/blob/main/Project_Notebook_2_Telecom_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # this is used for the plot the graph
import seaborn as sns # used for plot interactive graph.
import warnings
warnings.filterwarnings("ignore")

from pylab import rcParams

### Data Download

In [None]:
# Location of the raw CSV on GitHub; pandas reads it directly over HTTP(S)
data_download_path='https://raw.githubusercontent.com/venkatareddykonasani/Datasets/master/IBM_Telco_Data/IBM_Cognos_Data.csv'
# Load the dataset into the DataFrame used by every later cell
churn_data = pd.read_csv(data_download_path)
# Preview the first five rows (rendered as the cell output)
churn_data.head()

In [None]:
def data_overview(df, message="Data Overview"):
    """
    Print an overview of the dataset, including key statistics and information.

    The report contains, in order: the title, the row and column counts, the
    column names, the ``DataFrame.info()`` summary (dtypes and non-null
    counts), the total number of missing values, and the number of unique
    values per column.

    Parameters:
    - df (DataFrame): The input DataFrame to be analyzed.
    - message (str): An optional message to be displayed as the title of the overview.

    Returns:
    None (prints the overview to the console).
    """
    # Display the provided message as the title of the overview
    print(f'{message}:\n')

    # Print the number of rows and features in the dataset
    print("Rows:", df.shape[0])
    print("Number of features:", df.shape[1])

    # Print the names of all features in the dataset
    print("\nFeatures:")
    print(df.columns.tolist())

    # Print the variable types in the dataset.
    # df.info() writes its report itself and returns None, so it must not be
    # passed to print(): that emitted the report *before* the heading and then
    # printed "Variable Data Types: None". Print the heading first instead.
    print("\nVariable Data Types:")
    df.info()

    # Print the total count of missing values in the dataset
    print("\nMissing values:", df.isnull().sum().values.sum())

    # Print the number of unique values for each feature in the dataset
    print("\nUnique values:")
    print(df.nunique())


In [None]:
# Print shape, column names, dtypes, missing-value total and per-column unique counts
data_overview(churn_data)

In [None]:
# Summary statistics from DataFrame.describe() (rendered as the cell output)
churn_data.describe()

## Target Variable Analysis

In [None]:
# Number of rows per target class; sort=False skips ordering the counts by frequency
churn_data['Churn'].value_counts(sort = False)

In [None]:
# Bar chart of the number of rows in each 'Churn' class
sns.countplot(x='Churn', data=churn_data)
plt.show()

In [None]:
# # Data to plot
# churn_labels = churn_data['Churn'].value_counts(sort=True).index
# churn_sizes = churn_data['Churn'].value_counts(sort=True)

# # Define colors for the pie chart
# colors = ["Green", "red"]

# # Define the degree to which the first slice should be exploded
# explode = (0.1, 0)

# # Set the size of the plot
# rcParams['figure.figsize'] = 6, 6

# # Plotting the pie chart
# plt.pie(churn_sizes, explode=explode, labels=churn_labels, colors=colors,
#         autopct='%1.1f%%', shadow=True, startangle=90)

# # Set the title of the plot
# plt.title('Percentage of Churn in Customers')

# # Display the pie chart
# plt.show()

## Analysis of Categorical Variables

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def display_frequency_and_bar_chart(df, column_name):
    """
    Print the value counts of one column and draw them as a bar chart.

    Parameters:
    - df (DataFrame): Frame that contains the column.
    - column_name (str): Column whose distinct values are counted.

    Returns:
    None. The table is printed and the chart is shown with ``plt.show()``.
    """
    # Count once; the same Series feeds both the chart and the printed table
    counts = df[column_name].value_counts()

    # New 6x3 figure; Series.plot draws onto its current axes and returns them
    plt.figure(figsize=(6, 3))
    ax = counts.plot(kind='bar', color='skyblue', edgecolor='black')

    # Title and axis labels
    ax.set_title(f'Frequency of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Frequency')

    # Text table first, then the rendered chart
    print(f'\nFrequency Table for {column_name}:\n')
    print(counts)
    plt.show()

# Columns of churn_data to summarise as frequency table + bar chart
columns_to_analyze = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                      'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
                      'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                      'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

# One table and one chart per listed column
for column in columns_to_analyze:
    display_frequency_and_bar_chart(churn_data, column)


## Analysis on Continuous Variables

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def display_stats_and_plots(df, variable_names):
    """
    For each named column, print decile statistics and show a box plot next to a histogram.

    Parameters:
    - df (DataFrame): Frame that contains the columns.
    - variable_names (list): Names of the columns to summarise.

    Returns:
    None. Statistics are printed and one figure per column is shown.
    """
    # Deciles plus the 100th percentile, requested from describe()
    percentile_grid = [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1]

    for name in variable_names:
        series = df[name]

        # Descriptive statistics including the percentiles above
        print(f"\nPercentiles for {name}:\n")
        print(series.describe(percentiles=percentile_grid))

        # One row, two panels: box plot on the left, histogram on the right
        fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(14, 6))

        sns.boxplot(y=name, data=df, ax=box_ax, color='skyblue')
        box_ax.set_title(f'Box Plot of {name}')

        # 30-bin histogram with a kernel density estimate overlaid
        sns.histplot(series, bins=30, kde=True, color='skyblue', ax=hist_ax)
        hist_ax.set_title(f'Histogram of {name}')

        # Tidy the spacing, then render this column's figure
        plt.tight_layout()
        plt.show()

# Columns of churn_data to summarise with percentiles, a box plot and a histogram
variables_to_analyze = ['tenure', 'MonthlyCharges', 'TotalCharges']
display_stats_and_plots(churn_data, variables_to_analyze)


## Data Cleaning and Preparing for Analysis



| Feature         | **Nominal Data**                         | **Ordinal Data**                      |
|---------------|--------------------------------|--------------------------------|
| **Definition**  | Categorical data with no inherent order | Categorical data with a meaningful order |
| **Order**       | No order or ranking | Ordered categories |
| **Comparison**  | Only equality (e.g., "Apple" ≠ "Banana") | Can compare (e.g., "Good" > "Average") |
| **Examples**    | Colors (Red, Blue, Green), Gender (Male, Female), Nationality (Indian, American) | Education Level (High School < Bachelor's < Master's), Satisfaction Ratings (Bad < Average < Good) |
| **Numerical Meaning** | No numerical meaning | Relative ranking but no fixed difference |
| **Encoding Methods** | One-Hot Encoding, Binary Encoding | Label Encoding, Ordinal Encoding |
| **Use Case**    | Identifying categories without ranking | Identifying ranked categories without exact intervals |


### Mapping binary categorical variables to numeric values in churn_data

In [None]:
# Mapping binary categorical variables to numeric values in churn_data

# Columns encoded as 1 when the value is exactly 'Yes' and 0 for anything else.
# 'MultipleLines' also contains 'No phone service'; because every non-'Yes'
# value becomes 0, it is encoded like 'No' without a separate replace step.
# (The former `churn_data['MultipleLines'].replace(..., inplace=True)` was a
# chained in-place update on a column selection, which pandas deprecates and
# which does not modify the frame under Copy-on-Write.)
yes_no_columns = [
    'Churn', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling',
    'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
]

for column in yes_no_columns:
    # Vectorised comparison; missing values compare unequal and become 0,
    # exactly as the previous per-element lambda did
    churn_data[column] = (churn_data[column] == 'Yes').astype('int64')

# Convert 'gender' column to binary (1 for 'Male', 0 for any other value)
churn_data['gender'] = (churn_data['gender'] == 'Male').astype('int64')

#churn_data.info()

### One-Hot Encoding of categorical variables

In [None]:
# One-Hot Encoding for 'InternetService', 'PaymentMethod' and 'Contract' columns in churn_data

# Encode all three nominal columns in a single pass. get_dummies drops each
# source column and appends its indicator columns (prefixed with the source
# column name) after the untouched columns, in the order listed here.
nominal_columns = ['InternetService', 'PaymentMethod', 'Contract']
churn_data = pd.get_dummies(data=churn_data, columns=nominal_columns)



Remaining variables

In [None]:
# Cast both charge columns to integers.
# NOTE(review): if these columns hold fractional amounts, astype(int) truncates
# them toward zero (drops the cents) and raises on missing values — confirm
# that this loss of precision is intended.
churn_data["MonthlyCharges"]=churn_data["MonthlyCharges"].astype(int)
churn_data["TotalCharges"]=churn_data["TotalCharges"].astype(int)

In [None]:
# Re-inspect column dtypes and non-null counts after the encoding and casting steps
churn_data.info()

# 4.ML Model Building:

## Train and Test data preparation

In [None]:
# Model inputs: every column except 'customerID' and the target 'Churn'
features=[col for col in churn_data.columns if col not in ['customerID', 'Churn']]
print(features)

In [None]:
from sklearn.model_selection import train_test_split

# Target vector (cast to int) and feature matrix restricted to the selected columns
Y = churn_data["Churn"].astype(int)
X = churn_data[features]

# Splitting the data into training and testing sets (80% train, 20% test).
# random_state=42 makes the partition repeatable; no `stratify` argument is
# passed, so the class ratio is not forced to match between the two sets.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Displaying the shapes of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of Y_train:", Y_train.shape)
print("Shape of Y_test:", Y_test.shape)


## Decision Tree model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Instantiate the Decision Tree model, limited to 20 leaves.
# random_state is fixed (matching the value used for the train/test split) so
# that tie-breaking between equally good splits, and therefore the reported
# metrics, are reproducible from run to run.
decision_tree_model = DecisionTreeClassifier(max_leaf_nodes=20, random_state=42)

# Train the model on the training data
decision_tree_model.fit(X_train, Y_train)

# Predictions on the training set
train_predictions = decision_tree_model.predict(X_train)

# Predictions on the test set
test_predictions = decision_tree_model.predict(X_test)

# Evaluate the model on the training set
train_accuracy = accuracy_score(Y_train, train_predictions)
print("Training Accuracy:", train_accuracy)
print("\nClassification Report on Training Data:")
print(classification_report(Y_train, train_predictions))

# Evaluate the model on the held-out test set
test_accuracy = accuracy_score(Y_test, test_predictions)
print("\nTesting Accuracy:", test_accuracy)
print("\nClassification Report on Test Data:")
print(classification_report(Y_test, test_predictions))


### Decision Tree Result



**Class Imbalance Impact:**
The class imbalance is reflected in the lower performance metrics for the minority class (Churn: Yes).
The model is more accurate at predicting instances of the majority class.

**Potential Improvements:**

Techniques like oversampling (SMOTE) can be explored to address class imbalance and potentially improve performance, especially for the minority class.

**Model Limitations:**

While the model demonstrates reasonable accuracy, precision, and recall, further improvements and fine-tuning may be explored to enhance its predictive capabilities.

**Next Steps:**

Consideration of alternative models, such as Random Forest, may be beneficial to evaluate if ensemble methods lead to improved accuracy.

## Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Instantiate the Random Forest model: 100 trees, at most 5 candidate features
# per split, depth capped at 8.
# random_state is fixed (matching the value used for the train/test split)
# because bootstrap sampling and per-split feature sampling are random; without
# a seed the metrics and feature importances change on every run.
random_forest_model = RandomForestClassifier(
    n_estimators=100, max_features=5, max_depth=8, random_state=42
)

# Train the model on the training data
random_forest_model.fit(X_train, Y_train)

# Predictions on the training set
train_predictions = random_forest_model.predict(X_train)

# Predictions on the test set
test_predictions = random_forest_model.predict(X_test)

# Evaluate the model on the training set
train_accuracy = accuracy_score(Y_train, train_predictions)
print("Training Accuracy:", train_accuracy)
print("\nClassification Report on Training Data:")
print(classification_report(Y_train, train_predictions))

# Evaluate the model on the held-out test set
test_accuracy = accuracy_score(Y_test, test_predictions)
print("\nTesting Accuracy:", test_accuracy)
print("\nClassification Report on Test Data:")
print(classification_report(Y_test, test_predictions))


### RF Model Result



**Advantages of Random Forest:**

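Random Forest, with its ensemble nature, demonstrates better overall predictive performance than the Decision Tree model.
Accuracy, churn-class precision, and churn-class F1-score improve on both the training and testing datasets. Churn-class recall, however, is slightly lower than the Decision Tree's (57% vs. 58% on training, 51% vs. 55% on testing), so the gain comes mainly from fewer false alarms rather than from catching more churners.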

**Considerations for Deployment:**

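Random Forest may be a more suitable model for deployment due to its better overall accuracy and precision. On its own, however, it does not resolve the class imbalance: recall for the churn class remains low, which motivates the SMOTE step later in this notebook.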
Continued monitoring and potential hyperparameter tuning can further enhance the model's effectiveness.

**Final Recommendations:**

Given the observed improvements, the Random Forest model is recommended for predicting churn in this scenario.
Ongoing evaluation and refinement should be considered for continuous model improvement.

## Important features

In [None]:
# Pair each training column with its importance score from the fitted
# `random_forest_model`, building the two-column table in one step
feature_importances = pd.DataFrame({
    "Importance": random_forest_model.feature_importances_,
    "Feature": list(X_train.columns),
})

# Most important features first
sorted_feature_importances = feature_importances.sort_values(by="Importance", ascending=False)

# Keep and display the first five rows of the ranking (the default for head())
top_features = sorted_feature_importances.head()
print("Top Features based on Random Forest Feature Importances:")
print(top_features)


The Random Forest model's feature importance analysis reveals critical insights into the factors influencing customer churn prediction. Among the top features, 'tenure' emerges as the most significant determinant, indicating the length of time a customer has been with the telecom service plays a crucial role in predicting churn. Additionally, the contractual agreement type, particularly 'Contract_Month-to-month,' holds substantial importance, suggesting that customers with month-to-month contracts are more likely to churn. 'TotalCharges' and 'MonthlyCharges' also exhibit notable importance, highlighting the financial aspect of customer relationships as a contributing factor. Notably, the presence of 'InternetService_Fiber optic' signifies that the type of internet service subscribed to significantly impacts churn predictions. This feature importance analysis provides actionable insights for telecom companies to strategically address customer retention, focusing on contract types, service duration, and financial considerations to mitigate churn effectively.

## SMOTE to Handle the Class Imbalance

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd


# Split the data into training and testing sets.
# Same X, Y, test_size and random_state as the earlier split, so this
# reproduces the same partition.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Instantiate SMOTE with a fixed seed.
# NOTE(review): sampling_strategy=0.6 is understood to be the target
# minority/majority ratio after resampling — confirm against the imblearn docs.
smote = SMOTE(random_state=42,sampling_strategy=0.6)

# Apply SMOTE to the training data only to avoid data leakage
# (the test set keeps its original, un-resampled class distribution)
X_train_resampled, Y_train_resampled = smote.fit_resample(X_train, Y_train)

# Display the shape before and after SMOTE
print("Shape of X_train before SMOTE:", X_train.shape)
print("Shape of X_train_resampled after SMOTE:", X_train_resampled.shape)

# Now, X_train_resampled and Y_train_resampled can be used for training the model.
# Class counts after resampling (rendered as the cell output)
Y_train_resampled.value_counts()

## Random Forest on Balanced Data

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Instantiate the Random Forest model: 150 trees, at most 5 candidate features
# per split, depth capped at 7. This rebinds `random_forest_model`.
# random_state is fixed (matching the split and SMOTE seeds) because bootstrap
# sampling and per-split feature sampling are random; without a seed the
# metrics change on every run.
random_forest_model = RandomForestClassifier(
    n_estimators=150, max_features=5, max_depth=7, random_state=42
)

# Train the model on the SMOTE-resampled training data
random_forest_model.fit(X_train_resampled, Y_train_resampled)

# Predictions on the resampled training set
train_predictions = random_forest_model.predict(X_train_resampled)

# Predictions on the original (un-resampled) test set
test_predictions = random_forest_model.predict(X_test)

# Evaluate the model on the resampled training set
train_accuracy = accuracy_score(Y_train_resampled, train_predictions)
print("Training Accuracy:", train_accuracy)
print("\nClassification Report on Training Data:")
print(classification_report(Y_train_resampled, train_predictions))

# Evaluate the model on the test set
test_accuracy = accuracy_score(Y_test, test_predictions)
print("\nTesting Accuracy:", test_accuracy)
print("\nClassification Report on Test Data:")
print(classification_report(Y_test, test_predictions))


### Impact of SMOTE


Applying SMOTE has positively influenced the model's ability to predict the minority class, as reflected in the increased recall and F1-score for Churn: Yes in both training and testing datasets.


**Balanced Performance:**

The model exhibits a more balanced performance across precision and recall for both classes, mitigating the impact of class imbalance observed in the previous Random Forest output.

**Trade-offs:**

While there is a slight trade-off in training accuracy, the model's effectiveness in predicting churn instances, especially among customers likely to churn (Churn: Yes), is significantly enhanced.

In conclusion, the Random Forest model, after incorporating the SMOTE technique, demonstrates improved sensitivity to predicting customer churn, particularly for the minority class. The trade-off in training accuracy is justifiable, considering the overall enhancement in the model's ability to capture true positives. This refined model, with its balanced performance metrics, holds promise for more accurate and reliable churn predictions. The strategic application of SMOTE proves instrumental in addressing class imbalance, offering a more robust solution for telecom companies aiming to proactively retain customers.

# 5.Data Analysis Results:

### Data Analysis Results Compilation:

#### Decision Tree Results:

1. **Training Accuracy: 0.8042**
   - Precision (Churn: No): 85%, Recall: 89%, F1-Score: 87%
   - Precision (Churn: Yes): 65%, Recall: 58%, F1-Score: 61%

2. **Testing Accuracy: 0.7935**
   - Precision (Churn: No): 84%, Recall: 88%, F1-Score: 86%
   - Precision (Churn: Yes): 62%, Recall: 55%, F1-Score: 58%

#### Random Forest Results (Before SMOTE):

1. **Training Accuracy: 0.8324**
   - Precision (Churn: No): 86%, Recall: 93%, F1-Score: 89%
   - Precision (Churn: Yes): 74%, Recall: 57%, F1-Score: 64%

2. **Testing Accuracy: 0.8105**
   - Precision (Churn: No): 84%, Recall: 92%, F1-Score: 88%
   - Precision (Churn: Yes): 69%, Recall: 51%, F1-Score: 59%

#### Random Forest Results (After SMOTE):

1. **Training Accuracy: 0.8211**
   - Precision (Churn: No): 86%, Recall: 85%, F1-Score: 86%
   - Precision (Churn: Yes): 76%, Recall: 77%, F1-Score: 76%

2. **Testing Accuracy: 0.7928**
   - Precision (Churn: No): 87%, Recall: 84%, F1-Score: 86%
   - Precision (Churn: Yes): 60%, Recall: 65%, F1-Score: 63%

#### Top Features based on Random Forest Feature Importances:

1. **Tenure (Importance: 0.177)**
2. **Contract_Month-to-month (Importance: 0.157)**
3. **TotalCharges (Importance: 0.128)**
4. **MonthlyCharges (Importance: 0.096)**
5. **InternetService_Fiber optic (Importance: 0.081)**

### Overall Inference:

1. **Decision Tree vs. Random Forest:**
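   - Random Forest outperforms the Decision Tree in accuracy, churn-class precision, and churn-class F1-score, while its churn-class recall is slightly lower (51% vs. 55% on the test set).
   - Without resampling, neither model predicts the minority class well; Random Forest's predictions become more balanced only after SMOTE is applied.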

2. **Impact of SMOTE on Random Forest:**
   - Applying SMOTE improves sensitivity to predict churn in the minority class, enhancing recall and F1-score.
   - Trade-offs in training accuracy are justifiable, considering the improved balance in predicting true positives.

3. **Feature Importance:**
   - Top features influencing churn prediction include 'Tenure,' 'Contract_Month-to-month,' 'TotalCharges,' 'MonthlyCharges,' and 'InternetService_Fiber optic.'
   - These features offer strategic insights for customer retention, emphasizing contract types, service duration, and financial considerations.

4. **Recommendations for Deployment:**
   - The Random Forest model, especially after SMOTE, is recommended for deployment due to its enhanced predictive performance and balanced predictions.
   - Ongoing monitoring and potential hyperparameter tuning can further optimize the model for continued effectiveness.