Importing the required libraries

In [None]:
from google.colab import files
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import StandardScaler

Uploading the Excel file to Google Colab

In [None]:
uploaded = files.upload() # Upload the Excel workbook (.xlsx) here; the loading cell reads it by file name

Data Preprocessing

1. Loading the data

In [None]:
# Read the uploaded workbook into a DataFrame, then show summary statistics
# (the trailing expression is rendered as the cell's output).
customer_data = pd.read_excel(io="customer_churn_large_dataset.xlsx")
customer_data.describe()

2. Checking for missing data

In [None]:
# Boolean mask with True wherever a cell is missing.
missing_data = customer_data.isna()

# Summing the mask column-wise gives the number of missing entries per column.
missing_count = missing_data.sum(axis=0)
print(missing_count)

3. Checking for outliers

In [None]:
# Summary statistics as a first sanity check. print() is required here because a
# notebook only renders a cell's *last* expression automatically.
print(customer_data.describe())

# Numeric measurements screened for outliers. Every check below is limited to these
# columns so that text columns (e.g. Gender, Location) never enter the arithmetic.
numeric_columns = ["Age", "Subscription_Length_Months", "Monthly_Bill", "Total_Usage_GB"]
numeric_data = customer_data[numeric_columns]

#Boxplot
plt.figure(figsize = (8, 6))
sns.boxplot(data = numeric_data, orient = 'v')
plt.title("Box Plot for Outlier Detection")
plt.show()

#Z-score
# A row is flagged when any feature lies more than this many standard deviations
# from its column mean.
Z_SCORE_THRESHOLD = 2
z_scores = stats.zscore(numeric_data)
# Compare the absolute z-score so unusually low values are caught as well as
# unusually high ones.
outliers = (abs(z_scores) > Z_SCORE_THRESHOLD).any(axis = 1)
print(customer_data[numeric_columns][outliers])

#IQR Method
# Quartiles are computed on the numeric columns only; calling quantile() or the
# comparisons below on text columns raises TypeError in recent pandas versions.
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = ((numeric_data < lower_fence) | (numeric_data > upper_fence)).any(axis = 1)
# Show the complete rows (all columns) for the flagged customers.
print(customer_data[outliers])


4. Encoding categorical variables

In [None]:
# One encoder per column: refitting a single shared encoder would overwrite the
# Gender mapping when Location is fitted, making it impossible to decode Gender
# codes or encode new Gender values later.
gender_encoder = LabelEncoder()
location_encoder = LabelEncoder()

customer_data["Gender_LabelEncoded"] = gender_encoder.fit_transform(customer_data["Gender"])
customer_data["Location_LabelEncoded"] = location_encoder.fit_transform(customer_data["Location"])

# Show the label -> integer code mapping; LabelEncoder numbers the classes in the
# order stored in classes_ (sorted), and these are the codes a user must enter
# when supplying encoded values by hand.
print("Gender codes:", {label: code for code, label in enumerate(gender_encoder.classes_)})
print("Location codes:", {label: code for code, label in enumerate(location_encoder.classes_)})

# Preview only the first rows instead of dumping the whole frame.
print(customer_data.head())

Feature Engineering

1. Generating relevant features to improve the model's prediction accuracy

In [None]:
#Average monthly usage
customer_data["Average_Monthly_Usage_GB"] = customer_data["Total_Usage_GB"] / customer_data["Subscription_Length_Months"]

#Bill-to-usage ratio
customer_data["Bill_to_Usage_Ratio"] = customer_data["Monthly_Bill"] / customer_data["Total_Usage_GB"]

customer_data.describe()

Splitting into training and test data

In [None]:

# Feature matrix: the encoded categoricals, two raw numeric columns and the two
# engineered ratios. The target is the Churn column.
X = customer_data[
    [
        "Gender_LabelEncoded",
        "Location_LabelEncoded",
        "Subscription_Length_Months",
        "Monthly_Bill",
        "Average_Monthly_Usage_GB",
        "Bill_to_Usage_Ratio",
    ]
]
y = customer_data["Churn"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)
print(X_train)

Model Building

In [None]:
# Baseline classifier: logistic regression fitted on the training split.
# max_iter is raised above scikit-learn's default of 100 because the features are
# fed in unscaled and span very different magnitudes, a setting in which the
# solver can hit the iteration cap before converging.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on the held-out rows and score against the true labels.
y_pred = lr_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Report:")
print(report)

Model Deployment

In [None]:
def predict_churn(customer_features, model):
    """Return the model's prediction(s) for the given feature rows.

    Parameters
    ----------
    customer_features : pandas.DataFrame
        One row per customer, holding the feature columns the model was fitted on.
    model : object
        A fitted estimator exposing a ``predict`` method.

    Returns
    -------
    array-like
        Whatever ``model.predict(customer_features)`` returns, one entry per row.
    """
    return model.predict(customer_features)


def predict_from_input():
    """Interactively collect one customer's features and predict churn.

    Prompts on standard input for the six model features, builds a one-row
    DataFrame whose columns use the same names and order as the training
    feature matrix, and passes it to ``predict_churn`` together with the
    notebook-level ``lr_model``.

    Returns
    -------
    array-like
        The prediction produced by ``predict_churn`` for the entered customer.

    Raises
    ------
    ValueError
        If an entered value cannot be parsed as the expected int/float.
    """
    # Get user input data.
    # LabelEncoder numbers classes in sorted label order, so with the labels
    # "Female" and "Male" the codes are Female=0 and Male=1.
    # NOTE(review): confirm against the fitted encoder's classes_.
    gender = int(input('Enter Gender (0 for Female, 1 for Male): '))
    location = int(input('Enter Location (0 for Location A, 1 for Location B, etc.): '))
    subscription_length = int(input('Enter Subscription Length (in months): '))
    monthly_bill = float(input('Enter Monthly Bill: '))
    average_monthly_usage = float(input('Enter Average Monthly Usage (in GB): '))
    bill_to_usage_ratio = float(input('Enter Bill to Usage Ratio: '))

    # Create a one-row DataFrame; the keys mirror the training feature columns.
    new_customer_data = pd.DataFrame({
        'Gender_LabelEncoded': [gender],
        'Location_LabelEncoded': [location],
        'Subscription_Length_Months': [subscription_length],
        'Monthly_Bill': [monthly_bill],
        'Average_Monthly_Usage_GB': [average_monthly_usage],
        'Bill_to_Usage_Ratio': [bill_to_usage_ratio],
    })

    # Make predictions
    prediction = predict_churn(new_customer_data, lr_model)

    return prediction

In [None]:
# Run the interactive prompt; the returned prediction is rendered as this cell's output.
predict_from_input()