In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf # Works as a connector between the pandas library and plotly
cf.go_offline()
from sklearn.model_selection import GridSearchCV

In [3]:
import pandas as pd

# Load the dataset from Excel
df = pd.read_excel('/content/customer_churn_large_dataset.xlsx')

# Display basic information about the dataset
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB
None


In [4]:
# Drop unnecessary columns (CustomerID and Name)
df_cleaned = df.drop(['CustomerID', 'Name'], axis=1)

# Drop rows with missing values
df_cleaned = df_cleaned.dropna()

# Handle outliers using z-score method
from scipy import stats
z_scores = stats.zscore(df_cleaned[['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']])
df_cleaned_no_outliers = df_cleaned[(z_scores < 3).all(axis=1)]

# Separate features (X) and target (y)
X = df_cleaned_no_outliers.drop('Churn', axis=1)
y = df_cleaned_no_outliers['Churn']


In [5]:
# Encode categorical variables using one-hot encoding
X_encoded = pd.get_dummies(X, columns=['Gender', 'Location'])

# Scale the numeric columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numeric_columns = ['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB']
X_scaled = X_encoded.copy()
X_scaled[numeric_columns] = scaler.fit_transform(X_encoded[numeric_columns])

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create a Logistic Regression model
model = LogisticRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)


Accuracy: 0.5037
Precision: 0.49966777408637875
Recall: 0.3789940530188489
F1 Score: 0.431044365470595


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest Classifier model
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Predict on the test data
rf_y_pred = rf_model.predict(X_test)

# Evaluate the model's performance
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred)
rf_recall = recall_score(y_test, rf_y_pred)
rf_f1 = f1_score(y_test, rf_y_pred)

print("Random Forest Classifier:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1)

Random Forest Classifier:
Accuracy: 0.497
Precision: 0.49268189954722547
Recall: 0.4716258441689346
F1 Score: 0.48192398805232267


In [8]:
from sklearn.svm import SVC

# Create a Support Vector Machine Classifier model
svm_model = SVC(random_state=42)

# Train the model on the training data
svm_model.fit(X_train, y_train)

# Predict on the test data
svm_y_pred = svm_model.predict(X_test)

# Evaluate the model's performance
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_precision = precision_score(y_test, svm_y_pred)
svm_recall = recall_score(y_test, svm_y_pred)
svm_f1 = f1_score(y_test, svm_y_pred)

print("Support Vector Machine (SVM) Classifier:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)


Support Vector Machine (SVM) Classifier:
Accuracy: 0.50185
Precision: 0.4975756176402678
Recall: 0.43443201290192524
F1 Score: 0.4638648226874025


In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Create a Gradient Boosting Classifier model
gb_model = GradientBoostingClassifier(random_state=42)

# Train the model on the training data
gb_model.fit(X_train, y_train)

# Predict on the test data
gb_y_pred = gb_model.predict(X_test)

# Evaluate the model's performance
gb_accuracy = accuracy_score(y_test, gb_y_pred)
gb_precision = precision_score(y_test, gb_y_pred)
gb_recall = recall_score(y_test, gb_y_pred)
gb_f1 = f1_score(y_test, gb_y_pred)

print("Gradient Boosting Classifier:")
print("Accuracy:", gb_accuracy)
print("Precision:", gb_precision)
print("Recall:", gb_recall)
print("F1 Score:", gb_f1)


Gradient Boosting Classifier:
Accuracy: 0.49935
Precision: 0.49490022172949
Recall: 0.4499546416691866
F1 Score: 0.47135842880523726


In [10]:
from sklearn.neural_network import MLPClassifier

# Create a Neural Network model
nn_model = MLPClassifier(random_state=42)

# Train the model on the training data
nn_model.fit(X_train, y_train)

# Predict on the test data
nn_y_pred = nn_model.predict(X_test)

# Evaluate the model's performance
nn_accuracy = accuracy_score(y_test, nn_y_pred)
nn_precision = precision_score(y_test, nn_y_pred)
nn_recall = recall_score(y_test, nn_y_pred)
nn_f1 = f1_score(y_test, nn_y_pred)

print("Neural Network Classifier:")
print("Accuracy:", nn_accuracy)
print("Precision:", nn_precision)
print("Recall:", nn_recall)
print("F1 Score:", nn_f1)


Neural Network Classifier:
Accuracy: 0.50605
Precision: 0.5033578509753758
Recall: 0.31730672311258945
F1 Score: 0.38924265842349304


In [11]:
print("Logistic Regression:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

print("\nRandom Forest Classifier:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1)

print("\nSupport Vector Machine (SVM) Classifier:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)

print("\nNeural Network model:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)

print("\nGradient Boosting Classifier:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1 Score:", svm_f1)


Logistic Regression:
Accuracy: 0.5037
Precision: 0.49966777408637875
Recall: 0.3789940530188489
F1 Score: 0.431044365470595

Random Forest Classifier:
Accuracy: 0.497
Precision: 0.49268189954722547
Recall: 0.4716258441689346
F1 Score: 0.48192398805232267

Support Vector Machine (SVM) Classifier:
Accuracy: 0.50185
Precision: 0.4975756176402678
Recall: 0.43443201290192524
F1 Score: 0.4638648226874025

Neural Network model:
Accuracy: 0.50185
Precision: 0.4975756176402678
Recall: 0.43443201290192524
F1 Score: 0.4638648226874025

Gradient Boosting Classifier:
Accuracy: 0.50185
Precision: 0.4975756176402678
Recall: 0.43443201290192524
F1 Score: 0.4638648226874025


In [12]:
def predict_churn(new_data):
    # Preprocess new data
    new_data_cleaned = new_data.drop(['CustomerID', 'Name'], axis=1)
    new_data_encoded = pd.get_dummies(new_data_cleaned, columns=['Gender', 'Location'])
    new_data_scaled = new_data_encoded.copy()
    new_data_scaled[numeric_columns] = scaler.transform(new_data_encoded[numeric_columns])

    # Make predictions using the trained model
    predictions = model.predict(new_data_scaled)
    return predictions


In [13]:
def predict_churn(new_data):
    # Preprocess new data
    new_data_cleaned = new_data.drop(['CustomerID', 'Name'], axis=1)
    new_data_encoded = pd.get_dummies(new_data_cleaned, columns=['Gender', 'Location'])
    new_data_scaled = new_data_encoded.copy()
    new_data_scaled[numeric_columns] = scaler.transform(new_data_encoded[numeric_columns])

    # Make predictions using the trained model
    predictions = model.predict(new_data_scaled)

    # Process predictions (if needed) and return results
    result = []
    for pred in predictions:
        if pred == 0:
            result.append('Not Churn')
        else:
            result.append('Churn')

    return result

In [14]:
print(X_test.columns)

Index(['Age', 'Subscription_Length_Months', 'Monthly_Bill', 'Total_Usage_GB',
       'Gender_Female', 'Gender_Male', 'Location_Chicago', 'Location_Houston',
       'Location_Los Angeles', 'Location_Miami', 'Location_New York'],
      dtype='object')


In [15]:
# Scale the numeric columns in the test dataset
X_test_scaled = X_test.copy()  # Create a copy of X_test to avoid modifying the original DataFrame
X_test_scaled[numeric_columns] = scaler.transform(X_test_scaled[numeric_columns])

# Predict on the scaled test data
y_pred = model.predict(X_test_scaled)

# Evaluate the model's performance on the test data
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)

print("Model Performance on Test Data:")
print("Accuracy:", test_accuracy)
print("Precision:", test_precision)
print("Recall:", test_recall)
print("F1 Score:", test_f1)

Model Performance on Test Data:
Accuracy: 0.50345
Precision: 0.4991397109428768
Recall: 0.29241003931055337
F1 Score: 0.36877899955507537


In [18]:
import pickle

In [19]:
# Perform feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Instantiate and train your SVM model
svm_model = SVC(kernel='linear')  # You can adjust parameters as needed
svm_model.fit(X_train_scaled, y_train)

# Save the trained SVM model using pickle
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(svm_model, model_file)

In [20]:
# Save the scaler using pickle
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [22]:
from flask import Flask, request, jsonify
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)

# Load the trained model
with open('trained_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Load the scaler
with open('scaler.pkl', 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)

# Endpoint to receive input data and return predictions
@app.route('/predict', methods=['POST'])
def predict():
    try:
        # Get input data from the request
        input_data = request.json

        # Preprocess input data
        input_df = pd.DataFrame(input_data, index=[0])
        input_scaled = scaler.transform(input_df)

        # Make predictions
        predictions = model.predict(input_scaled)

        # Format predictions
        churn_labels = ['Not Churn', 'Churn']
        prediction_labels = [churn_labels[pred] for pred in predictions]

        return jsonify({'predictions': prediction_labels})
    except Exception as e:
        return jsonify({'error': str(e)})

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
