<a href="https://colab.research.google.com/github/vinay-7808/Assignment/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 1. Data Preprocessing

### Loading and Performing Initial Data Exploration

In [None]:
# Mount Google Drive (Colab only) so the dataset stored there can be read.
from google.colab import drive

drive.mount('/content/drive')

# Read the churn spreadsheet from the mounted Drive into a DataFrame.
dataset_path = '/content/drive/MyDrive/customer_churn_large_dataset.xlsx'
df = pd.read_excel(dataset_path)

# Initial exploration: first rows, summary statistics of the numeric columns,
# and the number of rows for each value of the Churn target.
print(df.head())
print(df.describe())
print(df['Churn'].value_counts())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   CustomerID        Name  Age  Gender     Location  \
0           1  Customer_1   63    Male  Los Angeles   
1           2  Customer_2   62  Female     New York   
2           3  Customer_3   24  Female  Los Angeles   
3           4  Customer_4   36  Female        Miami   
4           5  Customer_5   46  Female        Miami   

   Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  Churn  
0                          17         73.36             236      0  
1                           1         48.76             172      0  
2                           5         85.47             460      0  
3                           3         97.94             297      1  
4                          19         58.14             266      0  
          CustomerID            Age  Subscription_Length_Months  \
count  100000.000000  100000.000000               100000.00

### Handling Missing Data

In [None]:
# Record which columns contain missing values BEFORE any rows are dropped.
# Running this check after dropna() would always report an empty list, because
# every row with a missing value has already been removed by then.
missing_cols = df.columns[df.isnull().any()].tolist()
# Display the columns with missing data
print("Columns with missing data:", missing_cols)

# Drop every row that has at least one missing value. The result is assigned
# back instead of using inplace=True.
df = df.dropna()

Columns with missing data: []


In [None]:
#   No column has missing values in this dataset. If some columns did, the gaps could be
#   filled with SimpleImputer instead of dropping rows (strategy='mean' applies to numeric columns):
#   from sklearn.impute import SimpleImputer
#   imputer = SimpleImputer(strategy='mean')
#   df[missing_cols] = imputer.fit_transform(df[missing_cols])

### Handling Outliers using z-score

In [None]:
from scipy import stats

# Columns that are screened for outliers.
outlier_cols = ['Age', 'Monthly_Bill', 'Total_Usage_GB']

# Absolute z-score of every value, computed per column.
z_scores = np.abs(stats.zscore(df[outlier_cols]))

# Keep only the rows where all three z-scores are below 3.
rows_within_limit = (z_scores < 3).all(axis=1)
df = df[rows_within_limit]


In [None]:
# Create dummy (one-hot) variables for the string-typed columns.
# drop_first=True leaves out the first category of each column, so k
# categories become k-1 dummy columns.
categorical_cols = ['Gender', 'Location']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
# Display the DataFrame as it stands after the dummy-variable encoding above.
df

Unnamed: 0,CustomerID,Name,Age,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn,Gender_Male,Location_Houston,Location_Los Angeles,Location_Miami,Location_New York
0,1,Customer_1,63,17,73.36,236,0,1,0,1,0,0
1,2,Customer_2,62,1,48.76,172,0,0,0,0,0,1
2,3,Customer_3,24,5,85.47,460,0,0,0,1,0,0
3,4,Customer_4,36,3,97.94,297,1,0,0,0,1,0
4,5,Customer_5,46,19,58.14,266,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,Customer_99996,33,23,55.13,226,1,1,1,0,0,0
99996,99997,Customer_99997,62,19,61.65,351,0,0,0,0,0,1
99997,99998,Customer_99998,64,17,96.11,251,1,1,0,0,0,0
99998,99999,Customer_99999,51,20,49.25,434,1,0,0,0,0,1


### Preparing the data for machine learning by encoding categorical variables

In [None]:
# CustomerID and Name are removed because they are not used when building the model.
unused_cols = ['CustomerID', 'Name']
df = df.drop(columns=unused_cols)

In [None]:
# Target vector: the Churn column. Feature matrix: every other column of df.
target_col = 'Churn'
y = df[target_col]
X = df.drop(columns=[target_col])

print(X)
print(y)


       Age  Subscription_Length_Months  Monthly_Bill  Total_Usage_GB  \
0       63                          17         73.36             236   
1       62                           1         48.76             172   
2       24                           5         85.47             460   
3       36                           3         97.94             297   
4       46                          19         58.14             266   
...    ...                         ...           ...             ...   
99995   33                          23         55.13             226   
99996   62                          19         61.65             351   
99997   64                          17         96.11             251   
99998   51                          20         49.25             434   
99999   27                          19         76.57             173   

       Gender_Male  Location_Houston  Location_Los Angeles  Location_Miami  \
0                1                 0                     

Splitting the data into training and testing sets

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# shuffle, and therefore the split, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Feature Engineering and Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler


def add_engineered_features(frame):
    """Return a copy of ``frame`` with the two engineered columns added.

    Added columns:
      * ``Tenure_Months``: ``Subscription_Length_Months - Age``
      * ``Interaction_Per_Month``: ``Total_Usage_GB / Subscription_Length_Months``

    The input frame is not modified, so the cell can be re-run safely.
    """
    enriched = frame.copy()
    # NOTE(review): this subtracts Age from a month count. The definition is
    # kept exactly as originally written -- confirm it is the intended feature.
    enriched['Tenure_Months'] = enriched['Subscription_Length_Months'] - enriched['Age']
    # A subscription length of 0 would make the ratio infinite, which
    # StandardScaler rejects; such rows are treated as one month of usage.
    months = enriched['Subscription_Length_Months'].replace(0, 1)
    enriched['Interaction_Per_Month'] = enriched['Total_Usage_GB'] / months
    return enriched


# The train/test frames were derived from df BEFORE this cell, so adding the
# columns to df alone would never reach the models. Apply the same
# transformation to df (as before) and to both split frames.
df = add_engineered_features(df)
X_train = add_engineered_features(X_train)
X_test = add_engineered_features(X_test)

# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model Building (Trying Different Models)

Logistic Regression Classifier

In [None]:
# Baseline model: logistic regression on the scaled features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Train the model
lg_model = LogisticRegression()
lg_model.fit(X_train_scaled, y_train)

# Hard class predictions, used by the threshold-based metrics below.
y_pred = lg_model.predict(X_test_scaled)
# Predicted probability of the class with the greater label (classes_[1]).
# ROC AUC is a ranking metric and must be computed from these scores;
# feeding it the 0/1 predictions evaluates only a single threshold.
y_proba = lg_model.predict_proba(X_test_scaled)[:, 1]

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1 Score: {:.2f}".format(f1))
print("ROC AUC: {:.2f}".format(roc_auc))


Accuracy: 0.50
Precision: 0.50
Recall: 0.38
F1 Score: 0.43
ROC AUC: 0.50


# Model Optimization

Grid Search CV

In [None]:
# Fine-tune the model parameters (hyperparameter tuning):
# 5-fold cross-validated grid search over C for logistic regression.
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# With the default refit=True, GridSearchCV has already refitted the best
# parameter setting on the whole training set, so best_estimator_ is ready
# to use and a second fit() call would only repeat that work.
best_model = grid_search.best_estimator_
best_model


In [None]:
from sklearn.metrics import accuracy_score

# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

accuracy_line = "Accuracy: {:.2f}".format(accuracy)
print(accuracy_line)


Accuracy: 0.50


Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model. The forest is built with random
# bootstrapping and feature sampling, so random_state is fixed (same seed as
# the train/test split) to make the reported accuracy reproducible.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_scaled, y_train)

# Evaluate the model's accuracy on the test split
y_pred_rf = random_forest_model.predict(X_test_scaled)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_rf

0.49515

Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Build a support vector classifier with default settings and train it;
# fit() returns the fitted estimator, so construction and training are chained.
svm_model = SVC().fit(X_train_scaled, y_train)

# Accuracy of the SVM on the test split
y_pred_svm = svm_model.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_true=y_test, y_pred=y_pred_svm)
accuracy_svm

0.4993

XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted tree classifier with default settings, trained on the
# scaled training split.
xgb_model = XGBClassifier()
xgb_model.fit(X=X_train_scaled, y=y_train)

# Accuracy of the XGBoost model on the test split
y_pred_xgb = xgb_model.predict(X_test_scaled)
accuracy_xgb = accuracy_score(y_true=y_test, y_pred=y_pred_xgb)
accuracy_xgb

0.50055

KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier with default settings and train it;
# fit() returns the fitted estimator, so construction and training are chained.
knn_model = KNeighborsClassifier().fit(X_train_scaled, y_train)

# Accuracy of the K-NN model on the test split
y_pred_knn = knn_model.predict(X_test_scaled)
accuracy_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
accuracy_knn

0.4995

Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import set_random_seed
from sklearn.metrics import accuracy_score

# Seed the Python, NumPy and backend random generators so that weight
# initialisation and batch shuffling, and therefore the reported accuracy,
# are repeatable between runs.
set_random_seed(42)

# Feed-forward binary classifier: two ReLU hidden layers (64 and 32 units)
# followed by a single sigmoid output unit.
model = Sequential()
model.add(Dense(64, input_dim=X_train_scaled.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train_scaled, y_train, epochs=50, batch_size=32)

# Evaluate the model's accuracy on the test set.
# predict() returns one sigmoid output per row in an (n, 1) array; threshold
# it at 0.5 and flatten to 1-D so the labels have the same shape as y_test.
y_pred = model.predict(X_test_scaled)
y_pred = (y_pred > 0.5).astype(int).ravel()

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}".format(accuracy))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.50


In [None]:
import joblib

# The logistic regression model was trained on StandardScaler-transformed
# features, so the fitted scaler is persisted alongside it; without it, new
# data cannot be put on the scale the saved model expects.
joblib.dump(scaler, 'standard_scaler.pkl')

# Save the trained logistic regression model to a file
joblib.dump(lg_model, 'logistic_regression_model.pkl')


['logistic_regression_model.pkl']