<a href="https://colab.research.google.com/github/thetestcoder/ml-projects/blob/main/Ml_2_Customer_Churn_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir ~/.kaggle

!cp kaggle.json ~/.kaggle

!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Download the customer-churn dataset archive with the Kaggle CLI
# (the archive is saved as customer-churn-dataset.zip).
!kaggle datasets download muhammadshahidazeem/customer-churn-dataset

Downloading customer-churn-dataset.zip to /content
 60% 4.00M/6.66M [00:00<00:00, 6.13MB/s]
100% 6.66M/6.66M [00:01<00:00, 6.79MB/s]


In [3]:
!unzip customer-churn-dataset.zip

Archive:  customer-churn-dataset.zip
  inflating: customer_churn_dataset-testing-master.csv  
  inflating: customer_churn_dataset-training-master.csv  


In [4]:
# Read the training and testing splits extracted from the Kaggle archive
import pandas as pd

train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'customer_churn_dataset-training-master.csv',
        'customer_churn_dataset-testing-master.csv',
    )
)

# Preview the first rows of the training split
train_df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric training columns
train_df.describe()

Unnamed: 0,CustomerID,Age,Tenure,Usage Frequency,Support Calls,Payment Delay,Total Spend,Last Interaction,Churn
count,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0,440832.0
mean,225398.667955,39.373153,31.256336,15.807494,3.604437,12.965722,631.616223,14.480868,0.567107
std,129531.91855,12.442369,17.255727,8.586242,3.070218,8.258063,240.803001,8.596208,0.495477
min,2.0,18.0,1.0,1.0,0.0,0.0,100.0,1.0,0.0
25%,113621.75,29.0,16.0,9.0,1.0,6.0,480.0,7.0,0.0
50%,226125.5,39.0,32.0,16.0,3.0,12.0,661.0,14.0,1.0
75%,337739.25,48.0,46.0,23.0,6.0,19.0,830.0,22.0,1.0
max,449999.0,65.0,60.0,30.0,10.0,30.0,1000.0,30.0,1.0


In [6]:
# Column dtypes and non-null counts. Each column reports one value fewer than the
# number of rows, so the training file likely contains a single empty row.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440833 entries, 0 to 440832
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   CustomerID         440832 non-null  float64
 1   Age                440832 non-null  float64
 2   Gender             440832 non-null  object 
 3   Tenure             440832 non-null  float64
 4   Usage Frequency    440832 non-null  float64
 5   Support Calls      440832 non-null  float64
 6   Payment Delay      440832 non-null  float64
 7   Subscription Type  440832 non-null  object 
 8   Contract Length    440832 non-null  object 
 9   Total Spend        440832 non-null  float64
 10  Last Interaction   440832 non-null  float64
 11  Churn              440832 non-null  float64
dtypes: float64(9), object(3)
memory usage: 40.4+ MB


In [7]:
#data processing and feature engineering
from sklearn.preprocessing import OneHotEncoder, StandardScaler


categorical_colums = ['Gender', 'Subscription Type', 'Contract Length']
numerical_columns = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend']

# train_df.info() reports 440833 rows but only 440832 non-null values per column.
# OneHotEncoder treats NaN as a category of its own, so encoding before removing
# incomplete rows would create spurious "<column>_nan" features. The index is reset
# so that the positional alignment used by pd.concat(axis=1) below stays valid.
model_input_columns = categorical_colums + numerical_columns
train_df = train_df.dropna(subset=model_input_columns).reset_index(drop=True)
test_df = test_df.dropna(subset=model_input_columns).reset_index(drop=True)

# Fit the encoder on the training split only and reuse it for the test split,
# so both frames end up with the same encoded columns.
encoder = OneHotEncoder(drop='first')
encoder.fit(train_df[categorical_colums])
encoded_feature_names = encoder.get_feature_names_out(categorical_colums)

encoded_features = pd.DataFrame(
    encoder.transform(train_df[categorical_colums]).toarray(),
    columns=encoded_feature_names,
    index=train_df.index,
)
test_encoded_features = pd.DataFrame(
    encoder.transform(test_df[categorical_colums]).toarray(),
    columns=encoded_feature_names,
    index=test_df.index,
)

# Replace the raw categorical columns with their one-hot encoded counterparts.
train_df = pd.concat([train_df.drop(columns=categorical_colums), encoded_features], axis=1)
test_df = pd.concat([test_df.drop(columns=categorical_colums), test_encoded_features], axis=1)

# Standardise the numeric columns with statistics learned from the training split only.
scaler = StandardScaler()
scaler.fit(train_df[numerical_columns])

train_df[numerical_columns] = scaler.transform(train_df[numerical_columns])
test_df[numerical_columns] = scaler.transform(test_df[numerical_columns])



In [8]:
#date time / interaction feature engineering

def dateTimeFeatureEngineering(df):
  """Add interaction and spend-ratio features to ``df`` (modified in place).

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the columns 'Last Interaction', 'Total Spend' and 'Tenure'.

  Returns
  -------
  pandas.DataFrame
    The same ``df`` object with 'Average Spend per Month' added.
    * If 'Last Interaction' holds calendar dates (non-numeric dtype), it is
      replaced by 'Last Interaction_Year', '_Month' and '_Day' columns.
    * If 'Last Interaction' is numeric, it is kept unchanged as a feature.

  Notes
  -----
  'Average Spend per Month' is the plain ratio 'Total Spend' / 'Tenure'; rows
  whose 'Tenure' is 0 yield inf/NaN, and the ratio is only interpretable when
  both columns are still in their original units.
  """
  last_interaction = df['Last Interaction']

  # pd.to_datetime reads plain numbers as nanoseconds since 1970-01-01, so a
  # numeric column would collapse into constant Year/Month/Day values
  # (1970/1/1) and the real signal would be dropped. Only derive calendar
  # parts when the column is not numeric.
  has_calendar_dates = not pd.api.types.is_numeric_dtype(last_interaction)

  if has_calendar_dates:
    parsed = pd.to_datetime(last_interaction)
    df['Last Interaction_Year'] = parsed.dt.year
    df['Last Interaction_Month'] = parsed.dt.month
    df['Last Interaction_Day'] = parsed.dt.day

  df['Average Spend per Month'] = df['Total Spend'] / df['Tenure']

  if has_calendar_dates:
    # The raw date column is superseded by its year/month/day parts.
    df.drop(['Last Interaction'], axis=1, inplace=True)

  return df


# Run the same feature-engineering step on both splits so their columns stay aligned.
train_df, test_df = (
    dateTimeFeatureEngineering(split) for split in (train_df, test_df)
)




In [9]:
# Rows without a target label cannot be used for training or evaluation.
train_df = train_df.dropna(subset=['Churn'])
test_df = test_df.dropna(subset=['Churn'])

# CustomerID is a row identifier, not a customer attribute. Keeping it as a
# feature lets the model key on row position (the training labels appear to be
# ordered, see y_train.values), and its large magnitude would dominate the
# neighbour distances used by SMOTE.
non_feature_columns = ['Churn', 'CustomerID']

X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df['Churn'].astype(int)

X_test = test_df.drop(columns=non_feature_columns)
y_test = test_df['Churn'].astype(int)

In [10]:
# Inspect the integer-encoded training labels
y_train.values

array([1, 1, 1, ..., 0, 0, 0])

In [11]:
# Model selection and training: imports and class re-balancing

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV

# Generate synthetic samples with SMOTE; the seed is fixed for reproducibility.
smote = SMOTE(random_state=42, sampling_strategy='auto')
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train,
    y_train.values,
)

In [None]:

# Base estimator whose hyperparameters are tuned below
clf = RandomForestClassifier(random_state=42)

# Search space: 3 x 3 x 3 x 3 = 81 parameter combinations
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 2-fold cross-validated search, using all available cores.
# NOTE(review): the data was resampled with SMOTE before the folds are created,
# so validation folds may contain synthetic points derived from training rows;
# cross-validation scores are probably optimistic - confirm before relying on them.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=2,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X_train_resampled, y_train_resampled)

Fitting 2 folds for each of 81 candidates, totalling 162 fits


In [None]:

# Retrieve and display the parameter combination selected by the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)


In [None]:
# GridSearchCV (default refit=True) has already retrained the best parameter
# combination on the full resampled training set with the same random_state,
# so reuse that fitted model instead of paying for a second identical fit.
rf_classifier = grid_search.best_estimator_
rf_classifier


In [None]:
# Predict churn labels for the held-out test split
y_pred = rf_classifier.predict(X_test)

# Overall share of correctly classified customers
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision/recall/F1, followed by the raw confusion counts
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", matrix, sep="\n")