Level 3 Task 1: Predictive Modeling

In this notebook, I built a predictive model to find which customers are likely to churn.  
I tested logistic regression, decision tree, and random forest classifiers.  
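After evaluating each model's accuracy, precision, recall, and F1 score, the Random Forest gave the most reliable results (accuracy 0.925, F1 0.667), although it still caught only about half of the customers who actually churned.  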
This analysis can help a company take early action to retain their valuable customers.

In [30]:
# Importing Libraries
import pandas as pd

In [31]:
# Read the churn dataset from the CSV file that sits next to this notebook
DATA_PATH = 'churn-bigml-80.csv'
df = pd.read_csv(DATA_PATH)

In [32]:
# Preview the first five rows to get a feel for the columns and their values
df.head(n=5)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [5]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
# The object-typed columns listed here are the ones that still need encoding
# before they can be passed to a model.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   State                   2666 non-null   object 
 1   Account length          2666 non-null   int64  
 2   Area code               2666 non-null   int64  
 3   International plan      2666 non-null   object 
 4   Voice mail plan         2666 non-null   object 
 5   Number vmail messages   2666 non-null   int64  
 6   Total day minutes       2666 non-null   float64
 7   Total day calls         2666 non-null   int64  
 8   Total day charge        2666 non-null   float64
 9   Total eve minutes       2666 non-null   float64
 10  Total eve calls         2666 non-null   int64  
 11  Total eve charge        2666 non-null   float64
 12  Total night minutes     2666 non-null   float64
 13  Total night calls       2666 non-null   int64  
 14  Total night charge      2666 non-null   

In [6]:
# Count the missing values in each column (all zeros means nothing to impute)
df.isna().sum()

Unnamed: 0,0
State,0
Account length,0
Area code,0
International plan,0
Voice mail plan,0
Number vmail messages,0
Total day minutes,0
Total day calls,0
Total day charge,0
Total eve minutes,0


In [33]:
# Encode the target column 'Churn' as 1 (True) / 0 (False) so the models
# receive a numeric label
churn_as_int = df['Churn'].astype(int)
df['Churn'] = churn_as_int

In [36]:
# Collect the names of every text (object-dtype) column.
# Nothing is converted here: this only records which columns still hold
# strings and therefore need encoding in the next step.
yes_no_columns = df.select_dtypes(include=['object']).columns

In [37]:
# One-hot encode the remaining non-numeric columns into indicator columns.
# drop_first=True leaves out the first level of each category, since it is
# implied when all of that category's other indicators are off.
df = pd.get_dummies(data=df, drop_first=True)

In [38]:
# Double-check the encoded dataset.
# Ending the cell with the expression (instead of print) lets the notebook
# render the frame as a table rather than as wrapped plain text.
df.head()

   Account length  Area code  Number vmail messages  Total day minutes  \
0             128        415                     25              265.1   
1             107        415                     26              161.6   
2             137        415                      0              243.4   
3              84        408                      0              299.4   
4              75        415                      0              166.7   

   Total day calls  Total day charge  Total eve minutes  Total eve calls  \
0              110             45.07              197.4               99   
1              123             27.47              195.5              103   
2              114             41.38              121.2              110   
3               71             50.90               61.9               88   
4              113             28.34              148.3              122   

   Total eve charge  Total night minutes  ...  State_TX  State_UT  State_VA  \
0             16.78

In [39]:
# Cast every boolean column to integers so that True/False become 1/0.
# Build the {column: int} mapping first, then apply it in a single astype call.
bool_to_int = {}
for col, dtype in df.dtypes.items():
    if dtype == 'bool':
        bool_to_int[col] = int
df = df.astype(bool_to_int)

In [41]:
# Separate the label we want to predict from the predictor columns
y = df['Churn']
X = df.drop(columns=['Churn'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Logistic Regression
# Fitted here on the raw, unscaled features, so the optimiser can reach
# max_iter without converging and emit a ConvergenceWarning; the model is
# refitted on standardised features further down.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

# Decision Tree
# Tree building involves random tie-breaking, so pin the seed to make
# re-runs produce the same model.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)

# Random Forest
# Bootstrap sampling and feature sub-sampling are random; the seed keeps the
# reported metrics reproducible across kernel restarts.
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
from sklearn.preprocessing import StandardScaler

# Split the raw features first. With the same random_state this selects the
# same rows as before, but the held-out rows are set aside before any
# statistics are computed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale the features (important for Logistic Regression).
# The scaler learns its mean and standard deviation from the training rows
# only; fitting it on the full dataset would leak test-set information into
# training and make the evaluation optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix transformed with the training-set statistics; kept so
# that any code referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# Now fit the model again on the standardised training data
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# With logistic regression trained, fit the two tree-based models on the
# same training split. They do not need scaled inputs, but they accept them.
# Both are seeded so repeated runs give the same models.

# Decision Tree (fit returns the estimator itself, so the call can be chained)
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Random Forest
forest_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

In [47]:
# Score every fitted model on the held-out test set with accuracy, precision,
# recall and F1 so they can be compared side by side.

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

models = {
    'Logistic Regression': log_model,
    'Decision Tree': tree_model,
    'Random Forest': forest_model
}

# (printed label, scoring function) pairs, in the order they are reported
metric_functions = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

for name, model in models.items():
    y_pred = model.predict(X_test)

    print(f"{name}")
    for label, metric in metric_functions:
        # Every metric takes (true labels, predicted labels); report 3 decimals
        print(label, round(metric(y_test, y_pred), 3))
    print('-' * 30)

Logistic Regression
Accuracy: 0.841
Precision: 0.432
Recall: 0.241
F1 Score: 0.309
------------------------------
Decision Tree
Accuracy: 0.893
Precision: 0.653
Recall: 0.595
F1 Score: 0.623
------------------------------
Random Forest
Accuracy: 0.925
Precision: 0.976
Recall: 0.506
F1 Score: 0.667
------------------------------


In [48]:
# Examine the Random Forest's test-set predictions in more detail.
from sklearn.metrics import classification_report, confusion_matrix

y_pred = forest_model.predict(X_test)

# Rows are the actual classes and columns the predicted ones, so the
# off-diagonal cells count the misclassified customers.
cm = confusion_matrix(y_test, y_pred)
# Per-class precision, recall, F1 and support, plus the overall averages
report = classification_report(y_test, y_pred)

print(cm)
print(report)

[[454   1]
 [ 39  40]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96       455
           1       0.98      0.51      0.67        79

    accuracy                           0.93       534
   macro avg       0.95      0.75      0.81       534
weighted avg       0.93      0.93      0.91       534

