<a href="https://colab.research.google.com/github/viduliyanage7/ML-CW/blob/main/cw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [162]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

df = pd.read_csv("data.csv")

In [163]:
#check if dataset is balanced
df['Churn'].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
Churn,Unnamed: 1_level_1
No,73.463013
Yes,26.536987


In [164]:
#fix total_chares

import numpy as np
df['TotalCharges'] = df['TotalCharges'].replace(" ", np.nan)
df['TotalCharges'] = df['TotalCharges'].astype(float)
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].mean())

In [165]:
#remove unwanted columns
X = df.drop(columns=['customerID', 'Churn'])
y = df['Churn'].map({'Yes': 1, 'No': 0})

In [139]:
# store numerical values and categorical values
numeric_features = X.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [166]:
# encoding
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(handle_unknown='ignore')
X_cat = encoder.fit_transform(df[categorical_features]).toarray()
X_num = df[numeric_features].values

X_processed = np.hstack([X_num, X_cat])


In [167]:
#test train split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)

In [168]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

In [169]:
#check the dataset again for any imbalance
pd.Series(y_train_res).value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
Churn,Unnamed: 1_level_1
0,50.0
1,50.0


In [170]:
#build decision tree
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(random_state=1, criterion='gini')
clf.fit(X_train_res, y_train_res)

In [171]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 5, 10]
}

dt_model = DecisionTreeClassifier(random_state=42)

dt_grid = GridSearchCV(
    estimator=dt_model,
    param_grid=dt_params,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

dt_grid.fit(X_train_res, y_train_res)

print("Best Decision Tree Parameters:", dt_grid.best_params_)
print("Best Decision Tree CV Accuracy:", dt_grid.best_score_)

Best Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2}
Best Decision Tree CV Accuracy: 0.8272670286206342


In [172]:
#train
best_dt = dt_grid.best_estimator_
best_dt.fit(X_train_res, y_train_res)

In [173]:
#evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred_dt = best_dt.predict(X_test)

print("\nDecision Tree Test Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nDecision Tree Classification Report:\n", classification_report(y_test, y_pred_dt))
print("\nDecision Tree Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))


Decision Tree Test Accuracy: 0.7665010645848119

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.82      0.84      1035
           1       0.55      0.61      0.58       374

    accuracy                           0.77      1409
   macro avg       0.70      0.72      0.71      1409
weighted avg       0.78      0.77      0.77      1409


Decision Tree Confusion Matrix:
 [[850 185]
 [144 230]]


In [174]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score
)

from imblearn.over_sampling import SMOTE

np.random.seed(42)
tf.random.set_seed(42)

In [175]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

In [150]:
model = keras.Sequential([
    layers.Input(shape=(X_train_res.shape[1],)),

    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),

    layers.Dense(32, activation='relu'),

    layers.Dense(1, activation='sigmoid')
])

In [176]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        keras.metrics.AUC(name='auc')
    ]
)

In [177]:
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=10,
    mode='max',
    restore_best_weights=True
)


In [179]:
history = model.fit(
    X_train_res,
    y_train_res,
    validation_split=0.2,
    epochs=40,
    batch_size=32,
    verbose=1
)

Epoch 1/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.6581 - auc: 0.6964 - loss: 0.6345 - val_accuracy: 0.4046 - val_auc: 0.0000e+00 - val_loss: 0.7687
Epoch 2/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7461 - auc: 0.7813 - loss: 0.5412 - val_accuracy: 0.5924 - val_auc: 0.0000e+00 - val_loss: 0.7075
Epoch 3/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7329 - auc: 0.7932 - loss: 0.5282 - val_accuracy: 0.3255 - val_auc: 0.0000e+00 - val_loss: 1.2241
Epoch 4/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7416 - auc: 0.7977 - loss: 0.5261 - val_accuracy: 0.4783 - val_auc: 0.0000e+00 - val_loss: 0.9818
Epoch 5/40
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7394 - auc: 0.8019 - loss: 0.5213 - val_accuracy: 0.7252 - val_auc: 0.0000e+00 - val_loss: 0.5476
Epoch 6/40

In [180]:
y_proba_nn = model.predict(X_test).ravel()
y_pred_nn = (y_proba_nn >= 0.5).astype(int)

print("\nNeural Network Test Accuracy:", accuracy_score(y_test, y_pred_nn))
print("Neural Network ROC-AUC:", roc_auc_score(y_test, y_proba_nn))

print("\nClassification Report:\n", classification_report(y_test, y_pred_nn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_nn))


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step

Neural Network Test Accuracy: 0.7679205110007097
Neural Network ROC-AUC: 0.8425792451367901

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.78      0.83      1035
           1       0.55      0.73      0.63       374

    accuracy                           0.77      1409
   macro avg       0.72      0.76      0.73      1409
weighted avg       0.80      0.77      0.78      1409


Confusion Matrix:
 [[809 226]
 [101 273]]
