<h1> Assignment 1 - Task 2 Solution </h1>

Student name: Ngo Vu Anh <br>
UOW ID:    <br>
Tutorial Group: T02   

<h1> Import tools </h1>

In [1]:
import numpy as np
import pandas as pd
from collections import Counter


<h1> Import the data </h1>

In [2]:
# Locations of the raw churn CSV files (relative to the working directory)
test_file = 'customer_churn_dataset-testing-master.csv'
train_file = 'customer_churn_dataset-training-master.csv'

# Read both files; test_df keeps the untouched testing file
test_df, train_df = (pd.read_csv(csv_path) for csv_path in (test_file, train_file))

# Stack the training rows on top of the testing rows under a fresh 0..n-1
# index, then keep a reproducible random portion of 50,000 rows
train_df = (
    pd.concat([train_df, test_df], axis=0, ignore_index=True)
    .sample(n=50000, random_state=42)
)

<h1> Data preprocessing </h1>

In [3]:
# Discard every row that contains at least one missing value, in both frames
train_df, test_df = (frame.dropna() for frame in (train_df, test_df))

In [4]:
# Z-score normalization for Last Interaction.
# The mean and standard deviation are estimated on train_df only and reused
# for test_df, so a given normalized value means the same thing in both
# frames (scaling each frame with its own statistics puts them on
# different scales).
last_interaction_mean = train_df['Last Interaction'].mean()
last_interaction_std = train_df['Last Interaction'].std()

normalized_last_interaction = (train_df['Last Interaction'] - last_interaction_mean) / last_interaction_std
normalized_last_interaction_test = (test_df['Last Interaction'] - last_interaction_mean) / last_interaction_std

# Replace the previous Last Interaction with the normalized one
train_df['Last Interaction'] = normalized_last_interaction
test_df['Last Interaction'] = normalized_last_interaction_test

Z-score normalization is applied to Last Interaction because its distribution is not balanced.

In [5]:
# Create 5 bins for attribute Total Spend

bin_labels = ['Bin 1', 'Bin 2', 'Bin 3', 'Bin 4', 'Bin 5']

# Learn the equal-width bin edges from the training data only, so that a
# label such as 'Bin 2' covers the same spend range in both frames.
train_total_spend, total_spend_edges = pd.cut(
    train_df['Total Spend'], bins=len(bin_labels), labels=bin_labels, retbins=True
)
train_df['Total Spend'] = train_total_spend

# Open the outer edges so test values outside the training range fall into
# the first/last bin instead of becoming NaN.
total_spend_edges[0], total_spend_edges[-1] = -np.inf, np.inf
test_df['Total Spend'] = pd.cut(test_df['Total Spend'], bins=total_spend_edges, labels=bin_labels)

The values of Total Spend vary over a large range, which is why we divide them into 5 bins for better classification.

In [6]:
# Preview the first rows of train_df after normalization and binning
train_df.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
89623,92144.0,54.0,Male,5.0,20.0,1.0,3.0,Basic,Quarterly,Bin 2,0.041571,1.0
486640,45808.0,30.0,Female,33.0,21.0,10.0,27.0,Basic,Monthly,Bin 2,0.505844,1.0
405193,412308.0,48.0,Female,8.0,20.0,3.0,10.0,Basic,Quarterly,Bin 5,-0.074497,0.0
453212,12380.0,34.0,Male,30.0,5.0,1.0,7.0,Premium,Monthly,Bin 3,-0.306634,0.0
100779,103786.0,24.0,Female,26.0,2.0,4.0,23.0,Standard,Monthly,Bin 4,-0.074497,1.0


In [7]:
# Remove the columns that are not used for training:
#   - CustomerID: unnecessary column
#   - Gender: dropped to ensure equality
train_df = train_df.drop(columns=['CustomerID', 'Gender'])
test_df = test_df.drop(columns=['CustomerID', 'Gender'])


<h1> Prepare decision tree classifier </h1>

<h2> Tree class </h2> 

In [8]:
class Node:
    """One node of the decision tree.

    Internal nodes carry ``feature``, ``threshold``, ``left`` and ``right``;
    leaf nodes carry ``value``.

    Parameters
    ----------
    feature : int, optional
        Positional index of the column this node splits on.
    threshold : optional
        Rows with ``column <= threshold`` go to ``left``, the rest to ``right``.
    left, right : Node, optional
        Child nodes.
    gain : float, default 0
        Score of the chosen split under the tree's criterion.
    y : iterable, optional
        Labels of the training rows that reached this node.
    value : optional, keyword-only
        Label predicted by a leaf node.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, gain=0, y=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
        self.gain = gain
        self.y = y
        # Counter(None) is an empty counter, so nodes built without labels still work.
        self.count = Counter(y)

    def is_leaf_node(self):
        """Return True when the node stores a prediction (``value`` is not None)."""
        return self.value is not None


class DecisionTree:
    """Binary decision-tree classifier for pandas inputs.

    Every split has the form ``column <= threshold``. Candidate thresholds are
    the distinct values of a column, and the split with the highest score
    under ``criterion`` is kept.

    Parameters
    ----------
    min_samples_split : int, default 2
        A node with fewer rows than this becomes a leaf.
    max_depth : int, default 100
        Nodes at this depth become leaves.
    depth : int, optional
        Stored on the instance as ``self.depth`` (0 when falsy); not used by
        the fitting code in this class.
    n_features : int, optional
        Number of columns sampled as split candidates at each node. All
        columns are used when falsy.
    criterion : str, default "GINI"
        ``"GINI"`` selects Gini gain, ``"GAIN_RATIO"`` selects gain ratio and
        any other value selects information gain.
    random_state : int, optional
        Seed for the feature sampling. ``None`` (default) draws from NumPy's
        global random state.
    """

    def __init__(self, min_samples_split=2, max_depth=100, depth=None, n_features=None, criterion="GINI",
                 random_state=None):
        self.labels = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None
        self.depth = depth if depth else 0
        self.criterion = criterion
        self.random_state = random_state
        self._rng = self._make_rng(random_state)

    @staticmethod
    def _make_rng(random_state):
        """Return the object whose ``choice`` method is used to sample features."""
        if random_state is None:
            # Global NumPy random state: honours any np.random.seed() call made by the user.
            return np.random
        return np.random.RandomState(random_state)

    def fit(self, X, y):
        """Grow the tree from a feature DataFrame ``X`` and a label Series ``y``.

        ``X`` and ``y`` are indexed positionally with ``.iloc``. The column
        names of ``X`` are kept in ``self.labels`` for ``print_tree``.
        """
        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.labels = X.columns
        # Re-create the sampler so that repeated fits with a seed give the same tree.
        self._rng = self._make_rng(self.random_state)
        self.root = self._grow_tree(X, y)

    def _make_leaf(self, y):
        """Build a leaf predicting the most common label in ``y``."""
        return Node(value=self._most_common_label(y), y=y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the rows in ``X``/``y``."""
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))

        if depth >= self.max_depth or n_labels <= 1 or n_samples < self.min_samples_split:
            return self._make_leaf(y)

        feat_idxs = self._rng.choice(n_features, self.n_features, replace=False)
        best_feature, best_threshold, best_gain = self._best_split(X, y, feat_idxs)

        # No candidate separates the rows (every sampled column is constant):
        # stop here instead of recursing on an identical copy of the data.
        if best_feature is None:
            return self._make_leaf(y)

        left_idxs, right_idxs = self._split(X.iloc[:, best_feature], best_threshold)
        # An empty child would become a leaf that predicts None.
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return self._make_leaf(y)

        left = self._grow_tree(X.iloc[left_idxs, :], y.iloc[left_idxs], depth + 1)
        right = self._grow_tree(X.iloc[right_idxs, :], y.iloc[right_idxs], depth + 1)
        return Node(best_feature, best_threshold, left, right, best_gain, y)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold, gain)`` of the best split.

        ``feature_index`` and ``threshold`` are None when no column in
        ``feat_idxs`` has at least two distinct values.
        """
        if self.criterion == "GINI":
            score = self._gini_gain
        elif self.criterion == "GAIN_RATIO":
            score = self._gain_ratio
        else:
            score = self._information_gain

        best_gain = -1
        best_split_idx, best_split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X.iloc[:, feat_idx]
            # np.unique returns sorted values; "<= largest value" would send
            # every row to the left child, so the last value is not a candidate.
            thresholds = np.unique(X_column.to_numpy())[:-1]

            for thr in thresholds:
                gain = score(y, X_column, thr)
                if gain > best_gain:
                    best_gain = gain
                    best_split_idx = feat_idx
                    best_split_threshold = thr

        return best_split_idx, best_split_threshold, best_gain

    def _gini_gain(self, y, X_column, threshold):
        """Return the Gini impurity of ``y`` minus the size-weighted impurity of the two children."""
        left_idxs, right_idxs = self._split(X_column, threshold)
        n_left, n_right = len(left_idxs), len(right_idxs)
        n_total = n_left + n_right
        if n_total == 0:
            return 0.0

        gini_left = self._get_gini(y.iloc[left_idxs])
        gini_right = self._get_gini(y.iloc[right_idxs])
        w_gini = (n_left / n_total) * gini_left + (n_right / n_total) * gini_right

        return self._get_gini(y) - w_gini

    def _get_gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` over all distinct labels in ``y``.

        Works for any label values, not only 0 and 1. Returns 0.0 for empty ``y``.
        """
        _, counts = np.unique(np.asarray(y), return_counts=True)
        n = counts.sum()
        if n == 0:
            return 0.0
        proportions = counts / n
        return float(1 - np.sum(proportions ** 2))

    def _gini_impurity(self, class0_count, class1_count):
        """Return the two-class Gini impurity from the two class counts.

        A count of None is treated as 0. Returns 0.0 when both counts are 0.
        Kept for callers that already have the two counts; ``_get_gini``
        handles any number of classes.
        """
        if class0_count is None:
            class0_count = 0
        if class1_count is None:
            class1_count = 0
        n = class0_count + class1_count

        if n == 0:
            return 0.0

        p0 = class0_count / n
        p1 = class1_count / n
        gini = 1 - (p0 ** 2 + p1 ** 2)
        return gini

    def _entropy_reduction(self, y, left_idxs, right_idxs):
        """Return the entropy of ``y`` minus the size-weighted entropy of the two children."""
        n = len(y)
        n_left, n_right = len(left_idxs), len(right_idxs)
        e_left, e_right = self._entropy(y.iloc[left_idxs]), self._entropy(y.iloc[right_idxs])
        child_entropy = (n_left / n) * e_left + (n_right / n) * e_right
        return self._entropy(y) - child_entropy

    def _gain_ratio(self, y, X_column, threshold):
        """Return information gain divided by the split information; 0 when a child is empty."""
        left_idxs, right_idxs = self._split(X_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(y)
        n_left, n_right = len(left_idxs), len(right_idxs)
        # Both children are non-empty here, so both logarithms are finite
        # and split_info is strictly positive.
        split_info = -(n_left / n * np.log(n_left / n) + n_right / n * np.log(n_right / n))

        return self._entropy_reduction(y, left_idxs, right_idxs) / split_info

    def _information_gain(self, y, X_column, threshold):
        """Return the entropy reduction achieved by the split; 0 when a child is empty."""
        left_idxs, right_idxs = self._split(X_column, threshold)
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0
        return self._entropy_reduction(y, left_idxs, right_idxs)

    def _split(self, X_column, split_threshold):
        """Return positional indices of rows with value ``<=`` and ``>`` the threshold."""
        values = X_column.to_numpy()
        left_idxs = np.argwhere(values <= split_threshold).flatten()
        right_idxs = np.argwhere(values > split_threshold).flatten()
        return left_idxs, right_idxs

    def _entropy(self, y):
        """Return the entropy of the labels in ``y`` (natural logarithm).

        Class frequencies come from ``np.unique`` so float or string labels
        are accepted as well as non-negative integers. Returns 0.0 for empty ``y``.
        """
        _, counts = np.unique(np.asarray(y), return_counts=True)
        n = counts.sum()
        if n == 0:
            return 0.0
        ps = counts / n  # every entry is > 0 because np.unique only reports labels that occur
        return float(-np.sum(ps * np.log(ps)))

    def _most_common_label(self, y):
        """Return the most frequent label in ``y``, or None when ``y`` is empty."""
        counter = Counter(y)
        if not counter:
            return None
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """Return a NumPy array with one predicted label per row of the DataFrame ``X``.

        Columns are read by position, so ``X`` must have its columns in the
        same order as the frame passed to ``fit``.
        """
        return np.array([self._traverse_tree(x, self.root) for _, x in X.iterrows()])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for the single row ``x`` and return its value."""
        if node.is_leaf_node() or node.feature is None:
            return node.value
        # node.feature is a position, not a label: use .iloc on a Series so the
        # lookup does not depend on the deprecated positional fallback of Series[int].
        feature_value = x.iloc[node.feature] if hasattr(x, "iloc") else x[node.feature]
        if feature_value <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def counter_to_str(self, counter: Counter):
        """Format a Counter as ``(key: count, key: count, ...)``."""
        inside = [f"{key}: {value}" for key, value in counter.items()]
        return f"({', '.join(inside)})"

    def print_info(self, node, depth):
        """Print the split of ``node`` and, recursively, of its children.

        Nodes without a split feature (leaves) print nothing.
        """
        if node.feature is None:
            return

        preamble0 = ' ' * depth * 2 + ('\t\t' if depth > 0 else '')

        print(f"{preamble0} {self.labels[node.feature]} <= {node.threshold} ? {node.gain}")

        self.print_info(node.left, depth + 1)
        self.print_info(node.right, depth + 1)

    def print_tree(self):
        """Print every split of the fitted tree, starting at the root."""
        self.print_info(self.root, 0)


<h2> Create hold-out train/test split </h2>

In [9]:
# Hold-out split: 60% of the rows (reproducible sample) are used for fitting,
# the remaining rows for evaluation
train = train_df.sample(frac=0.6, random_state=40)
test = train_df.drop(index=train.index)

# Features are every column except the last one; the target is Churn
X_train, X_test = train.iloc[:, :-1], test.iloc[:, :-1]
y_train, y_test = train['Churn'], test['Churn']


In [20]:
# Decision tree scored by information gain: a criterion other than "GINI" or
# "GAIN_RATIO" (here the empty string) selects information gain
clf = DecisionTree(criterion="", max_depth=12)
clf.fit(X_train, y_train)

# Predict the held-out rows first, then the training rows
predictions_info_gain, predictions_info_gain_train = (
    clf.predict(features) for features in (X_test, X_train)
)

print("Root")
clf.print_tree()

Root
 Support Calls <= 4.0 ? 0.16194190220629678
  		 Total Spend <= Bin 2 ? 0.13804033019010176
    		 Payment Delay <= 20.0 ? 0.006851750935880474
      		 Support Calls <= 2.0 ? 0.0029385679758466288
        		 Usage Frequency <= 24.0 ? 0.0036466548597930415
          		 Payment Delay <= 4.0 ? 0.003894856417806014
            		 Age <= 37.0 ? 0.01811232945409269
              		 Age <= 25.0 ? 0.021353233114417836
                		 Tenure <= 53.0 ? 0.041930315256836326
                  		 Last Interaction <= -1.583385684635876 ? 0.0403535131167162
                    		 Tenure <= 41.0 ? 0.056654519052554975
                      		 Usage Frequency <= 4.0 ? 0.057320030671008326
                		 Contract Length <= Monthly ? 0.04790410018040858
                  		 Contract Length <= Annual ? 0.09592188591380724
                    		 Last Interaction <= 0.7379810586126535 ? 0.08806631707443069
                      		 Payment Delay <= 2.0 ? 0.08573783404972213
                    	

In [15]:
# Function to check accuracy

def accuracy(y_test, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Labels are compared position by position after conversion to NumPy
    arrays, so pandas Series (whatever their index), NumPy arrays and plain
    lists can be mixed freely.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    float
        Share of positions where both labels are equal; ``nan`` when the
        inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true_arr = np.asarray(y_test)
    y_pred_arr = np.asarray(y_pred)
    if y_true_arr.shape != y_pred_arr.shape:
        raise ValueError(
            f"y_test and y_pred must have the same shape, got {y_true_arr.shape} and {y_pred_arr.shape}"
        )
    if y_true_arr.size == 0:
        return float("nan")
    return float(np.mean(y_true_arr == y_pred_arr))


In [21]:
# Accuracy of the information-gain tree on the rows it was fitted on ...
acc_train = accuracy(y_train, predictions_info_gain_train)
print("Training set accuracy: ", acc_train)

# ... and on the held-out rows
acc = accuracy(y_test, predictions_info_gain)
print("Test set accuracy: ", acc)

Training set accuracy:  0.9403
Test set accuracy:  0.90585


In [25]:
# Decision tree scored by gain ratio, limited to depth 11
clf_gain = DecisionTree(criterion="GAIN_RATIO", max_depth=11)
clf_gain.fit(X_train, y_train)

# Predict the held-out rows first, then the training rows
predictions_gain_ratio, predictions_gain_ratio_train = (
    clf_gain.predict(features) for features in (X_test, X_train)
)


In [27]:

# Decision tree scored by the Gini index, limited to depth 12
clf_gini = DecisionTree(criterion="GINI", max_depth=12)
clf_gini.fit(X_train, y_train)

# Predict the held-out rows first, then the training rows
predictions_gini, predictions_gini_train = (
    clf_gini.predict(features) for features in (X_test, X_train)
)


In [26]:
# Accuracy of the gain-ratio tree on the rows it was fitted on ...
acc_train = accuracy(y_train, predictions_gain_ratio_train)
print("Training set accuracy: ", acc_train)

# ... and on the held-out rows
acc = accuracy(y_test, predictions_gain_ratio)
print("Test set accuracy: ", acc)

Training set accuracy:  0.9297
Test set accuracy:  0.915


In [24]:
# Accuracy of the Gini-index tree on the rows it was fitted on ...
acc_train = accuracy(y_train, predictions_gini_train)
print("Training set accuracy: ", acc_train)

# ... and on the held-out rows
acc = accuracy(y_test, predictions_gini)
print("Test set accuracy: ", acc)

Training set accuracy:  0.9461
Test set accuracy:  0.9005


## Voting function

In [30]:
def ensemble_voting(predictions_list):
    """Combine several prediction sequences by majority vote.

    Parameters
    ----------
    predictions_list : sequence of sequences
        One sequence of predicted labels per model, all of the same length
        and in the same row order. Labels must be hashable.

    Returns
    -------
    list
        For every position, the label predicted by the most models. When
        several labels are tied, the one from the earliest model in
        ``predictions_list`` wins, so the result is deterministic.
        An empty ``predictions_list`` gives an empty list.

    Raises
    ------
    ValueError
        If the prediction sequences do not all have the same length.
    """
    if len(predictions_list) == 0:
        return []

    n_points = len(predictions_list[0])
    if any(len(pred) != n_points for pred in predictions_list):
        raise ValueError("All prediction sequences must have the same length")

    ensemble_predictions = []
    # zip(*...) yields, for each data point, the tuple of votes in model order
    for votes in zip(*predictions_list):
        # most_common orders equal counts by first occurrence, which makes
        # tie-breaking independent of hash/set ordering.
        ensemble_predictions.append(Counter(votes).most_common(1)[0][0])

    return ensemble_predictions

In [32]:
# Majority vote over the test-set predictions of the Gini, gain-ratio and
# information-gain trees
ensemble_predictions = ensemble_voting([predictions_gini, predictions_gain_ratio, predictions_info_gain])

# Check accuracy of the ensemble, reusing the accuracy() helper defined
# earlier in the notebook instead of re-defining it here
acc = accuracy(y_test, ensemble_predictions)

print("Best accuracy: ", acc)

Best accuracy:  0.9146


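The voting function combines the predictions of the three decision trees (Gini index, gain ratio and information gain) by majority vote for each test sample. The resulting ensemble reaches a test accuracy of about 91.5% (0.9146), which is higher than the Gini (0.9005) and information-gain (0.90585) trees and very close to the gain-ratio tree (0.915).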