In [1]:
import numpy as np
import pandas as pd


# Building the Decision Tree model (Entropy)

In [2]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        
        # for leaf node
        self.value = value

In [3]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree ''' 
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["info_gain"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child, mode="entropy"):
        ''' function to compute information gain '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode=="gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain
    
    def entropy(self, y):
        ''' function to compute entropy '''
        
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
    
    def gini_index(self, y):
        ''' function to compute gini index '''
        
        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini
        
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        Y = list(Y)
        return max(Y, key=Y.count)
    
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        ''' function to predict new dataset '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    
    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

### Importing the data

In [4]:
train_data = pd.read_excel(r"training_data.xlsx")
test_data = pd.read_excel(r"test_data.xlsx")
print(train_data)
print(test_data)

    patron  P_smoker  Income ($K) Worked_in_service_industry Large_Meal  \
0        1  0.081745   101.257746                        yes        yes   
1        2  0.101462   227.686797                        yes         no   
2        3  0.018554   471.003974                         no         no   
3        4  0.014987   457.095231                         no        yes   
4        5  0.001330   785.029310                         no        yes   
..     ...       ...          ...                        ...        ...   
95      96  0.950000   286.314459                         no         no   
96      97  0.980000   278.206809                         no        yes   
97      98  0.967300   796.610512                         no         no   
98      99  0.970000   885.747488                         no         no   
99     100  0.990000   584.411075                         no        yes   

         %Tip  
0   15.817449  
1   16.014624  
2   10.185544  
3   10.149870  
4   10.013304  
.. 

In [5]:
x = train_data.iloc[:,1:5]
x["Work industry"] = pd.Series(np.where(train_data.Worked_in_service_industry.values =="yes",1,0)) # Changing the Premiums to integer values
x = x.drop(columns=['Worked_in_service_industry'])
x["Large meal _"] = pd.Series(np.where(train_data.Large_Meal.values =="yes",1,0)) # Changing the Premiums to integer values
x = x.drop(columns=["Large_Meal"])
y = train_data.iloc[:,[5]]


In [6]:
print(x)
print(y)

    P_smoker  Income ($K)  Work industry  Large meal _
0   0.081745   101.257746              1             1
1   0.101462   227.686797              1             0
2   0.018554   471.003974              0             0
3   0.014987   457.095231              0             1
4   0.001330   785.029310              0             1
..       ...          ...            ...           ...
95  0.950000   286.314459              0             0
96  0.980000   278.206809              0             1
97  0.967300   796.610512              0             0
98  0.970000   885.747488              0             0
99  0.990000   584.411075              0             1

[100 rows x 4 columns]
         %Tip
0   15.817449
1   16.014624
2   10.185544
3   10.149870
4   10.013304
..        ...
95  19.500000
96  19.800000
97  19.673000
98  24.700000
99  19.900000

[100 rows x 1 columns]


In [8]:
y["%Tip_encoded"] = pd.Series(np.where(y["%Tip"].values >=18,1,0))
y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y["%Tip_encoded"] = pd.Series(np.where(y["%Tip"].values >=18,1,0))


Unnamed: 0,%Tip,%Tip_encoded
0,15.817449,0
1,16.014624,0
2,10.185544,0
3,10.149870,0
4,10.013304,0
...,...,...
95,19.500000,1
96,19.800000,1
97,19.673000,1
98,24.700000,1


In [9]:
y__ = y[["%Tip_encoded"]]
y__

Unnamed: 0,%Tip_encoded
0,0
1,0
2,0
3,0
4,0
...,...
95,1
96,1
97,1
98,1


### Training our model

In [10]:
classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=3)
classifier.fit(x,y__)
classifier.print_tree()

X_0 <= 0.31698752480003123 ? 0.24856506238859163
 left:X_1 <= 798.6865931393814 ? 0.04013840830449819
  left:0.0
  right:X_2 <= 0.0 ? 0.48
    left:0.0
    right:1.0
 right:X_2 <= 0.0 ? 0.07759412304866858
  left:X_0 <= 0.7484692605222749 ? 0.2921190514773402
    left:X_1 <= 784.8337613563792 ? 0.15594002306805088
        left:0.0
        right:1.0
    right:1.0
  right:1.0


In [11]:
x_test = test_data.iloc[:,1:5]
x_test["Work industry"] = pd.Series(np.where(test_data.Worked_in_service_industry.values =="yes",1,0)) # Changing the Premiums to integer values
x_test = x_test.drop(columns=['Worked_in_service_industry'])
x_test["Large meal _"] = pd.Series(np.where(test_data.Large_Meal.values =="yes",1,0)) # Changing the Premiums to integer values
x_test = x_test.drop(columns=["Large_Meal"])
x_test

Unnamed: 0,P_smoker,Income ($K),Work industry,Large meal _
0,0.0,25,1,1
1,0.2,27,1,1
2,0.4,45,0,0
3,0.6,145,1,1
4,0.8,243,0,1
5,1.0,234,1,0
6,0.8,333,0,1
7,0.4,63,0,0
8,0.3,11,0,1
9,0.2,123,1,0


### Predicting the values on test data uisng our trained model

In [13]:

Y_pred = classifier.predict(np.array(x_test)) 


In [14]:
Y_pred

[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0]

# Building the Decision Tree model (Gini)

In [15]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        
        # for leaf node
        self.value = value

In [16]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree ''' 
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["info_gain"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y, "gini")
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child, mode="gini"):
        ''' function to compute information gain '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        if mode=="gini":
            gain = self.gini_index(parent) - (weight_l*self.gini_index(l_child) + weight_r*self.gini_index(r_child))
        else:
            gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain
    
    def entropy(self, y):
        ''' function to compute entropy '''
        
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
    
    def gini_index(self, y):
        ''' function to compute gini index '''
        
        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini
        
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        Y = list(Y)
        return max(Y, key=Y.count)
    
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        ''' function to predict new dataset '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    
    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

### Training our model

In [17]:
classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=3)
classifier.fit(x,y__)
classifier.print_tree()

X_0 <= 0.31698752480003123 ? 0.24856506238859163
 left:X_1 <= 798.6865931393814 ? 0.04013840830449819
  left:0.0
  right:X_2 <= 0.0 ? 0.48
    left:0.0
    right:1.0
 right:X_2 <= 0.0 ? 0.07759412304866858
  left:X_0 <= 0.7484692605222749 ? 0.2921190514773402
    left:X_1 <= 784.8337613563792 ? 0.15594002306805088
        left:0.0
        right:1.0
    right:1.0
  right:1.0


### Predicting the values on test data uisng our trained model

In [18]:

Y_pred = classifier.predict(np.array(x_test)) 


In [19]:
Y_pred

[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0]