In [1]:
# Topgyal Gurung
# Decision Tree Algorithm from scratch on monk dataset

import pandas as pd
import numpy as np
import random
from pprint import pprint

In [2]:
# load and prepare dataset
# train_set
monk1_train=pd.read_csv("/Users/topgyalgurung/Desktop/monk-dataset/train1.csv")
monk2_train=pd.read_csv("/Users/topgyalgurung/Desktop/monk-dataset/train2.csv")
monk3_train=pd.read_csv("/Users/topgyalgurung/Desktop/monk-dataset/train3.csv")
# test_set
monk1_test=pd.read_csv("/Users/topgyalgurung/Desktop/monk-dataset/test1.csv", header=None)
monk2_test=pd.read_csv("/Users/topgyalgurung/Desktop/monk-dataset/test2.csv", header=None)
monk3_test=pd.read_csv("/Users/topgyalgurung/Desktop/monk-dataset/test3.csv", header=None)

In [3]:
monk1_train=monk1_train.drop("Id", axis=1)
monk2_train=monk2_train.drop("Id",axis=1)
monk3_train=monk3_train.drop("Id",axis=1)

monk1_test=monk1_test.drop([7],axis=1)
monk2_test=monk2_test.drop([7],axis=1)
monk3_test=monk3_test.drop([7],axis=1)

In [4]:
# dataframe into numpy array
monk1_data=monk1_train.values
monk2_data=monk2_train.values
monk3_data=monk3_train.values

In [5]:
#data[:2]  # print first 2 rows
#data[:,0] # print first class data

## HELPER FUNCTIONS

In [6]:
label_column=monk1_data[:,0]   # class
np.unique(label_column,return_counts=True) #0,1

(array([0, 1]), array([62, 62]))

In [7]:
# use max class as index
unique_classes,counts_unique_classes=np.unique(label_column,return_counts=True)
_,counts_unique_classes=np.unique(label_column,return_counts=True)
print(unique_classes) 
print(len(unique_classes))
print(counts_unique_classes) # 50-50 

[0 1]
2
[62 62]


In [8]:
# base case check if data is pure or just one class type
def check_purity(data):
    label_column=data[:,0]
    unique_classes=np.unique(label_column)
    
    if len(unique_classes)==1:
        return True
    else:
        return False

In [9]:
def classify_data(data):
    label_column=data[:,0]  # label_column = class
    unique_classes,counts_unique_classes=np.unique(label_column,return_counts=True) # check distinct column for class
    # use max class as index 
    index=counts_unique_classes.argmax()
    classification=unique_classes[index]
    return classification

### POTENTIAL SPLITS

In [10]:
monk1_data.shape  # 124 rows, 7 columns

(124, 7)

In [11]:
 #iterate over column 
_,n_columns=monk1_train.shape
for column_i in range(1,n_columns):  # column index
    values=monk1_data[:,column_i]
    #unique_values=np.unique(values)
    unique_values,unique_value_counts=np.unique(values,return_counts=True)
    if column_i!=0:
        print(unique_values) 
        print(unique_value_counts)
        print("")

[1 2 3]
[45 42 37]

[1 2 3]
[35 42 47]

[1 2]
[65 59]

[1 2 3]
[42 39 43]

[1 2 3 4]
[29 31 30 34]

[1 2]
[56 68]



In [12]:
# potential splits
def get_potential_splits(data):
    potential_splits={}
    _,n_columns=data.shape  #iterate over column
    for column_i in range(1,n_columns):  # column index exclude class
        values = data[:, column_i]
        unique_values = np.unique(values)
        potential_splits[column_i]=unique_values
    return potential_splits  

In [13]:
# should start from 1: classes should not include
get_potential_splits(monk1_data)

{1: array([1, 2, 3]),
 2: array([1, 2, 3]),
 3: array([1, 2]),
 4: array([1, 2, 3]),
 5: array([1, 2, 3, 4]),
 6: array([1, 2])}

# SPLIT DATA

In [14]:
# split data to data equal if equal and data_not for not equal
def split_data(data,split_column,split_value):
    #random.shuffle(data)
    split_column_values=data[:,split_column]
    data_equal=data[split_column_values==split_value]
    data_not=data[split_column_values!=split_value]
    return data_equal,data_not  

In [15]:
# split_value=data[:,1] # e.g a1
# split=split_data(train_data,2,split_value)
# _,data_not=split
#data_equal,_=split
# print(data_not)

# entropy 

In [16]:
label_column=monk1_data[:,0] # first column=class

In [17]:
_,counts=np.unique(label_column,return_counts=True)
print(counts)
counts.sum()
counts/counts.sum()

[62 62]


array([0.5, 0.5])

In [18]:
# entropy (each feature)
def entropy(data):
    label_column=data[:,0]
    _,counts=np.unique(label_column,return_counts=True)
    
    probabilities=counts/counts.sum()
    entropy=sum(probabilities*-np.log2(probabilities))
    return entropy

In [19]:
entropy(monk1_data)

1.0

In [20]:
entropy(monk2_data)

0.957117428264771

In [21]:
entropy(monk3_data)

0.999806132804711

In [22]:
def overall_entropy(data_below, data_above):
    n=len(data_below)+len(data_above)
    p_data_below=len(data_below)/n
    p_data_above=len(data_above)/n
    overall_entropy=(p_data_below * entropy(data_below) + p_data_above * entropy(data_above))
    return overall_entropy

In [23]:
get_potential_splits(monk1_data)

{1: array([1, 2, 3]),
 2: array([1, 2, 3]),
 3: array([1, 2]),
 4: array([1, 2, 3]),
 5: array([1, 2, 3, 4]),
 6: array([1, 2])}

# Determine best split

In [24]:
# best_split
def best_split(data,potential_splits):
    temp_entropy=999
    for column_index in potential_splits:
        for value in potential_splits[column_index]:
            data_below,data_above=split_data(data,split_column=column_index,split_value=value)
            current_overall_entropy=overall_entropy(data_below,data_above)
            
            if current_overall_entropy<=temp_entropy:
                temp_entropy=current_overall_entropy
                best_split_column=column_index
                best_split_value=value
    return best_split_column, best_split_value

# representation of decision tree
sub_tree={question:[1,2,3,4]} # depends on attribute

In [25]:
potential_splits=get_potential_splits(monk1_data)

In [26]:
best_split(monk1_data,potential_splits)

(5, 1)

In [27]:
potential_splits=get_potential_splits(monk1_data)
split_column,split_value=best_split(monk1_data,potential_splits)
data_below,data_above=split_data(monk1_data,split_column,split_value)
#data_below

In [28]:
# recursive decision_tree algorithm
def decision_tree(train_set,counter=0):
    if counter==0:
        global column_headers
        column_headers=train_set.columns
        data=train_set.values  # numpy array
    else:
        data= train_set   # pandas dataframe
    
    # base case
    if check_purity(data):
        classification=classify_data(data)
        return classification
    # recursive
    else:
        counter+=1
        # run helper functions
        potential_splits=get_potential_splits(data)
        split_column,split_value=best_split(data,potential_splits)
        data_below,data_above=split_data(data,split_column,split_value)

        # sub_tree
        feature_name=column_headers[split_column]
        question="{} == {}".format(feature_name,split_value)
        sub_tree={question:[]} #empty list
        # find ans (recursive part)
        yes_ans=decision_tree(data_below,counter)
        no_ans=decision_tree(data_above,counter)
        
        if yes_ans==no_ans:
            sub_tree=yes_ans
        else:
            sub_tree[question].append(yes_ans)
            sub_tree[question].append(no_ans)

        return sub_tree

In [29]:
tree=decision_tree(monk1_train)

In [30]:
tree2=decision_tree(monk2_train)

In [31]:
tree3=decision_tree(monk3_train)

In [32]:
#monk1_train.columns

In [33]:
pprint(tree)

{'a5 == 1': [1,
             {'a1 == 1': [{'a2 == 1': [1, 0]},
                          {'a2 == 1': [0,
                                       {'a5 == 3': [{'a2 == 3': [{'a1 == 3': [1,
                                                                              0]},
                                                                 {'a1 == 3': [0,
                                                                              1]}]},
                                                    {'a4 == 1': [{'a6 == 2': [{'a3 == 2': [1,
                                                                                           {'a5 == 4': [{'a2 == 3': [1,
                                                                                                                     0]},
                                                                                                        1]}]},
                                                                              1]},
                                   

In [34]:
pprint(tree2)

{'a4 == 1': [{'a5 == 1': [{'a6 == 2': [{'a3 == 2': [{'a2 == 3': [1, 0]}, 0]},
                                       0]},
                          {'a1 == 1': [{'a6 == 2': [{'a3 == 2': [1, 0]}, 0]},
                                       {'a6 == 2': [{'a3 == 2': [{'a2 == 1': [1,
                                                                              0]},
                                                                 {'a2 == 1': [0,
                                                                              1]}]},
                                                    {'a3 == 2': [{'a2 == 1': [0,
                                                                              1]},
                                                                 0]}]}]}]},
             {'a5 == 2': [{'a3 == 2': [{'a2 == 2': [0,
                                                    {'a1 == 1': [1,
                                                                 {'a2 == 3': [0,
                          

In [35]:
pprint(tree3)

{'a2 == 3': [{'a4 == 1': [{'a3 == 2': [0, {'a1 == 1': [0, 1]}]}, 0]},
             {'a5 == 4': [0,
                          {'a5 == 3': [{'a3 == 2': [1,
                                                    {'a1 == 3': [1,
                                                                 {'a6 == 2': [{'a4 == 3': [{'a2 == 2': [{'a1 == 2': [0,
                                                                                                                     1]},
                                                                                                        1]},
                                                                                           1]},
                                                                              {'a4 == 1': [{'a2 == 2': [1,
                                                                                                        0]},
                                                                                           0]}]}]}]},
             

## classification

In [80]:

example=monk1_train.iloc[2] # first row
example

class    1
a1       1
a2       1
a3       1
a4       3
a5       2
a6       1
Name: 2, dtype: int64

In [81]:
# list
tree.keys()

dict_keys(['a5 == 1'])

In [82]:
# get string
list(tree.keys())[0]

'a5 == 1'

In [83]:
question=list(tree.keys())[0]
feature_name,operator,value=question.split(" ")
feature_name

'a5'

In [84]:
def classify_test(test,tree):
    question=list(tree.keys())[0]
    feature_name,operator,value=question.split(" ")
    
    if example[feature_name] == value:
        ans=tree[question][0]
    else:
        ans=tree[question][1]
    
    # base case
    if not isinstance(ans,dict):
        return ans
    # recursion
    else:
        residual_tree=ans
        return classify_test(example,residual_tree)

In [86]:
classify_test(example,tree)

1

## Accuracy

In [104]:
def accuracy(df,tree):
    df["classification"]=df.apply(classify_test, axis=1,args=(tree,))
    df["classification_correct"]=df["classification"]==df["class"]
    
    accuracy=df["classification_correct"].mean()
    return accuracy

In [106]:
accuracy(train_set,tree)

0.5

# decision tree

In [108]:
# train_dfn=pd.read_csv()
# test_dfn=pd.read_csv()

# tree_clf=decision_tree(train_dfn)
# accuracy=accuracy(test_dfn,tree_clf)

# pprint(tree_clf)
# print(accuracy)