In [11]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split

'''
data = pd.read_excel('DecisionData.xlsx','Sheet1')
data = data[["District","House Type","Incom","Previous Customer","Outcome"]]

# attribute
features = data[["District", "House Type","Incom","Previous Customer"]]
# Label for decision
target = data["Outcome"]
'''

data = pd.read_csv('decision_tree_data.csv',encoding='utf-8')

# split data set(30) into training set(27), test set(3) randomly
# data : original data we want to split
# test_size : size of test data set
# random_state = used to split data randomly as random seed
train, test= train_test_split(data, test_size=0.1,random_state=random.randint(1,100))

# attribute
features = train[["level", "lang","tweets","phd"]]
# Label for decision
target = train["interview"]

# attribute
features_test = test[["level", "lang","tweets","phd"]]
# Label for decision
target_test = test["interview"]

print(train) # 27
print("****************************************************TEST")
print(test) # 3

     level    lang tweets  phd  interview
0   senior    java     no   no      False
25     mid  python    yes  yes       True
28     mid    java     no   no      False
22  junior       R    yes  yes       True
6      mid       R    yes  yes       True
9   junior  python    yes   no       True
18  senior    java     no  yes       True
15  senior       R    yes  yes       True
8   senior       R    yes   no       True
17  junior    java    yes  yes      False
14  senior    java    yes  yes      False
26     mid    java    yes  yes      False
24  senior    java     no   no      False
7   senior  python     no   no      False
20     mid  python    yes  yes      False
3   junior  python     no   no       True
13  junior  python     no  yes      False
23     mid    java     no   no       True
10  senior  python    yes  yes       True
16     mid    java    yes   no       True
29     mid  python     no  yes      False
2      mid  python     no   no       True
4   junior       R    yes   no    

In [12]:
# Calculate entropy about each node
# *Input*
# target_col : node to calculate
# *Output*
# return entropy : Return the calculated entropy value.
def entropy(target) :
    # find the number of'Respond', 'Nothing' in input attribute
    element, count = np.unique(target,return_counts=True) 
    
    # Calculate entropy
    entropy = -np.sum([(count[i]/np.sum(count))*np.log(count[i]/np.sum(count)) for i in range(len(element))])
    
    return entropy

# Calculate information gain to set the node on which to classify
# *Input*
# data : original data
# split_attribute_name : child node(branch) of parent node.
# -> If it have a lot of information gain, it can be a classification criterion.
# target_name : mean 'Label' attribute name - "Outcome"
# *Output*
# return Information_Gain : it will be used to find largest information gain and select next node
def InfoGain(data,split_attribute_name,target_name) :
    
    # calculate total entropy
    total_entropy = entropy(data[target_name])
    
    # Calculate sum of entropy of child nodes.
    # The weights are taken into account and calculated.
    vals, counts = np.unique(data[split_attribute_name],return_counts=True)
    
    # 선택된 attribute의 자식들 = unique 한 값들의 weight 을 고려하여 entroypy 계산
    Weighted_Entropy = np.sum([counts[i]*
                               entropy(data.where(data[split_attribute_name]
                                                  ==vals[i]).dropna()[target_name])
                               for i in range(len(vals))])/np.sum(counts)
    
    # Calculate information gain
    Information_Gain = total_entropy - Weighted_Entropy
    return Information_Gain

In [13]:
# Create overall tree by selecting some attribute as classification criteria
# use entropy, information gain function
# *input *
# data : Data other than attribution completed with calculation and tree configuration
# originaldata : original data
# features : attribution used for decision making except label
# target_attribute_name : Lable - "Outcome"
# * Output *
#  return structure of tree
def ID3(data,originaldata,features,target_attribute_name,parent_node_class = None):
 
    # Define the criteria that are stopped.
 
    # 1.If the target property has a single value: return the target property
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]
 
    # 2. Data missing: Returns the target property with the maximum value from the source data
    elif len(data)==0:
        return np.unique(originaldata[target_attribute_name])\
               [np.argmax(np.unique(originaldata[target_attribute_name], return_counts=True)[1])]
 
    # 3. When no technical properties exist: return destination properties of the parent node
    elif len(features) ==0:
        return parent_node_class
 
    # Perform structuralization, such as adding nodes and attribution to the tree.
    else:
        # Define destination attribute for parent node such as "Outcom"
        parent_node_class = np.unique(data[target_attribute_name])\
                            [np.argmax(np.unique(data[target_attribute_name], return_counts=True)[1])]
        
        # Select attribute to split data
        item_values = [InfoGain(data,feature,target_attribute_name) for feature in features] # get informatinGain to all attributes
        best_feature_index = np.argmax(item_values) # find attribute that has largest informatin gain
        best_feature = features[best_feature_index]
        
        
       # InfoGainResult(data,best_feature,target_attribute_name) # print entropy about child node of selected attribute
       # print(best_feature,"information gain : ",round(np.max(item_values),5)) # print information gain about selected attribute
        
        # create tree
        tree = {best_feature:{}}
        
        # except attribute that has largest information gain
        features = [i for i in features if i != best_feature]
        # create and select branch node
        for value in np.unique(data[best_feature]):
            # data partitioning. delete data have attribution completed with calculation and tree configuration
            sub_data = data.where(data[best_feature] == value).dropna()
            
            # Use recursive function to set classification criteria for newly formed branches
            subtree = ID3(sub_data,data,features,target_attribute_name,parent_node_class)
            tree[best_feature][value] = subtree # Add a branch to an existing tree
            
                        
        return(tree)


In [14]:
def classify(tree, input):
    # Input is classified based on the given tree.
    
    # if it is leaf node, return tree
    if tree in [True, False]:
        return tree

    #attribute, subtree_dict = tree.items()
    # if it is not leaf node, will be classify.
    # to do this, get the nodes that are the criteria for classification from tree
    attribute = list(tree.keys()).pop()
    subtree_dict = list(tree.values())[0]

    # get attribute of input
    subtree_key = input.get(attribute)


    # When there is not sub tree about key data of input --> return None
    if subtree_key not in subtree_dict:
        subtree_key = None 
        return None

    # select sub tree for key of input data
    subtree = subtree_dict[subtree_key]
    
    # classify input data and  recursive function for classify about sub-tree
    return classify(subtree, input)

In [15]:
tree = ID3(train, train, ["level", "lang","tweets","phd"],"interview")
from pprint import pprint
print("* Structure of decision tree *")
pprint(tree)

print("****Classify****")

for i in range(3) :
    result = classify(tree,test.iloc[i])
    print(test.iloc[i])
    if result == 1 :
        print("Result: Interview OK!\n")
    elif result == 0 :
        print("Result: Interview NO!\n")
    else :
        print("Result: None\n")
  



* Structure of decision tree *
{'lang': {'R': 1.0,
          'java': {'level': {'junior': 0.0,
                             'mid': {'phd': {'no': {'tweets': {'no': 1.0,
                                                               'yes': 1.0}},
                                             'yes': 0.0}},
                             'senior': {'phd': {'no': 0.0,
                                                'yes': {'tweets': {'no': 0.0,
                                                                   'yes': 0.0}}}}}},
          'python': {'phd': {'no': {'level': {'junior': 1.0,
                                              'mid': 1.0,
                                              'senior': 0.0}},
                             'yes': {'level': {'junior': 0.0,
                                               'mid': {'tweets': {'no': 0.0,
                                                                  'yes': 0.0}},
                                               'senior': 1.0}}}}}}
****C