In [1]:
import pandas as pd 
import numpy as np 

In [53]:
# Load the small PlayTennis table; the cells below use it to try out each helper.
DT = pd.read_csv("PlayTennis.csv")
DT.head()

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


In [3]:
def calc_total_entropy(DT,label):
    """Return the Shannon entropy (base 2) of column ``label`` in ``DT``.

    Parameters
    ----------
    DT : pandas.DataFrame
        Table containing the label column.
    label : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    number
        Sum over the distinct label values of ``-p * log2(p)``, where ``p`` is
        the fraction of rows carrying that value. ``0`` when there are no
        distinct values to iterate over.
    """
    n_rows = len(DT)
    label_column = DT[label]
    total = 0
    for value in label_column.unique():
        # Fraction of all rows whose label equals this value.
        share = len(DT[label_column == value]) / n_rows
        total += -share * np.log2(share)
    return total
# Quick check: entropy of the 'Play Tennis' label column on the loaded table.
calc_total_entropy(DT, 'Play Tennis')

0.9402859586706311

In [4]:
def calc_entropy(DT, feature_label, class_label):
    """Compute, for every distinct value of a feature, its weight and class entropy.

    Parameters
    ----------
    DT : pandas.DataFrame
        Table holding both the feature column and the class column.
    feature_label : str
        Name of the feature column to analyse.
    class_label : str
        Name of the class (target) column.

    Returns
    -------
    dict
        Maps each distinct feature value to ``{'P': p, 'H': h}`` where ``p`` is
        the fraction of rows carrying that feature value and ``h`` is the
        base-2 entropy of the class column restricted to those rows.
    """
    entropy = {}
    total_rows = len(DT[feature_label])
    uniqued_class = DT[class_label].unique()

    for feature in DT[feature_label].unique():
        feature_mask = DT[feature_label] == feature
        feature_count = len(DT[feature_mask])
        feature_p = feature_count / total_rows

        entropy_sum = 0
        for _class in uniqued_class:
            feature_class_count = len(DT[feature_mask & (DT[class_label] == _class)])
            # A class that does not occur in this subset contributes nothing
            # (0 * log2(0) is taken as 0). Skipping it here also prevents the
            # previous class's term from being counted a second time.
            if feature_class_count == 0:
                continue
            feature_class_probability = feature_class_count / feature_count
            entropy_sum += -feature_class_probability * np.log2(feature_class_probability)

        entropy[feature] = {'P': feature_p, 'H': entropy_sum}

    return entropy
# Quick check: per-value weight (P) and class entropy (H) for the 'Outlook' feature.
calc_entropy(DT, 'Outlook', 'Play Tennis')

{'Sunny': {'P': 0.35714285714285715, 'H': 0.9709505944546686},
 'Overcast': {'P': 0.2857142857142857, 'H': 0.0},
 'Rain': {'P': 0.35714285714285715, 'H': 0.9709505944546686}}

In [5]:
def calc_gain(DT,class_label):
    """Compute the information gain of every non-class column and pick the best.

    Parameters
    ----------
    DT : pandas.DataFrame
        Table with the candidate feature columns and the class column.
    class_label : str
        Name of the class (target) column; every other column is a candidate.

    Returns
    -------
    dict
        ``{'gains': {feature: gain, ...},
           'best': {'key': best_feature, 'value': best_gain}}``.
        Candidates are visited in the order produced by
        ``DT.columns.difference``; on ties the first one visited wins.
    """
    base_entropy = calc_total_entropy(DT, class_label)

    gains = {}
    for column in DT.columns.difference([class_label]):
        # Expected entropy after splitting on this column: sum of P * H.
        weighted_entropy = 0
        for stats in calc_entropy(DT, column, class_label).values():
            weighted_entropy += stats['P'] * stats['H']
        gains[column] = base_entropy - weighted_entropy

    winner = max(gains, key=gains.get)
    return {
        'gains': gains,
        'best': {'key': winner, 'value': gains[winner]},
    }
# Quick check: information gain of every feature with respect to 'Play Tennis'.
calc_gain(DT, 'Play Tennis')

{'gains': {'Humidity': 0.15183550136234159,
  'Outlook': 0.24674981977443933,
  'Temperature': 0.02922256565895487,
  'Wind': 0.04812703040826949},
 'best': {'key': 'Outlook', 'value': 0.24674981977443933}}

In [6]:
def sub_tree_maker(best_feature, DT, class_label):
    """Build one tree level for ``best_feature`` and return the rows still to split.

    Parameters
    ----------
    best_feature : str
        Column whose distinct values become the keys of this level.
    DT : pandas.DataFrame
        Rows to split. The caller's frame is not modified; a filtered frame is
        returned instead.
    class_label : str
        Name of the class (target) column.

    Returns
    -------
    tuple(dict, pandas.DataFrame)
        ``tree`` maps each feature value either to a class value (all rows with
        that feature value share the class, so it is a leaf) or to the marker
        string ``"Branch"`` (the rows are mixed and need a further split).
        The returned frame is ``DT`` without the rows that became leaves.
    """
    class_list = DT[class_label].unique()
    tree = {}
    value_counts = DT[best_feature].value_counts(sort=False)

    # Series.items() is used because Series.iteritems() no longer exists in
    # pandas 2.x.
    for feature_value, feature_value_count in value_counts.items():
        subset = DT[DT[best_feature] == feature_value]
        for _class in class_list:
            class_count = len(subset[subset[class_label] == _class])
            if class_count == feature_value_count:
                # Every row with this feature value has the same class: leaf.
                tree[feature_value] = _class
                DT = DT[DT[best_feature] != feature_value]
                break
        else:
            # No single class covers the whole subset: needs another split.
            tree[feature_value] = "Branch"
    return tree, DT
# Quick check: one tree level for 'Outlook' plus the rows that still need splitting.
sub_tree_maker('Outlook', DT, 'Play Tennis')

({'Sunny': 'Branch', 'Overcast': 'Yes', 'Rain': 'Branch'},
    Outlook Temperature Humidity    Wind Play Tennis
 0    Sunny         Hot     High    Weak          No
 1    Sunny         Hot     High  Strong          No
 3     Rain        Mild     High    Weak         Yes
 4     Rain        Cool   Normal    Weak         Yes
 5     Rain        Cool   Normal  Strong          No
 7    Sunny        Mild     High    Weak          No
 8    Sunny        Cool   Normal    Weak         Yes
 9     Rain        Mild   Normal    Weak         Yes
 10   Sunny        Mild   Normal  Strong         Yes
 13    Rain        Mild     High  Strong          No)

In [7]:
def tree_maker(DT, class_label, class_unique_value, root, node=None):
    """Recursively grow a decision tree into the nested dict ``root``.

    Parameters
    ----------
    DT : pandas.DataFrame
        Training rows for the current node.
    class_label : str
        Name of the class (target) column.
    class_unique_value : sequence
        Not used by this function; it is only forwarded to recursive calls.
    root : dict
        Dict that receives the (sub)tree. It is modified in place.
    node : hashable, optional
        Key inside ``root`` under which the subtree is attached. ``None`` means
        the subtree is attached directly at the top of ``root``.

    Returns
    -------
    False when ``DT`` is empty, otherwise ``None``; the result is delivered
    through ``root``.

    Notes
    -----
    The feature used for a split is removed from the rows handed to the child
    calls, so each level has one candidate fewer and the recursion is bounded
    by the number of feature columns. When rows are still mixed after every
    feature has been used, the branch becomes a leaf holding the most frequent
    class of those rows.
    """
    if len(DT) == 0:
        return False

    if len(DT.columns.difference([class_label])) == 0:
        # Nothing left to split on: settle the branch with the majority class.
        if node is not None:
            root[node] = DT[class_label].value_counts().idxmax()
        return

    best_feature = calc_gain(DT, class_label)['best']['key']
    sub_tree, DT = sub_tree_maker(best_feature, DT, class_label)

    if node is not None:
        root[node] = {best_feature: sub_tree}
    else:
        root[best_feature] = sub_tree
    next_root = sub_tree

    # Iterate over a snapshot: recursive calls replace "Branch" entries in place.
    for feature_value, branch in list(next_root.items()):
        if branch == "Branch":
            feature_value_data = DT[DT[best_feature] == feature_value].drop(columns=[best_feature])
            tree_maker(feature_value_data, class_label, class_unique_value, next_root, feature_value)

In [8]:
def predict(tree, instance):
    """Walk a nested-dict decision tree and return the predicted class.

    Parameters
    ----------
    tree : dict or leaf value
        Tree of the form ``{feature: {feature_value: subtree_or_leaf, ...}}``.
        Anything that is not a dict is treated as a leaf and returned as is.
    instance : mapping
        Row to classify; indexed by feature name (e.g. a pandas Series or dict).

    Returns
    -------
    The leaf value reached, or ``None`` when the tree is empty or the
    instance's feature value has no branch in the tree.
    """
    current = tree
    while isinstance(current, dict):
        if not current:
            # An empty tree has no feature to test, so nothing can be predicted.
            return None
        feature_name = next(iter(current))  # each node dict is keyed by its feature
        branches = current[feature_name]
        feature_value = instance[feature_name]
        if feature_value not in branches:
            return None  # value not present in this node's branches
        current = branches[feature_value]
    return current

In [9]:
def evaluate(tree, test_data_m, label):
    """Return the fraction of rows in ``test_data_m`` that ``tree`` classifies correctly.

    Parameters
    ----------
    tree : dict
        Decision tree in the nested-dict form consumed by ``predict``.
    test_data_m : pandas.DataFrame
        Rows to score; must contain the ``label`` column.
    label : str
        Name of the column holding the expected class.

    Returns
    -------
    float
        ``correct / total``. A prediction of ``None`` counts as wrong.
        Returns ``0.0`` when ``test_data_m`` has no rows.
    """
    total = len(test_data_m)
    if total == 0:
        return 0.0

    expected_values = test_data_m[label]
    correct_predict = 0
    # Rows are addressed by position so the result does not depend on the
    # frame's index labels (e.g. a slice that was not reset).
    for position in range(total):
        result = predict(tree, test_data_m.iloc[position])
        if result == expected_values.iloc[position]:
            correct_predict += 1
    return correct_predict / total

# <font color="orange">Make Tree</font>

In [58]:
# Load the diabetes table and keep only the rows without missing values.
DT = pd.read_csv("Diabetes.csv").dropna()
DT.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [59]:
# Disabled workaround for very deep recursion in tree_maker:
# import sys, threading
# sys.setrecursionlimit(10**7)
# threading.stack_size(2**27)

# Grow the tree on the first 700 rows; the rows after that are scored separately below.
tree = {}
class_label = DT.columns[-1]  # the last column is used as the class label
class_uniqued_value = DT[class_label].unique()

train_rows = DT.iloc[:700].reset_index(drop=True)
tree_maker(train_rows, class_label, class_uniqued_value, tree)
display(tree)

{'DiabetesPedigreeFunction': {0.627: 1,
  0.351: 0,
  0.672: 1,
  0.167: 0,
  2.288: 1,
  0.201: 0,
  0.248: {'Age': {26: 1, 21: 0}},
  0.134: 0,
  0.158: {'Age': {53: 1, 21: 0}},
  0.232: 1,
  0.191: 0,
  0.537: 1,
  1.441: 0,
  0.398: 1,
  0.587: {'Age': {51: 1, 36: 0, 68: 0}},
  0.484: 1,
  0.551: {'Age': {31: 1, 38: 0, 67: 0, 21: 0}},
  0.254: {'Age': {31: 1, 51: 1, 41: 1, 40: 0, 65: 0, 36: 1}},
  0.183: {'Age': {33: 0, 38: 1}},
  0.529: 1,
  0.704: 0,
  0.388: 0,
  0.451: 1,
  0.263: {'Age': {29: 1, 33: 0, 38: 0, 25: 0}},
  0.205: {'Age': {41: 1, 24: 0, 29: 1}},
  0.257: {'Age': {43: 1, 23: 0, 44: 1}},
  0.487: 0,
  0.245: {'Age': {57: 0, 36: 0, 40: 1}},
  0.337: {'Age': {38: 0, 36: 1, 29: 1}},
  0.546: 0,
  0.851: 1,
  0.267: 0,
  0.188: 0,
  0.512: 0,
  0.966: 0,
  0.42: 0,
  0.665: 1,
  0.503: 1,
  1.39: 1,
  0.271: 0,
  0.696: 0,
  0.235: {'Age': {48: 0, 27: 0, 70: 1}},
  0.721: 1,
  0.294: 0,
  1.893: 1,
  0.564: 0,
  0.586: {'Age': {22: 0, 51: 1}},
  0.344: 1,
  0.305: 0,
  

# <font color="orange">Predict And Calculate Accuracy</font>

In [61]:
# Score the tree on the rows after the first 700.
# NOTE(review): predict returns None for a feature value that has no branch in
# the tree and evaluate counts that as wrong; with branches keyed by exact
# numeric values this probably explains a low score - confirm.
evaluate(tree, DT.iloc[700:].reset_index(drop=True), class_label)

0.23529411764705882