In [1]:
import pandas as pd 
import numpy as np 

In [102]:
DT =pd.read_csv("PlayTennis.csv") 
DT

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [3]:
def calc_total_entropy(DT,label):
    entropy = 0
    DT_count = len(DT)
    for _class in DT[label].unique():
        class_count = len(DT[DT[label]==_class])
        entropy+= - (class_count/DT_count) * np.log2(class_count/DT_count)
    return entropy
calc_total_entropy(DT,'Play Tennis')

0.9402859586706311

In [63]:
def calc_entropy(DT, feature_label, class_label):
    entropy = {}
    uniqued_feature = DT[feature_label].unique()
    uniqued_class = DT[class_label].unique()

    for feature in uniqued_feature:
        entropy_sum = 0
        feature_class_entropy = 0
        feature_count = len(DT[DT[feature_label]==feature])
        feature_p = feature_count / len(DT[feature_label])
        
        for _class in uniqued_class:
            feature_class_count = len(DT[ (DT[feature_label]==feature) & (DT[class_label]==_class) ])   
            if feature_class_count != 0:
                feature_class_probability = feature_class_count/feature_count 
                feature_class_entropy = - feature_class_probability * np.log2(feature_class_probability) 
            entropy_sum += feature_class_entropy
        entropy[feature] = {'P':feature_p,'H':entropy_sum}
        
    return entropy
calc_entropy(DT,'Outlook','Play Tennis')

{'Sunny': {'P': 0.35714285714285715, 'H': 0.9709505944546686},
 'Overcast': {'P': 0.2857142857142857, 'H': 0.0},
 'Rain': {'P': 0.35714285714285715, 'H': 0.9709505944546686}}

In [93]:
def calc_gain(DT,class_label):
    import operator
    features = DT.columns.difference([class_label])
    total_entropy = calc_total_entropy(DT,class_label)
    
    gains = {}
    for feature in features:
        I = 0
        entropies = calc_entropy(DT,feature,class_label)
        for item in entropies.keys() :
            I += entropies[item]['P'] * entropies[item]['H']
        gains[feature] = total_entropy - I
    best_feature = max(gains.items(), key=operator.itemgetter(1))[0]
    return {
        'gains' : gains,
        'best' : {
            'key' : best_feature,
            'value' : gains[best_feature],
        }
    }
calc_gain(DT,'Play Tennis')

{'gains': {'Humidity': 0.15183550136234159,
  'Outlook': 0.24674981977443933,
  'Temperature': 0.02922256565895487,
  'Wind': 0.04812703040826949},
 'best': {'key': 'Outlook', 'value': 0.24674981977443933}}

In [100]:
def sub_tree_maker(best_feature, DT, class_label):
    class_list = DT[class_label].unique()
    tree = {} 
    best_feature_unique_list = DT[best_feature].value_counts(sort=False)
    
    for best_feature_item_name, best_feature_item_count in best_feature_unique_list.iteritems():
        best_feature_item_data = DT[DT[best_feature] == best_feature_item_name]
        leaf_node = False 
        for _class in class_list: 
            class_count = len(best_feature_item_data[best_feature_item_data[class_label] == _class])
            if class_count == best_feature_item_count:
                tree[best_feature_item_name] = _class 
                DT = DT[DT[best_feature] != best_feature_item_name] 
                leaf_node = True
            if not leaf_node: 
                tree[best_feature_item_name] = "Branch"
    return tree, DT
sub_tree_maker('Outlook',DT,'Play Tennis')

({'Sunny': 'Branch', 'Overcast': 'Yes', 'Rain': 'Branch'},
    Outlook Temperature Humidity    Wind Play Tennis
 0    Sunny         Hot     High    Weak          No
 1    Sunny         Hot     High  Strong          No
 3     Rain        Mild     High    Weak         Yes
 4     Rain        Cool   Normal    Weak         Yes
 5     Rain        Cool   Normal  Strong          No
 7    Sunny        Mild     High    Weak          No
 8    Sunny        Cool   Normal    Weak         Yes
 9     Rain        Mild   Normal    Weak         Yes
 10   Sunny        Mild   Normal  Strong         Yes
 13    Rain        Mild     High  Strong          No)

In [136]:
def tree_maker(DT, class_label, class_unique_value, root, node=None):

    if len(DT) == 0: return False
    
    best_feature = calc_gain(DT, class_label)['best']['key'] 
    sub_tree, DT = sub_tree_maker(best_feature, DT, class_label) 
    
    next_root = None 
    if node != None: 
        root[node] = {}
        root[node][best_feature] = sub_tree
        next_root = root[node][best_feature]
    else: 
        root[best_feature] = sub_tree
        next_root = root[best_feature]      
    for node, branch in list(next_root.items()): 
        if branch == "Branch": 
            feature_value_data = DT[DT[best_feature] == node] 
            tree_maker(feature_value_data, class_label, class_unique_value, next_root, node)

In [203]:
def predict(tree, instance):
    if not isinstance(tree, dict): #if it is leaf node
        return tree #return the value
    else:
        root_node = next(iter(tree)) #getting first key/feature name of the dictionary
        feature_value = instance[root_node] #value of the feature
        if feature_value in tree[root_node]: #checking the feature value in current tree node
            return predict(tree[root_node][feature_value], instance) #goto next feature
        else:
            return None

In [204]:
def evaluate(tree, test_data_m, label):
    correct_preditct = 0
    wrong_preditct = 0
    for index, row in test_data_m.iterrows(): #for each row in the dataset
        result = predict(tree, test_data_m.iloc[index]) #predict the row
        if result == test_data_m[label].iloc[index]: #predicted value and expected value is same or not
            correct_preditct += 1 #increase correct count
        else:
            wrong_preditct += 1 #increase incorrect count
    accuracy = correct_preditct / (correct_preditct + wrong_preditct) #calculating accuracy
    return accuracy

# __main__

In [191]:
DT =pd.read_csv("./Diabetes.csv") 
DT.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [193]:
DTTEST =pd.read_csv("./Diabetes_Test.csv") 
DTTEST

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,190,92,0,0,35.5,0.278,66,1
1,2,88,58,26,16,28.4,0.766,22,0
2,9,170,74,31,0,44.0,0.403,43,1
3,9,89,62,0,0,22.5,0.142,33,0
4,10,101,76,48,180,32.9,0.171,63,0
5,2,122,70,27,0,36.8,0.34,27,0
6,5,121,72,23,112,26.2,0.245,30,0
7,1,126,60,0,0,30.1,0.349,47,1
8,1,93,70,31,0,30.4,0.315,23,0


In [210]:
tree = {} 
class_label = DT.columns[-1]
class_uniqued_value = DT[class_label].unique()

tree_maker(DT[0:100], class_label, class_uniqued_value, tree) 

In [209]:
evaluate(tree,DT[101:110],class_label)

IndexError: single positional indexer is out-of-bounds