In [1]:
import pandas as pd 
import numpy as np 

In [102]:
DT =pd.read_csv("PlayTennis.csv") 
DT

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [3]:
def calc_total_entropy(DT,label):
    entropy = 0
    DT_count = len(DT)
    for _class in DT[label].unique():
        class_count = len(DT[DT[label]==_class])
        entropy+= - (class_count/DT_count) * np.log2(class_count/DT_count)
    return entropy
calc_total_entropy(DT,'Play Tennis')

0.9402859586706311

In [63]:
def calc_entropy(DT, feature_label, class_label):
    entropy = {}
    uniqued_feature = DT[feature_label].unique()
    uniqued_class = DT[class_label].unique()

    for feature in uniqued_feature:
        entropy_sum = 0
        feature_class_entropy = 0
        feature_count = len(DT[DT[feature_label]==feature])
        feature_p = feature_count / len(DT[feature_label])
        
        for _class in uniqued_class:
            feature_class_count = len(DT[ (DT[feature_label]==feature) & (DT[class_label]==_class) ])   
            if feature_class_count != 0:
                feature_class_probability = feature_class_count/feature_count 
                feature_class_entropy = - feature_class_probability * np.log2(feature_class_probability) 
            entropy_sum += feature_class_entropy
        entropy[feature] = {'P':feature_p,'H':entropy_sum}
        
    return entropy
calc_entropy(DT,'Outlook','Play Tennis')

{'Sunny': {'P': 0.35714285714285715, 'H': 0.9709505944546686},
 'Overcast': {'P': 0.2857142857142857, 'H': 0.0},
 'Rain': {'P': 0.35714285714285715, 'H': 0.9709505944546686}}

In [93]:
def calc_gain(DT,class_label):
    import operator
    features = DT.columns.difference([class_label])
    total_entropy = calc_total_entropy(DT,class_label)
    
    gains = {}
    for feature in features:
        I = 0
        entropies = calc_entropy(DT,feature,class_label)
        for item in entropies.keys() :
            I += entropies[item]['P'] * entropies[item]['H']
        gains[feature] = total_entropy - I
    best_feature = max(gains.items(), key=operator.itemgetter(1))[0]
    return {
        'gains' : gains,
        'best' : {
            'key' : best_feature,
            'value' : gains[best_feature],
        }
    }
calc_gain(DT,'Play Tennis')

{'gains': {'Humidity': 0.15183550136234159,
  'Outlook': 0.24674981977443933,
  'Temperature': 0.02922256565895487,
  'Wind': 0.04812703040826949},
 'best': {'key': 'Outlook', 'value': 0.24674981977443933}}

In [100]:
def sub_tree_maker(best_feature, DT, class_label):
    class_list = DT[class_label].unique()
    tree = {} 
    best_feature_unique_list = DT[best_feature].value_counts(sort=False)
    
    for best_feature_item_name, best_feature_item_count in best_feature_unique_list.iteritems():
        best_feature_item_data = DT[DT[best_feature] == best_feature_item_name]
        leaf_node = False 
        for _class in class_list: 
            class_count = len(best_feature_item_data[best_feature_item_data[class_label] == _class])
            if class_count == best_feature_item_count:
                tree[best_feature_item_name] = _class 
                DT = DT[DT[best_feature] != best_feature_item_name] 
                leaf_node = True
            if not leaf_node: 
                tree[best_feature_item_name] = "Branch"
    return tree, DT
sub_tree_maker('Outlook',DT,'Play Tennis')

({'Sunny': 'Branch', 'Overcast': 'Yes', 'Rain': 'Branch'},
    Outlook Temperature Humidity    Wind Play Tennis
 0    Sunny         Hot     High    Weak          No
 1    Sunny         Hot     High  Strong          No
 3     Rain        Mild     High    Weak         Yes
 4     Rain        Cool   Normal    Weak         Yes
 5     Rain        Cool   Normal  Strong          No
 7    Sunny        Mild     High    Weak          No
 8    Sunny        Cool   Normal    Weak         Yes
 9     Rain        Mild   Normal    Weak         Yes
 10   Sunny        Mild   Normal  Strong         Yes
 13    Rain        Mild     High  Strong          No)

In [136]:
def tree_maker(DT, class_label, class_unique_value, root, node=None):

    if len(DT) == 0: return False
    
    best_feature = calc_gain(DT, class_label)['best']['key'] 
    sub_tree, DT = sub_tree_maker(best_feature, DT, class_label) 
    
    next_root = None 
    if node != None: 
        root[node] = {}
        root[node][best_feature] = sub_tree
        next_root = root[node][best_feature]
    else: 
        root[best_feature] = sub_tree
        next_root = root[best_feature]      
    for node, branch in list(next_root.items()): 
        if branch == "Branch": 
            feature_value_data = DT[DT[best_feature] == node] 
            tree_maker(feature_value_data, class_label, class_unique_value, next_root, node)

In [203]:
def predict(tree, instance):
    if not isinstance(tree, dict): #if it is leaf node
        return tree #return the value
    else:
        root_node = next(iter(tree)) #getting first key/feature name of the dictionary
        feature_value = instance[root_node] #value of the feature
        if feature_value in tree[root_node]: #checking the feature value in current tree node
            return predict(tree[root_node][feature_value], instance) #goto next feature
        else:
            return None

In [231]:
def evaluate(tree, test_data_m, label):
    correct_preditct = 0
    wrong_preditct = 0
    for index, row in test_data_m.iterrows(): #for each row in the dataset
        result = predict(tree, test_data_m.iloc[index]) #predict the row
        if result == test_data_m[label].iloc[index]: #predicted value and expected value is same or not
            correct_preditct += 1 #increase correct count
        else:
            wrong_preditct += 1 #increase incorrect count
    accuracy = correct_preditct / (correct_preditct + wrong_preditct) #calculating accuracy
    return accuracy

# __main__

In [191]:
DT =pd.read_csv("./Diabetes.csv") 
DT.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [218]:
DTTEST =pd.read_csv("./Diabetes_Test.csv") 
DTTEST

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,190,92,0,0,35.5,0.278,66,1
1,2,88,58,26,16,28.4,0.766,22,0
2,9,170,74,31,0,44.0,0.403,43,1
3,9,89,62,0,0,22.5,0.142,33,0
4,10,101,76,48,180,32.9,0.171,63,0
5,2,122,70,27,0,36.8,0.34,27,0
6,5,121,72,23,112,26.2,0.245,30,0
7,1,126,60,0,0,30.1,0.349,47,1
8,1,93,70,31,0,30.4,0.315,23,0


In [215]:
DT[0:100].describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,4.61,117.94,68.07,19.9,74.35,30.996,0.46893,34.42,0.37
std,3.589814,33.712636,22.009298,16.306719,126.446649,9.611734,0.37117,11.212529,0.485237
min,0.0,0.0,0.0,0.0,0.0,0.0,0.102,21.0,0.0
25%,1.0,99.0,64.0,0.0,0.0,25.3,0.248,25.75,0.0
50%,4.0,112.5,72.0,21.5,0.0,31.8,0.343,31.0,0.0
75%,7.0,139.0,80.0,32.25,110.0,37.225,0.58375,42.0,1.0
max,15.0,197.0,110.0,60.0,846.0,49.7,2.288,60.0,1.0


In [216]:
DT[101:110].describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0
mean,1.444444,109.555556,74.666667,14.555556,42.888889,29.677778,0.392444,25.333333,0.111111
std,1.333333,27.231008,22.422087,14.266316,60.589695,6.196729,0.273926,4.924429,0.333333
min,0.0,81.0,56.0,0.0,0.0,22.4,0.179,21.0,0.0
25%,1.0,85.0,58.0,0.0,0.0,26.1,0.247,22.0,0.0
50%,1.0,96.0,65.0,18.0,18.0,28.7,0.283,24.0,0.0
75%,2.0,126.0,85.0,28.0,40.0,34.3,0.336,27.0,0.0
max,4.0,151.0,122.0,31.0,152.0,39.6,0.93,37.0,1.0


In [229]:
tree = {} 
class_label = DT.columns[-1]
class_uniqued_value = DT[class_label].unique()

tree_maker(DT[0:100], class_label, class_uniqued_value, tree) 
display(tree)

{'DiabetesPedigreeFunction': {0.627: 1,
  0.351: 0,
  0.672: 1,
  0.167: 0,
  2.288: 1,
  0.201: 0,
  0.248: {'Age': {26: 1, 21: 0}},
  0.134: 0,
  0.158: 1,
  0.232: 1,
  0.191: 0,
  0.537: 1,
  1.441: 0,
  0.398: 1,
  0.587: {'Age': {51: 1, 36: 0}},
  0.484: 1,
  0.551: 1,
  0.254: 1,
  0.183: 0,
  0.529: 1,
  0.704: 0,
  0.388: 0,
  0.451: 1,
  0.263: 1,
  0.205: 1,
  0.257: 1,
  0.487: 0,
  0.245: 0,
  0.337: 0,
  0.546: 0,
  0.851: 1,
  0.267: 0,
  0.188: 0,
  0.512: 0,
  0.966: 0,
  0.42: 0,
  0.665: 1,
  0.503: 1,
  1.39: 1,
  0.271: 0,
  0.696: 0,
  0.235: 0,
  0.721: 1,
  0.294: 0,
  1.893: 1,
  0.564: 0,
  0.586: 0,
  0.344: 1,
  0.305: 0,
  0.491: 0,
  0.526: 0,
  0.342: 0,
  0.467: 1,
  0.718: 0,
  0.962: 0,
  1.781: 0,
  0.173: 0,
  0.304: 0,
  0.27: 1,
  0.699: 0,
  0.258: {'Age': {42: 1, 21: 0}},
  0.203: 0,
  0.855: 1,
  0.845: 0,
  0.334: 0,
  0.189: 0,
  0.867: 1,
  0.411: 0,
  0.583: 1,
  0.231: 0,
  0.396: 0,
  0.14: 0,
  0.391: 0,
  0.37: 0,
  0.307: 0,
  0.102: 0,

In [236]:
evaluate(tree,DT[240:250],class_label)

IndexError: single positional indexer is out-of-bounds