In [3]:
import pandas as pd 
import numpy as np 

In [6]:
train_data =pd.read_csv("./Diabetes.csv") 
train_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:


#Importing Train_Dataset 
#print(train_data.head()) 

#Calculating Information_Gain
def info_gain(feature_name, train_data, target, class_list):
    feature_info_gain = 0
    feature_value_list = train_data[feature_name].unique()
    total_row = train_data.shape[0]
    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = feature_entropy(feature_value_data, target, class_list)
        feature_value_p = feature_value_count/total_row
        feature_info_gain += feature_value_p * feature_value_entropy 
    return dataset_entropy(train_data, target, class_list) - feature_info_gain 

#Calculating Total_Entropy for each classes
def dataset_entropy(train_data, target, class_list):
    total_entropy = 0
    total_row = train_data.shape[0]    
    for x in class_list: 
        each_class_count = train_data[train_data[target] == x].shape[0] 
        each_class_entropy = - (each_class_count/total_row)*np.log2(each_class_count/total_row) 
        total_entropy += each_class_entropy
    return total_entropy
    
#Calculating Entropy for each feature of the dataset
def feature_entropy(feature_value_data, target, class_list):
    entropy = 0 
    total_class_count = feature_value_data.shape[0] 
    for x in class_list:
        each_class_count = feature_value_data[feature_value_data[target] == x].shape[0] 
        class_entropy = 0
        if each_class_count != 0:
            class_p = each_class_count/total_class_count 
            class_entropy = - class_p * np.log2(class_p) 
        entropy += class_entropy
    return entropy

#Finding the best feature that has the highest Information_Gain
def best_feature(train_data, target, class_list):
    max_info_gain = 0
    max_info_feature = None 
    feature_list = train_data.columns.drop(target)                                     
    for feature in feature_list:
        feature_info_gain = info_gain(feature, train_data, target, class_list)
        if max_info_gain < feature_info_gain: 
            max_info_gain = feature_info_gain
            max_info_feature = feature          
    return max_info_feature

#Building Branches of Decision_Tree                 
def sub_tree(feature_name, train_data, target, class_list):
    tree = {} 
    feature_value_count = train_data[feature_name].value_counts(sort=False)
    for feature_value, count in feature_value_count.iteritems():
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        leaf_node = False 
        for x in class_list: 
            each_class_count = feature_value_data[feature_value_data[target] == x].shape[0] 
            if each_class_count == count: #Reaching to a leaf_node
                tree[feature_value] = x 
                train_data = train_data[train_data[feature_name] != feature_value] 
                leaf_node = True
        if not leaf_node: 
            tree[feature_value] = "Branch" #Tree should be expanded
    return tree, train_data

#Building Decision_Tree
def ID3(root, parent_node, train_data, target, class_list):
    if train_data.shape[0] != 0: 
        max_info_feature = best_feature(train_data, target, class_list) 
        tree, train_data = sub_tree(max_info_feature, train_data, target, class_list) 
        next_root = None 
        if parent_node != None: 
            root[parent_node] = dict()
            root[parent_node][max_info_feature] = tree
            next_root = root[parent_node][max_info_feature]
        else: #add to root of the tree
            root[max_info_feature] = tree
            next_root = root[max_info_feature]      
        for node, branch in list(next_root.items()): 
            if branch == "Branch": #if it is expandable
                feature_value_data = train_data[train_data[max_info_feature] == node] 
                ID3(next_root, node, feature_value_data, target, class_list)

#Predicting New_new_instances for Test_dataset
def predict(tree, new_instance):
    if not isinstance(tree, dict): #Whether reaching a leaf_node or not
        return tree 
    else:
        root_node = next(iter(tree))
        feature_value = new_instance[root_node] 
        if feature_value in tree[root_node]: 
            return predict(tree[root_node][feature_value], new_instance) 
        else:
            return None
        
#Starting Training the dataset with making a Decicion_Tree 
target="Outcome"
tree = {} 
class_list = train_data[target].unique() 
ID3(tree, None, train_data[0:100], target, class_list)
print ('The final Decision_Tree is:',tree)

#Importing Test_Dataset for prediction of new_new_instances
test_data = train_data[101:110]

#Starting Testing new_data as Test_dataset & calculating Accuracy 
correct_count = 0
wrong_count = 0
for index, row in test_data.iterrows(): 
    result = predict(tree, test_data.iloc[index]) 
    if result == test_data[target].iloc[index]: 
        correct_count += 1 
    else:
        wrong_count+= 1 
accuracy =  correct_count / ( correct_count + wrong_count) 
print ('Accuracy=', accuracy)
                


The final Decision_Tree is: {'DiabetesPedigreeFunction': {0.627: 1, 0.351: 0, 0.672: 1, 0.167: 0, 2.288: 1, 0.201: 0, 0.248: {'Pregnancies': {3: 1, 1: 0}}, 0.134: 0, 0.158: 1, 0.232: 1, 0.191: 0, 0.537: 1, 1.441: 0, 0.398: 1, 0.587: {'Glucose': {166: 1, 44: 0}}, 0.484: 1, 0.551: 1, 0.254: 1, 0.183: 0, 0.529: 1, 0.704: 0, 0.388: 0, 0.451: 1, 0.263: 1, 0.205: 1, 0.257: 1, 0.487: 0, 0.245: 0, 0.337: 0, 0.546: 0, 0.851: 1, 0.267: 0, 0.188: 0, 0.512: 0, 0.966: 0, 0.42: 0, 0.665: 1, 0.503: 1, 1.39: 1, 0.271: 0, 0.696: 0, 0.235: 0, 0.721: 1, 0.294: 0, 1.893: 1, 0.564: 0, 0.586: 0, 0.344: 1, 0.305: 0, 0.491: 0, 0.526: 0, 0.342: 0, 0.467: 1, 0.718: 0, 0.962: 0, 1.781: 0, 0.173: 0, 0.304: 0, 0.27: 1, 0.699: 0, 0.258: {'Pregnancies': {7: 1, 1: 0}}, 0.203: 0, 0.855: 1, 0.845: 0, 0.334: 0, 0.189: 0, 0.867: 1, 0.411: 0, 0.583: 1, 0.231: 0, 0.396: 0, 0.14: 0, 0.391: 0, 0.37: 0, 0.307: 0, 0.102: 0, 0.767: 0, 0.237: 0, 0.227: 1, 0.698: 0, 0.178: 0, 0.324: 0, 0.153: 1, 0.165: 0, 0.443: 0, 0.261: 0, 0.27

IndexError: single positional indexer is out-of-bounds