In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def getData(dataset_name):
    attribute_file_name = 'Data/'+dataset_name+".attribute"
    dataset_file_name = 'Data/'+dataset_name+".data"
    att = pd.read_csv(attribute_file_name, 
                      delim_whitespace=True,
                     header = None)
    attributes = {rows[0]:rows[1] for _,rows in att.iterrows()}
    dataset = pd.read_csv(dataset_file_name,
                      names=list(attributes.keys()))
    return attributes, dataset


def entropy(target_col, col_type, split_point=0.0):
    if col_type == 'category':
        counts = list(data[split_attribute_name].value_counts().values)
#         _,counts = np.unique(target_col,return_counts = True)
    else:
        left = target_col <= split_point
        right = target_col > split_point
        # print(left, right)
        counts = [len(target_col[left]), len(target_col[right])]
    # print(split_point, counts)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(counts))])
    return entropy

def InfoGainRatio(data,split_attribute_name,split_att_type,target_name="class"):
    total_entropy = entropy(data[target_name], attributes[target_name])

    if split_att_type == 'category':
#         vals,counts= np.unique(data[split_attribute_name],return_counts=True
        print("check")
        tmp = data[split_attribute_name].value_counts()
        val = list(tmp.index)
        counts = list(tmp.values)
        
        information = data[split_att_type].value_counts()

        Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name], attributes[split_attribute_name]) for i in range(len(vals))])
        Information_Gain = total_entropy - Weighted_Entropy
        
        if Information_Gain == 0.0:
            return Information_Gain
        Gain_Ratio = Information_Gain/entropy(data[split_attribute_name], attributes[split_attribute_name])
        return Gain_Ratio, None
    else:
        values = list(np.unique(data[split_attribute_name]))
#         print("......................")
#         print(values)
        best = 0
        idx = None
        for val in values:
#             val = values[i]
            left = data[split_attribute_name] <= val
            right = data[split_attribute_name] > val
            counts = [len(data[split_attribute_name][left]), len(data[split_attribute_name][right])]
            # print(counts, val)
            Weighted_Entropy = (counts[0]/np.sum(counts))*entropy(data.where(data[split_attribute_name]<=val).dropna()[target_name], attributes[target_name], val) + (counts[1]/np.sum(counts))*entropy(data.where(data[split_attribute_name]>val).dropna()[target_name], attributes[target_name], val)
            Information_Gain = total_entropy - Weighted_Entropy
            if Information_Gain == 0.0:
                continue
            # print(Information_Gain)
            Gain_Ratio = Information_Gain/entropy(data[split_attribute_name], attributes[split_attribute_name], val)
#             print(entropy(data[split_attribute_name], attributes[split_attribute_name]), Gain_Ratio)
#             print("Gain ratio", Gain_Ratio)
            if Gain_Ratio>=best:
                best = Gain_Ratio
                idx = val
        # print(best, idx)
        return best, idx

def ID3(data,features,target_attribute_name="class",parent_node_class = None):
    if len(np.unique(data[target_attribute_name])) <= 1:
        return np.unique(data[target_attribute_name])[0]

#     elif len(data)==0 :
#         return np.unique(originaldata[target_attribute_name])[np.argmax(np.unique(originaldata[target_attribute_name],return_counts=True)[1])]

    elif len(features)==0 or len(data) == 0:
        return parent_node_class

    else:
        parent_node_class = np.unique(data[target_attribute_name])[np.argmax(np.unique(data[target_attribute_name],return_counts=True)[1])]

        item_values = []
        split_point = []
        for feature in features:
            GR, point = InfoGainRatio(data,feature,target_attribute_name)
            item_values.append(GR)
            split_point.append(point)
        best_feature_index = np.argmax(item_values)
        best_feature = features[best_feature_index]

        tree = {best_feature:{}}

        features = [i for i in features if i != best_feature]
        if attributes[best_feature] == 'category':
            for value in np.unique(data[best_feature]):
                value = value
                sub_data = data.where(data[best_feature] == value).dropna()

                subtree = ID3(sub_data,features,target_attribute_name,parent_node_class)
                tree[best_feature][value] = subtree

            return(tree)
        else:
            sub_data1 = data.where(data[best_feature]<=split_point[best_feature_index]).dropna()
            sub_data2 = data.where(data[best_feature]>split_point[best_feature_index]).dropna()
            subtree1 = ID3(sub_data1,features,target_attribute_name,parent_node_class)
            tree[best_feature][split_point[best_feature_index] - 0.00001] = subtree1
            subtree2 = ID3(sub_data2,features,target_attribute_name,parent_node_class)
            tree[best_feature][split_point[best_feature_index] + 0.00001] = subtree2
            return(tree)


def predict(query,tree,default = 1):
    if not isinstance(tree, dict):
        return tree
    att_name = list(tree.keys())[0]
    if attributes[att_name] == 'category':
        try:
            result_tree = tree[att_name][query[att_name]]
        except:
            return default
        result_tree = tree[att_name][query[att_name]]
        return predict(query, result_tree)
    else:
        key_vals = list(tree[att_name].keys())
        if abs(key_vals[0]-query[att_name]) < abs(key_vals[1]-query[att_name]):
            result_tree = tree[att_name][key_vals[0]]
        else:
            result_tree = tree[att_name][key_vals[1]]
        return predict(query, result_tree)

def printDecisionTree(tree,level):
    level += " "
    for key,value in tree.items():
        if isinstance(value, dict):
            print(level+str(key)+":")
            printDecisionTree(value, level)
        else: 
            print(level+str(key)+"-->",value)    


def test(data,tree):
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    predictions = [predict(query,tree) for query in queries]
    accuracy = accuracy_score(predictions, list(data["class"]))
    precision = precision_score(predictions, list(data["class"]), average="macro")
    recall = recall_score(predictions, list(data["class"]), average="macro")
    f1 = f1_score(predictions, list(data["class"]), average="macro")
    return accuracy, precision, recall, f1

attributes, dataset = getData(dataset_name='iris')
training_data, testing_data = train_test_split(dataset, test_size = 0.2)

tree = ID3(training_data,training_data,training_data.columns[:-1])
accuracy, precision, recall, f1 = test(testing_data,tree)
printDecisionTree(tree,"")


print("accuracy \t precision \t recall \t f1")
print("{:.2f}".format(accuracy*100),"%\t\t", 
      "{:.2f}".format(precision*100),"%\t",
      "{:.2f}".format(recall*100),"%\t", 
      "{:.2f}".format(f1*100),"%")

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
def get_data(dataset_name):
    attribute_file_name = 'Data/'+dataset_name+".attribute"
    dataset_file_name = 'Data/'+dataset_name+".data"
    att = pd.read_csv(attribute_file_name, 
                      delim_whitespace=True,
                     header = None)
    attributes = {rows[0]:rows[1] for _,rows in att.iterrows()}
    print(attributes)
#     for _ , rows in att.iterrows():
#         print(rows['c1'])
get_data(dataset_name='iris')

In [None]:
training_data.dtype

In [None]:
dtype(training_data)

In [None]:
training_data2, testing_data2 = train_test_split(dataset, test_size = 0.2)

training_data2

In [None]:
training_data

In [None]:
testing_data

In [None]:
4+5

In [None]:
max([3,1,5,0,2])