In [5]:
import pandas as pd
import numpy as np
import math
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def getData(dataset_name):
    attribute_file_name = 'Data/'+dataset_name+".attribute"
    dataset_file_name = 'Data/'+dataset_name+".data"
    att = pd.read_csv(attribute_file_name, 
                      delim_whitespace=True,
                     header = None)
    attributes = {rows[0]:rows[1] for _,rows in att.iterrows()}
    dataset = pd.read_csv(dataset_file_name,
                      names=list(attributes.keys()))
    print(dataset.head(5))
    return attributes, dataset


def entropy(target_col, col_type, split_point=0.0):
    if col_type == 'category':
        counts = list(target_col.value_counts().values)
    else:
        left = target_col <= split_point
        right = target_col > split_point
        # print(left, right)
        counts = [len(target_col[left]), len(target_col[right])]
    # print(split_point, counts)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(counts))])
    return entropy

def InfoGainRatio(data,split_attribute_name,split_att_type,target_name="class"):
    total_entropy = entropy(data[target_name], attributes[target_name])
    

    if split_att_type == 'category':
        tmp = data[split_attribute_name].value_counts()
        val = list(tmp.index)
        counts = list(tmp.values)
        
        information = data[split_att_type].value_counts()

        Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name], attributes[split_attribute_name]) for i in range(len(vals))])
        Information_Gain = total_entropy - Weighted_Entropy
        
        if Information_Gain == 0.0:
            return Information_Gain
        Gain_Ratio = Information_Gain/entropy(data[split_attribute_name], attributes[split_attribute_name])
        return Gain_Ratio, None
    else:
        values = list(np.unique(data[split_attribute_name]))
#         print("......................")
#         print(values)
        best = 0
        idx = None
        for val in values:
#             val = values[i]
            left = data[split_attribute_name] <= val
            right = data[split_attribute_name] > val
            counts = [len(data[split_attribute_name][left]), len(data[split_attribute_name][right])]
            # print(counts, val) 
            Weighted_Entropy = (counts[0]/np.sum(counts))*entropy(data.where(data[split_attribute_name]<=val).dropna()[target_name], attributes[target_name], val) + (counts[1]/np.sum(counts))*entropy(data.where(data[split_attribute_name]>val).dropna()[target_name], attributes[target_name], val)
            Information_Gain = total_entropy - Weighted_Entropy
            if Information_Gain == 0.0:
                continue
            # print(Information_Gain)
            Gain_Ratio = Information_Gain/entropy(data[split_attribute_name], attributes[split_attribute_name], val)
#             print(entropy(data[split_attribute_name], attributes[split_attribute_name]), Gain_Ratio)
#             print("Gain ratio", Gain_Ratio)
            if Gain_Ratio>=best:
                best = Gain_Ratio
                idx = val
        # print(best, idx)
        return best, idx

def makeDecisionTree(data,features,target_attribute_name="class",parent_node_class = None):
#     print(len(data))
#     print("tan",target_attribute_name)
#     print(features)
    
    if len(features)==0 or len(data) == 0 or len(data.columns) == 1:
        return parent_node_class

    try:
        if len(np.unique(data[target_attribute_name])) <= 1:
            return np.unique(data[target_attribute_name])[0]
    except KeyError:
        print(data)
    

    else:
        parent_node_class = data[target_attribute_name].value_counts().idxmax()
        max_GR = -math.inf 
        
        for feature in features:
            GR, point = InfoGainRatio(data,feature,target_attribute_name)
            if GR> max_GR:
                max_GR = GR
                split_point = point
                best_feature = feature

        tree = {best_feature:{}}

        features = features[features != best_feature]
        if attributes[best_feature] == 'category':
            grouped = data.groupby(data[best_feature])
            for value in np.unique(data[best_feature]):
                sub_data = grouped.get_group(value)
                if best_feature != target_attribute_name:
                    del sub_data[best_feature]
                subtree = makeDecisionTree(sub_data,features,target_attribute_name,parent_node_class)
                tree[best_feature][value] = subtree

            return(tree)
        else:
            sub_data1 = data[data[best_feature]<=split_point]
            sub_data2 = data[data[best_feature]>split_point]
            if best_feature != target_attribute_name:
                del sub_data1[best_feature]
                del sub_data2[best_feature]
            subtree1 = makeDecisionTree(sub_data1,features,target_attribute_name,parent_node_class)
            subtree2 = makeDecisionTree(sub_data2,features,target_attribute_name,parent_node_class)
            tree[best_feature][split_point] = [subtree1, subtree2]
            return(tree)


def predict(query,tree,default = 1):
    if not isinstance(tree, dict):
        return tree
    att_name = list(tree.keys())[0]
    if attributes[att_name] == 'category':
        try:
            result_tree = tree[att_name][query[att_name]]
        except:
            return default
        result_tree = tree[att_name][query[att_name]]
        return predict(query, result_tree)
    else:
        key_val = list(tree[att_name].keys())[0]
        if  query[att_name]<=key_val:
            result_tree = tree[att_name][key_val][0]
        else:
            result_tree = tree[att_name][key_val][1]
        return predict(query, result_tree)

def printDecisionTree(tree,level):
    level += " "
    for key,value in tree.items():
        if isinstance(value, dict):
            print(level+str(key)+":")
            printDecisionTree(value, level)
        elif isinstance(value,list):
            if isinstance(value[0],dict):
                print(level+str(key)+">= (less than):")
                printDecisionTree(value[0], level)
            else:
                print(level+str(key)+">= (less than):"+"-->",value[0])
                
            if isinstance(value[1],dict):
                print(level+str(key)+" < (greater than):")
                printDecisionTree(value[1], level)
            else:
                print(level+str(key)+"< (greater than):"+"-->",value[1])
        else: 
            print(level+str(key)+"-->",value)    


def test(data,tree, features):
    original_data = list(data['class'])
    queries = data[features].to_dict(orient = "records")
    predictions = [predict(query,tree) for query in queries]
    accuracy = accuracy_score(predictions, original_data)
    precision = precision_score(predictions, original_data, average="macro")
    recall = recall_score(predictions, original_data, average="macro")
    f1 = f1_score(predictions, original_data, average="macro")
    return accuracy, precision, recall, f1

attributes, dataset = getData(dataset_name='winequality-red')
training_data, testing_data = train_test_split(dataset, test_size = 0.2)

# print(attributes)
features = training_data.columns
features = features[features!= 'class']

tree = makeDecisionTree(training_data,features)
accuracy, precision, recall, f1 = test(testing_data,tree, features)
printDecisionTree(tree,"")
# print(tree)

print("accuracy \t precision \t recall \t f1")
print("{:.2f}".format(accuracy*100),"%\t\t", 
      "{:.2f}".format(precision*100),"%\t",
      "{:.2f}".format(recall*100),"%\t", 
      "{:.2f}".format(f1*100),"%")

  buying  maint doors persons lug_boot safety  class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc
 persons:
  2--> unacc
  4:
   safety:
    high:
     buying:
      high:
       maint:
        high--> acc
        low--> acc
        med--> acc
        vhigh--> unacc
      low:
       maint:
        high:
         lug_boot:
          big--> vgood
          med:
           doors:
            2--> acc
            3--> acc
            4--> acc
            5more--> acc
          small--> acc
        low:
         lug_boot:
          big--> vgood
          med:
           doors:
            2--> vgood
            4--> vgood
            5more--> vgood
          small--> good
        med:
         lug_boot:
          big--> vgood
          med:
           doors:
            2--

In [71]:
data.colums

NameError: name 'data' is not defined

In [72]:
training_data.columns

Index(['class', 'Alcohol', 'MalicAcid', 'Ash', 'AlcalinityofAsh', 'Magnesium',
       'Phenols', 'Flavanoids', 'NonflavanoidPhenols', 'Proanthocyanins',
       'ColorIntensity', 'Hue', 'DilutedWines', 'Proline'],
      dtype='object')

In [73]:
len(training_data.columns)

14