In [28]:
import pandas as pd
import numpy as np
import math
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('always')

def getData(dataset_name):
    attribute_file_name = 'Data/'+dataset_name+".attribute"
    dataset_file_name = 'Data/'+dataset_name+".data"
    att = pd.read_csv(attribute_file_name,
                      delim_whitespace=True,
                     header = None)
    attributes = {rows[0]:rows[1] for _,rows in att.iterrows()}
    dataset = pd.read_csv(dataset_file_name,
                      names=list(attributes.keys()))
    return attributes, dataset


def getEntropy(target_col, col_type, split_point=0.0):
    if col_type == 'category':
        counts = list(target_col.value_counts().values)
    else:
        left = target_col <= split_point
        right = target_col > split_point
        counts = [len(target_col[left]), len(target_col[right])]
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(counts))])
    return entropy

def InfoGainRatio(data,split_attribute_name,split_att_type,target_name="class"):
    total_entropy = getEntropy(data[target_name], attributes[target_name])
    

    if split_att_type == 'category':
        tmp = data[split_attribute_name].value_counts()
        val = list(tmp.index)
        counts = list(tmp.values)
        information = data[split_att_type].value_counts()
        Weighted_Entropy = np.sum([(counts[i]/np.sum(counts))*getEntropy(data.where(data[split_attribute_name]==vals[i]).dropna()[target_name], attributes[split_attribute_name]) for i in range(len(vals))])
        Information_Gain = total_entropy - Weighted_Entropy
        if Information_Gain == 0.0:
            return Information_Gain
        Gain_Ratio = Information_Gain/getEntropy(data[split_attribute_name], attributes[split_attribute_name])
        return Gain_Ratio, None
    else:
        values = list(np.unique(data[split_attribute_name]))
        best = 0
        idx = None
        for val in values:
            left = data[split_attribute_name] <= val
            right = data[split_attribute_name] > val
            counts = [len(data[split_attribute_name][left]), len(data[split_attribute_name][right])]
            Weighted_Entropy = (counts[0]/np.sum(counts))*getEntropy(data.where(data[split_attribute_name]<=val).dropna()[target_name], attributes[target_name], val) + (counts[1]/np.sum(counts))*getEntropy(data.where(data[split_attribute_name]>val).dropna()[target_name], attributes[target_name], val)
            Information_Gain = total_entropy - Weighted_Entropy
            if Information_Gain == 0.0:
                continue
            Gain_Ratio = Information_Gain/getEntropy(data[split_attribute_name], attributes[split_attribute_name], val)
            if Gain_Ratio>=best:
                best = Gain_Ratio
                idx = val
        return best, idx

def makeDecisionTree(data,features,target_attribute_name="class",parent_node_class = None):
    if len(features)==0 or len(data) == 0 or len(data.columns) == 1:
        return parent_node_class
    try:
        if len(np.unique(data[target_attribute_name])) <= 1:
            return np.unique(data[target_attribute_name])[0]
    except KeyError:
        print("Key Error")
    else:
        parent_node_class = data[target_attribute_name].value_counts().idxmax()
        max_GR = -math.inf 
        for feature in features:
            GR, point = InfoGainRatio(data,feature,target_attribute_name)
            if GR> max_GR:
                max_GR = GR
                split_point = point
                best_feature = feature
        tree = {best_feature:{}}
        features = features[features != best_feature]
        if attributes[best_feature] == 'category':
            grouped = data.groupby(data[best_feature])
            for value in np.unique(data[best_feature]):
                sub_data = grouped.get_group(value)
                if best_feature != target_attribute_name:
                    del sub_data[best_feature]
                subtree = makeDecisionTree(sub_data,features,target_attribute_name,parent_node_class)
                tree[best_feature][value] = subtree
            return(tree)
        else:
            sub_data1 = data[data[best_feature]<=split_point]
            sub_data2 = data[data[best_feature]>split_point]
            if best_feature != target_attribute_name:
                del sub_data1[best_feature]
                del sub_data2[best_feature]
            subtree1 = makeDecisionTree(sub_data1,features,target_attribute_name,parent_node_class)
            subtree2 = makeDecisionTree(sub_data2,features,target_attribute_name,parent_node_class)
            tree[best_feature][split_point] = [subtree1, subtree2]
            return(tree)


def predict(query,tree,default = 1):
    if not isinstance(tree, dict):
        return tree
    att_name = list(tree.keys())[0]
    if attributes[att_name] == 'category':
        try:
            result_tree = tree[att_name][query[att_name]]
        except:
            return default
        result_tree = tree[att_name][query[att_name]]
        return predict(query, result_tree)
    else:
        key_val = list(tree[att_name].keys())[0]
        if  query[att_name]<=key_val:
            result_tree = tree[att_name][key_val][0]
        else:
            result_tree = tree[att_name][key_val][1]
        return predict(query, result_tree)

def printDecisionTree(tree,level):
    level += " "
    for key,value in tree.items():
        if isinstance(value, dict):
            print(level+str(key)+":")
            printDecisionTree(value, level)
        elif isinstance(value,list):
            if isinstance(value[0],dict):
                print(level+str(key)+">= (less than):")
                printDecisionTree(value[0], level)
            else:
                print(level+str(key)+">= (less than):"+"-->",value[0])
                
            if isinstance(value[1],dict):
                print(level+str(key)+" < (greater than):")
                printDecisionTree(value[1], level)
            else:
                print(level+str(key)+"< (greater than):"+"-->",value[1])
        else: 
            print(level+str(key)+"-->",value)    


def test(data,tree, features):
    original_data = list(data['class'])
    queries = data[features].to_dict(orient = "records")
    predictions = [predict(query,tree) for query in queries]
    accuracy = accuracy_score(original_data, predictions)
    precision = precision_score(original_data, predictions, average="macro")
    recall = recall_score(original_data, predictions, average="macro")
    f1 = f1_score(original_data, predictions, average="macro")
    return accuracy*100, precision*100, recall*100, f1*100

attributes, dataset = getData(dataset_name='WholesaleCustomer')
dataset = dataset.dropna()
print(attributes.keys())

print("Data Load successfully!!!")
for j in [0.2]:    
    testSize = j
    for i in range(5):
        training_data, testing_data = train_test_split(dataset, test_size = testSize)
        # print(attributes)
        features = training_data.columns
        features = features[features!= 'class']

        tree = makeDecisionTree(training_data,features)
        accuracy, precision, recall, f1 = test(testing_data,tree, features)

        print('DecisionTree,'+str(testSize)+","+str(accuracy)+","+str(precision)+","+str(recall)+","+str(f1))

# print("accuracy \t precision \t recall \t f1")
# print("{:.2f}".format(accuracy),"%\t\t", 
#       "{:.2f}".format(precision),"%\t",
#       "{:.2f}".format(recall),"%\t", 
#       "{:.2f}".format(f1),"%")

dict_keys(['TR', 'TM', 'TL', 'MR', 'MM', 'ML', 'BR', 'BM', 'BL', 'class'])
Data Load successfully!!!


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.1,76.04166666666666,56.17690058479532,51.028433151845135,53.35981463091692


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.1,82.29166666666666,59.64015151515151,54.785954785954786,57.07083876575402


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.1,83.33333333333334,60.478927203065126,54.071075123706706,56.67001843472431


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.1,83.33333333333334,56.423611111111114,54.07969639468691,55.142195767195766


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.1,76.04166666666666,54.88215488215488,50.649350649350644,52.6029526029526


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.2,86.45833333333334,58.517060367454064,56.18085618085619,57.287157287157285


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.2,81.77083333333334,57.469519208649636,54.7979797979798,55.9801630404968


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.2,78.64583333333334,56.512721238938056,51.651893634165994,53.95167379813742


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.2,85.9375,57.47927031509121,56.770833333333336,57.07953705983747


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.2,86.45833333333334,60.21971288515405,56.78811158964808,58.425925925925924


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.3,84.72222222222221,59.18289423444062,54.75972004821791,56.74984191838125


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.3,81.94444444444444,58.0542264752791,54.03472931562819,55.957554736019524


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.3,85.41666666666666,58.178613631356534,55.7375241246209,56.93189881671557


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.3,78.81944444444444,54.940302522399485,51.188686000716075,52.990664533394835


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.3,84.375,57.213403880070544,54.81919083422843,55.96848743427084


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.4,77.08333333333334,55.12946574893478,50.63251106894371,52.78448803427062


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.4,80.46875,55.22995916612938,52.79250618593142,53.98372731706065


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.4,80.46875,56.66738166738167,51.9370651486401,54.037786774628884


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


DecisionTree,0.4,79.42708333333334,55.44513757918662,50.977352181182546,53.091858648426324
DecisionTree,0.4,79.94791666666666,55.41863564764328,50.819574594675,52.72124374864101


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


In [10]:
data.colums

NameError: name 'data' is not defined

In [None]:
training_data.columns

In [None]:
len(training_data.columns)