In [1]:

from os import write
import pandas as pd
import xlsxwriter
import numpy as np
import math

In [2]:
df = pd.read_excel("trainDATA.xlsx")
training_data = df.iloc[:,:].values
header = df.columns

In [3]:
def unique_vals(rows, column):
    unique_values_arr = []
    for row in rows:
        unique_values_arr.append(row[column])
    unique_values = set(unique_values_arr)
    return unique_values

In [4]:
def class_acceptibility_counts(rows):
    counts = {} 
    for row in rows:
        size_of_counts = row[-1]
        if size_of_counts not in counts:
            counts[size_of_counts] = 0
        counts[size_of_counts] += 1
    return counts

In [5]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

In [6]:
class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        val = example[self.column]
        if is_numeric(val):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))

In [7]:
def partition(rows, question):
    true_rows, false_rows = [], []
    for row in rows:
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [8]:
def cal_entropy(rows):
    counts = class_acceptibility_counts(rows)
    entropy_result = 0
    for acceptibilities in counts:
        size_of_acceptibilities = counts[acceptibilities] / float(len(rows))
        try:
            entropy_result += (-size_of_acceptibilities*(math.log2(size_of_acceptibilities)))
        except:
            entropy_result += 0
    print(entropy_result)
    return entropy_result

In [9]:
def cal_gain(left, right, system_entropy):
    size_of_acceptibilities = float(len(left)) / (len(left) + len(right))
    return system_entropy - size_of_acceptibilities * cal_entropy(left) - (1 - size_of_acceptibilities) * cal_entropy(right)

In [10]:
system_entropy = cal_entropy(training_data)


0.7872804393182292


In [11]:
def find_best(rows):
    best_gain = 0 
    best_question = None
    system_entropy = cal_entropy(rows)
    n_features = len(rows[0]) - 1

    for col in range(n_features): 

        values = set([row[col] for row in rows])

        for val in values:  

            question = Question(col, val)

            true_rows, false_rows = partition(rows, question)

            if len(true_rows) == 0 or len(false_rows) == 0:
                continue

            gain = cal_gain(true_rows, false_rows, system_entropy)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [12]:
class Leaf:
    def __init__(self, rows):
        self.predictions = class_acceptibility_counts(rows)

In [13]:
class Decision_Node:
    def __init__(self,question,true_branch,false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [14]:
def build_tree(rows):
    gain, question = find_best(rows)

    if gain == 0:
        return Leaf(rows)

    true_rows, false_rows = partition(rows, question)

    true_branch = build_tree(true_rows)

    false_branch = build_tree(false_rows)

    return Decision_Node(question, true_branch, false_branch)

In [15]:
def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return

    print (spacing + str(node.question))

    print (spacing + 'True:')
    print_tree(node.true_branch, spacing + "  ")

    print (spacing + 'False:')
    print_tree(node.false_branch, spacing + "  ")

In [16]:
my_tree = build_tree(training_data)

0.7872804393182292
0.6256869870089501
0.8347756013217693
0.7994852069542833
0.7828202581796384
0.8208660768919656
0.777237644388995
0.8793419305040906
0.75135853850485
0.8171387756817723
0.7762912242902802
0.8072782057018655
0.781081016266322
0.8457590140108325
0.766237955740555
0.6662998308582158
0.8239669592378774
0.6886987913976084
0.816583479098704
0.8088077134828131
0.7798959085642831
0.8292743048721372
0.7723238991962156
0.8112781244591328
0.7790770580735856
0.0
0.9567929028921142
0.9672947789468944
0.648307257349972
0.9453042722469087
0.6709665978143192
0.6840384356390417
0.8340751313153625
0.8249658680139926
0.7674044165839726
0.8431619569518378
0.7578137553096411
0.0
0.9486131982385806
0.9111942950464766
0.7016642239899264
0.9772665935373288
0.6266982158348953
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.9567929028921142
0.7888108387831283
0.9885403823755425
0.9510825404057

In [17]:
print_tree(my_tree)

Is Persons == 2?
True:
  Predict {1: 491}
False:
  Is Safety == 1?
  True:
    Predict {1: 303}
  False:
    Is Price == 1?
    True:
      Is MaintPrice == 4?
      True:
        Predict {1: 40}
      False:
        Is MaintPrice == 1?
        True:
          Predict {1: 38}
        False:
          Is Lug_size == 1?
          True:
            Is Safety == 3?
            True:
              Is NoofDoors == 2?
              True:
                Is Persons == 6?
                True:
                  Predict {1: 2}
                False:
                  Predict {2: 2}
              False:
                Predict {2: 10}
            False:
              Predict {1: 13}
          False:
            Is Safety == 3?
            True:
              Predict {2: 24}
            False:
              Is NoofDoors == 2?
              True:
                Is Lug_size == 3?
                True:
                  Predict {2: 3}
                False:
                  Predict {1: 3}
         

In [18]:
def classify(row, node):
    if isinstance(node, Leaf):
        return node.predictions

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [19]:
df_testing = pd.read_excel("testDATA.xlsx")
testing_data = df_testing.iloc[:,:].values

In [20]:
def prediction(testing_data):
    car_acceptibility = []
    for row in testing_data:
        car_acceptibility.append(list(classify(row, my_tree).keys())[0])
    return car_acceptibility

In [21]:
def add_column(dataframe, column_name, column_data):
    car_acceptibility_arr = prediction(testing_data)
    dataframe[column_name] = car_acceptibility_arr
    return dataframe
    # return dataframe.to_excel("testDATA.xlsx", index=False)

In [22]:
add_column(df_testing, "Car Acceptibility", prediction(testing_data))

Unnamed: 0,Price,MaintPrice,NoofDoors,Persons,Lug_size,Safety,Car Acceptibility
0,4,3,4,4,3,1,1
1,2,3,3,4,3,2,2
2,1,4,4,4,2,3,1
3,4,3,4,6,1,2,2
4,1,4,5,4,1,2,1
...,...,...,...,...,...,...,...
289,1,3,5,2,3,3,1
290,2,1,2,4,3,3,2
291,3,3,2,4,1,2,2
292,3,4,5,4,1,1,1
