In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt', names = ["variance", "skewness", "curtosis", "entropy", "class"])

In [3]:
df = df.astype(np.float64)
df.head()

Unnamed: 0,variance,skewness,curtosis,entropy,class
0,3.6216,8.6661,-2.8073,-0.44699,0.0
1,4.5459,8.1674,-2.4586,-1.4621,0.0
2,3.866,-2.6383,1.9242,0.10645,0.0
3,3.4566,9.5228,-4.0112,-3.5944,0.0
4,0.32924,-4.4552,4.5718,-0.9888,0.0


In [4]:
train, test = train_test_split(df, test_size = 0.5)

In [5]:
train_data = train.values
test_data = test.values

In [6]:
def isPure (data) :
    label = data[:, -1]
    different = np.unique(label)
    
    if len(different) == 1 :
        return True
    else:
        return False

def predict (data) :
    label = data[:, -1]
    different, counts = np.unique(label, return_counts = True)
    
    i = counts.argmax()
    return different[i]

In [7]:
def potential_splits (data) :
    potential_split = {}
    _, n = data.shape
    for i in range(n - 1) :
        values = data[:, i]
        different = np.unique(values)
        
        potential_split[i] = different
    return potential_split
        

In [8]:
def split (data, column, value) :
    values = data[:, column]
    left = data[values <= value]
    right = data[values > value]
    
    return left, right

In [9]:
def entropy (data) :
    label = data[:, -1]
    different, count = np.unique(label, return_counts = True)
    prob = count / count.sum()
    e = sum(prob * -np.log2(prob))
    
    return e

In [10]:
def total_entropy(left, right) :
    total = len(left) + len(right)
    pleft = len(left) / total
    pright = len(right) / total
    
    te = (pleft * entropy(left) + pright * entropy(right))
    
    return te

In [11]:
def best_split(data) :
    potential_split = potential_splits(data)
    
    te = 10000000
    for i in potential_split :
        for j in potential_split[i] :
            left, right = split(data, i, j)
            
            oe = total_entropy(left, right)
            if oe <= te :
                b_column = i
                b_value = j
                te = oe
    return b_column, b_value

In [12]:
def build_tree (data, depth , min_samples, max_depth) :
    if (isPure(data)) or (len(data) < min_samples) or (depth == max_depth) :
        return predict(data)
    
    else :
        depth += 1
        column, value = best_split(data)
        left, right = split(data, column, value)
        
        question = "{} <= {}".format(column, value)
        
        sub_tree = {question : []}
        
        yes = build_tree(left, depth, min_samples, max_depth)
        no = build_tree(right, depth, min_samples, max_depth)
        
        if yes == no :
            sub_tree = yes
        else :
            sub_tree[question].append(yes)
            sub_tree[question].append(no)
            
        return sub_tree

In [13]:
tree = build_tree(train_data, 0, 5, 10)
tree

{'0 <= 0.2952': [{'1 <= 5.8333': [{'2 <= 6.7807': [1.0,
      {'1 <= -5.1877': [1.0, 0.0]}]},
    {'0 <= -4.2249': [{'3 <= -2.9155': [1.0, 0.0]}, 0.0]}]},
  {'0 <= 2.2279': [{'2 <= -2.2126': [{'1 <= 5.2022': [1.0, 0.0]},
      {'0 <= 0.74428': [{'3 <= -0.18967': [0.0, {'2 <= 1.6131': [1.0, 0.0]}]},
        0.0]}]},
    0.0]}]}

In [14]:
def classify (test, tree) :
    question = list(tree.keys())[0]
    column, operator, value = question.split(" ")
    
    if test[int(column)] <= float(value) :
        answer = tree[question][0]
    else :
        answer = tree[question][1]
    
    
    if not isinstance(answer, dict) :
        return answer
    else :
        return classify(test, answer)


In [15]:
correct = 0
for i in range(len(test)) :
    prediction = classify(test.iloc[i], tree)
    if float(prediction) == test.iloc[i, -1] :
        correct += 1
print(correct / len(test) * 100)

97.81341107871721
