In [97]:
import numpy as np
import pandas as pd
from collections import Counter
import math

In [19]:
df = pd.read_csv('golf_weather.txt')
df.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,overcast,hot,high,False,yes
1,overcast,cool,normal,True,yes
2,overcast,mild,high,True,yes
3,overcast,hot,normal,False,yes
4,rainy,mild,high,False,yes


In [20]:
header = list(df.columns)
header

['Outlook', 'Temperature', 'Humidity', 'Windy', 'Play']

In [45]:
def unique_vals(data, column):
    x = data.loc[:,column]
    return list(set(x))

In [52]:
def class_counts(data):
    x = df.iloc[:,-1]
    return dict(Counter(x))

In [54]:
def is_numeric(value):
    return isinstance(value, int) or isinstance(value, float)

In [63]:
class Question:
    
    def __init__(self,column,value):
        self.column = column
        self.value = value
    
    
    def match(self,example):
        val = example[self.column]
        if is_numeric(val):
            return val>=self.value
        else:
            return val==self.value
    
    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            header[self.column], condition, str(self.value))
    

In [85]:
def partition(data, question):
    true_rows, false_rows = [], []
    for i in range(len(data)):
        row = df.loc[i,:]
        if question.match(row):
            true_rows.append(row)
        else:
            false_rows.append(row)
    return true_rows, false_rows

In [98]:
def entropy(data):
    log2 = lambda x:math.log(x)/math.log(2)
    results = class_counts(data)
    entropy_value = 0.0
    for r in results.keys():
        p = float(results[r])/len(data)
        entropy_value = entropy_value - p * log2(p)
    return entropy_value

In [99]:
entropy(df)

0.9402859586706309

In [101]:
def info_gain(left, right, current_uncertainty):
    p = float(len(left))/(len(left)+len(right)) 
    return current_uncertainty-p*entropy(left)-(1-p)*entropy(right)

In [124]:
def find_best_split(rows):
    best_gain = 0  
    best_question = None  
    current_uncertainty = entropy(rows)
    n_features = len(df.columns) - 1 

    for col in range(n_features): 
        
        values = set([row[col] for row in rows])  

        for val in values:  

            question = Question(col, val)

            true_rows, false_rows = partition(rows, question)

            if len(true_rows) == 0 or len(false_rows) == 0:
                continue
            gain = info_gain(true_rows, false_rows, current_uncertainty)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

In [113]:
best_gain, best_question = find_best_split(df)
best_question

Is Temperature == cool?

In [115]:
class Leaf:

    def __init__(self, rows):
        self.predictions = class_counts(rows)
             

In [117]:
class Decision_Node:

    def __init__(self,question,true_branch,false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

In [118]:
def build_tree(rows):

    gain, question = find_best_split(rows)

    if gain == 0:
        return Leaf(rows)

    true_rows, false_rows = partition(rows, question)
    true_branch = build_tree(true_rows)
    false_branch = build_tree(false_rows)

    return Decision_Node(question, true_branch, false_branch)

In [132]:
def print_tree(node, spacing=""):
 
    if isinstance(node, Leaf):
        print (spacing + "Predict", node.predictions)
        return
    print (spacing + str(node.question))
    print (spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")
    print (spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

In [120]:
def classify(row, node):

    if isinstance(node, Leaf):
        return node.predictions

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)

In [121]:
def print_leaf(counts):
    total = sum(counts.values()) * 1.0
    probs = {}
    for lbl in counts.keys():
        probs[lbl] = str(int(counts[lbl] / total * 100)) + "%"
    return probs

In [133]:
my_tree = build_tree(df)
print_tree(my_tree)

Predict {'yes': 9, 'no': 5}


In [127]:
testing_data = pd.read_csv('golf_weather_train.txt')
testing_data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,cool,normal,False,yes
1,sunny,mild,normal,True,yes
2,rainy,mild,high,True,no
3,sunny,hot,high,False,no


In [129]:
for row in range(len(testing_data)):
    print ("Actual: %s. Predicted: %s" %
           (testing_data.loc[row,'Play'], print_leaf(classify(testing_data.loc[row,:], my_tree))))

Actual: yes. Predicted: {'yes': '64%', 'no': '35%'}
Actual: yes. Predicted: {'yes': '64%', 'no': '35%'}
Actual: no. Predicted: {'yes': '64%', 'no': '35%'}
Actual: no. Predicted: {'yes': '64%', 'no': '35%'}
