##Importing the data

In [1]:
#for manipulating the csv data
import pandas as pd
#for mathematical calculation 
import numpy as np

In [2]:
data = pd.read_csv("play_tennis.csv")

In [3]:
data.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [4]:
newData = data[['outlook', 'temp', 'humidity', 'wind', 'play']]
newData.head()

Unnamed: 0,outlook,temp,humidity,wind,play
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes


##Code for calculating total entropy of the samples


In [5]:
def calculateTotalEntropy(train_data, label, class_list):
    totalRow = train_data.shape[0]
    entropy = 0
    # for every class calculate the entropt with sum(-plog(p))
    for c in class_list:
        totalClassCount = train_data[train_data[label] == c].shape[0]
        totalClassEntropy = - (totalClassCount/totalRow)*np.log2(totalClassCount/totalRow) 
        entropy += totalClassEntropy
    
    return entropy


##Code for calculating the entropy of System


In [6]:

def calculateEntropy(feature_value_data, label, class_list):
    classCount = feature_value_data.shape[0]
    entropy = 0
    
    for c in class_list:
        labelClassCount = feature_value_data[feature_value_data[label] == c].shape[0]
    
        entropyClass = 0
        # if count is not 0 the calculate entropy else it is 0
        if labelClassCount != 0:
            probability_class = labelClassCount/classCount
            entropyClass = - probability_class * np.log2(probability_class) 
        
        entropy += entropyClass
        
    return entropy





##Calculating the information gain i.e totalEntropy - entropyOfClasses

In [7]:

def calculateInformationGain(feature_name, train_data, label, class_list):
    feature_value_list = train_data[feature_name].unique()
    totalRow = train_data.shape[0]
    feature_info = 0.0
    
    for feature_value in feature_value_list:
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        feature_value_count = feature_value_data.shape[0]
        feature_value_entropy = calculateEntropy(feature_value_data, label, class_list)
        feature_value_probability = feature_value_count/totalRow
        feature_info += feature_value_probability * feature_value_entropy
        
    return calculateTotalEntropy(train_data, label, class_list) - feature_info





##Code for finding the most informative feature

In [8]:

def findMostInformativeFeature(train_data, label, class_list):
    feature_list = train_data.columns.drop(label)
    maxInformationGain = -1
    maxInformationFeature = None
    
    for feature in feature_list:  
        feature_info_gain = calculateInformationGain(feature, train_data, label, class_list)
        if maxInformationGain < feature_info_gain:
            maxInformationGain = feature_info_gain
            maxInformationFeature = feature
            
    return maxInformationFeature







##Code for generating the subtree of a node after finding the maximum information gain feature

In [9]:

def generateSubTree(feature_name, train_data, label, class_list):
    feature_value_count_dict = train_data[feature_name].value_counts(sort=False)
    tree = {}
    
    for feature_value, count in feature_value_count_dict.iteritems():
        feature_value_data = train_data[train_data[feature_name] == feature_value]
        
        assigned_to_node = False
        for c in class_list:
            classCount = feature_value_data[feature_value_data[label] == c].shape[0]

            if classCount == count:
                tree[feature_value] = c
                train_data = train_data[train_data[feature_name] != feature_value]
                assigned_to_node = True
        if not assigned_to_node:
            tree[feature_value] = "?"
            
    return tree, train_data





##Code for actually making the decision tree
###ID3

In [10]:


def makeTree(root, prev_feature_value, train_data, label, class_list):
    if train_data.shape[0] != 0:
        maxInformationFeature = findMostInformativeFeature(train_data, label, class_list)
        tree, train_data = generateSubTree(maxInformationFeature, train_data, label, class_list)
        next_root = None
        
        if prev_feature_value != None:
            root[prev_feature_value] = dict()
            root[prev_feature_value][maxInformationFeature] = tree
            next_root = root[prev_feature_value][maxInformationFeature]
        else:
            root[maxInformationFeature] = tree
            next_root = root[maxInformationFeature]
        
        for node, branch in list(next_root.items()):
            if branch == "?":
                feature_value_data = train_data[train_data[maxInformationFeature] == node]
                makeTree(next_root, node, feature_value_data, label, class_list)





##Wrapper over makeTree function

In [11]:

def ID3(trainData, label):
    train_data = trainData.copy()
    tree = {}
    class_list = train_data[label].unique()
    makeTree(tree, None, trainData, label, class_list)
    
    return tree





##Code for prediction

In [12]:
def predict(tree, instance):
    if not isinstance(tree, dict):
        return tree
    else:
        root_node = next(iter(tree))
        feature_value = instance[root_node]
        if feature_value in tree[root_node]:
            return predict(tree[root_node][feature_value], instance)
        else:
            return None



##Code for evaluation

In [13]:

def evaluate(tree, test_data_m, label):
    correct_preditct = 0
    wrong_preditct = 0
    for index, row in test_data_m.iterrows():
        result = predict(tree, row)
        if result == row[label]:
            correct_preditct += 1
        else:
            wrong_preditct += 1
    accuracy = correct_preditct / (correct_preditct + wrong_preditct)
    return accuracy




##Test-Train split

In [14]:
from sklearn.model_selection import train_test_split
trainData , testData = train_test_split(newData,test_size = 0.3)

In [15]:
trainData

Unnamed: 0,outlook,temp,humidity,wind,play
8,Sunny,Cool,Normal,Weak,Yes
0,Sunny,Hot,High,Weak,No
4,Rain,Cool,Normal,Weak,Yes
11,Overcast,Mild,High,Strong,Yes
3,Rain,Mild,High,Weak,Yes
1,Sunny,Hot,High,Strong,No
12,Overcast,Hot,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes
13,Rain,Mild,High,Strong,No


In [16]:
testData

Unnamed: 0,outlook,temp,humidity,wind,play
10,Sunny,Mild,Normal,Strong,Yes
5,Rain,Cool,Normal,Strong,No
7,Sunny,Mild,High,Weak,No
2,Overcast,Hot,High,Weak,Yes
6,Overcast,Cool,Normal,Strong,Yes


##Making the classifier
###parameter1 : input Data -> testing Data
###parameter2 : label -> output class Name


In [17]:
tree = ID3(trainData, 'play')
tree

{'humidity': {'High': {'outlook': {'Overcast': 'Yes',
    'Rain': {'wind': {'Strong': 'No', 'Weak': 'Yes'}},
    'Sunny': 'No'}},
  'Normal': 'Yes'}}

In [18]:
accuracy = evaluate(tree , testData , 'play')
print("Accuracy of Decision Tree : " , accuracy)

Accuracy of Decision Tree :  0.8


In [19]:
testData.iloc[1]['play']

'No'

In [20]:
predict(tree , testData.iloc[1])

'Yes'

##Now training on day (earlier we excluded day from training data)

In [21]:
trainDataDay , testDataDay = train_test_split(data,test_size = 0.3)

In [22]:
trainDataDay

Unnamed: 0,day,outlook,temp,humidity,wind,play
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
1,D2,Sunny,Hot,High,Strong,No
13,D14,Rain,Mild,High,Strong,No
8,D9,Sunny,Cool,Normal,Weak,Yes
7,D8,Sunny,Mild,High,Weak,No
6,D7,Overcast,Cool,Normal,Strong,Yes
10,D11,Sunny,Mild,Normal,Strong,Yes
0,D1,Sunny,Hot,High,Weak,No


In [23]:
testDataDay

Unnamed: 0,day,outlook,temp,humidity,wind,play
9,D10,Rain,Mild,Normal,Weak,Yes
12,D13,Overcast,Hot,Normal,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes
5,D6,Rain,Cool,Normal,Strong,No
11,D12,Overcast,Mild,High,Strong,Yes


In [24]:
treeDay = ID3(trainDataDay, 'play')
treeDay

{'day': {'D1': 'No',
  'D11': 'Yes',
  'D14': 'No',
  'D2': 'No',
  'D3': 'Yes',
  'D4': 'Yes',
  'D7': 'Yes',
  'D8': 'No',
  'D9': 'Yes'}}

##Now as you can see the all the features are classified only using day
and the future prediction ability decreases as can be seen by the accuracy of this model

In [25]:
accuracyDay = evaluate(treeDay , testDataDay , 'play')
print("Accuracy of Decision Tree : " , accuracyDay)

Accuracy of Decision Tree :  0.0
