In [43]:
import numpy as np
import pandas as pd

In [44]:
# Load the dataset
dataset = pd.read_csv('tree_Data.csv')

In [45]:
# printing attributes of the dataset
dataset.keys()

Index(['Days', 'Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Tennis'], dtype='object')

In [46]:
# Remove the first column
dataset =dataset[['Outlook', 'Temperature', 'Humidity', 'Wind', 'Play Tennis']]
dataset

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [47]:
dataset.shape[0]

14

In [48]:
# Calculate the entropy of the dataset
def entropy(df,dataset):
    count_yes =df[df['Play Tennis'] =='Yes']['Play Tennis'].count()
    count_no =df[df['Play Tennis'] =='No']['Play Tennis'].count()
    total = count_yes + count_no
    proportion_yes, proportion_no = count_yes/total, count_no/total
    if proportion_yes == 0 or proportion_no == 0:
        return 0
    entropy =  -proportion_yes*np.log2(proportion_yes) -proportion_no*np.log2(proportion_no)
    weighted_entropy = total/dataset.shape[0] * entropy
    return weighted_entropy

In [49]:
# Calculate the total entropy of attribute of the dataset
def entropy_attribute(df, attribute,dataset):
    attribute_values = df[attribute].unique()
    total_entropy = 0
    for i in attribute_values:
        new_df = df[df[attribute] == i]
        total_entropy += entropy(new_df,dataset)
    return total_entropy

In [50]:
# Calculate the information gain of  attribute of the dataset
def information_gain(df,attribute,dataset):
    return entropy(dataset,dataset) - entropy_attribute(dataset, attribute,dataset)

In [51]:
# Dividing the dataset into  small dataset containing 1  other attribute and the target attribute
def dataset_column(df,column):
    return df[[column, 'Play Tennis']]

In [52]:
# creating a list of all keys of the dataset except the last one
dataset_keys = dataset.keys()
dataset_keys = dataset_keys[:-1]
dataset_keys

Index(['Outlook', 'Temperature', 'Humidity', 'Wind'], dtype='object')

In [53]:
# Displaying the result
print("Entropy for the dataset is ", entropy(dataset,dataset))
IG = {}
for i in dataset_keys:
    print(i)
    print('-----------------------------------------------------------')
    print("Entropy of",i," is ", entropy_attribute(dataset_column(dataset, i), i,dataset))
    val = information_gain(dataset, i,dataset)
    IG[i] = val
    print("Information gain of attribute",i," is ", val)
    print('')
    


Entropy for the dataset is  0.9402859586706311
Outlook
-----------------------------------------------------------
Entropy of Outlook  is  0.6935361388961918
Information gain of attribute Outlook  is  0.24674981977443933

Temperature
-----------------------------------------------------------
Entropy of Temperature  is  0.9110633930116763
Information gain of attribute Temperature  is  0.02922256565895487

Humidity
-----------------------------------------------------------
Entropy of Humidity  is  0.7884504573082896
Information gain of attribute Humidity  is  0.15183550136234159

Wind
-----------------------------------------------------------
Entropy of Wind  is  0.8921589282623617
Information gain of attribute Wind  is  0.04812703040826949



In [54]:
# finding  attributes at level1 of the tree
level1_nodes=dataset[max(IG, key=IG.get)].unique()
level1_nodes

array(['Sunny', 'Overcast', 'Rain'], dtype=object)

In [55]:
# Splitting the dataset into 3 small datasets
dataset1 = dataset[dataset[max(IG, key=IG.get)] == level1_nodes[0]]
dataset2 = dataset[dataset[max(IG, key=IG.get)] == level1_nodes[1]]
dataset3 = dataset[dataset[max(IG, key=IG.get)] == level1_nodes[2]]

In [56]:
# Finding the attribute with maximum information gain for each of the 3 small datasets
max(IG, key=IG.get)

'Outlook'

In [57]:
# subdataset1
dataset1

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes


In [58]:
# creating a list of all keys of the dataset except the last one
dataset_keys = dataset1.keys()
dataset_keys = dataset_keys[1:-1]
dataset_keys

Index(['Temperature', 'Humidity', 'Wind'], dtype='object')

In [59]:
# Displaying the result
print("Entropy for the dataset is ", entropy(dataset1,dataset1))
IG = {}
for i in dataset_keys:
    print(i)
    print('-----------------------------------------------------------')
    print("Entropy of",i," is ", entropy_attribute(dataset_column(dataset1, i), i,dataset1))
    val = information_gain(dataset1, i,dataset1)
    IG[i] = val
    print("Information gain of attribute",i," is ", val)
    print('')

Entropy for the dataset is  0.9709505944546686
Temperature
-----------------------------------------------------------
Entropy of Temperature  is  0.4
Information gain of attribute Temperature  is  0.5709505944546686

Humidity
-----------------------------------------------------------
Entropy of Humidity  is  0
Information gain of attribute Humidity  is  0.9709505944546686

Wind
-----------------------------------------------------------
Entropy of Wind  is  0.9509775004326937
Information gain of attribute Wind  is  0.01997309402197489



In [60]:
# Splitting the sub dataset1 into 2 small datasets
dataset1_1 = dataset1[dataset1[max(IG, key=IG.get)] == dataset1[max(IG, key=IG.get)].unique()[0]]
dataset1_2 = dataset1[dataset1[max(IG, key=IG.get)] == dataset1[max(IG, key=IG.get)].unique()[1]]

In [61]:
# displaying subsubdataset1
dataset1_1

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
7,Sunny,Mild,High,Weak,No


In [62]:
# displaying subsubdataset2
dataset1_2

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
8,Sunny,Cool,Normal,Weak,Yes
10,Sunny,Mild,Normal,Strong,Yes


In [63]:
# Entropy of subsubdataset1 of subdataset1
entropy(dataset1_1,dataset1_1)

0

In [64]:
# Entropy of subsubdataset2 of subdataset1
entropy(dataset1_2,dataset1_2)

0

In [65]:
# subdataset2
dataset2


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
2,Overcast,Hot,High,Weak,Yes
6,Overcast,Cool,Normal,Strong,Yes
11,Overcast,Mild,High,Strong,Yes
12,Overcast,Hot,Normal,Weak,Yes


In [66]:
# creating a list of all keys of the dataset except the last one
dataset_keys = dataset2.keys()
dataset_keys = dataset_keys[1:-1]
dataset_keys

Index(['Temperature', 'Humidity', 'Wind'], dtype='object')

In [67]:
# Displaying the result
# Entropy is coming out to be 0 for all the attributes of the dataset2.
# Hence, we can conclude that the dataset2 is a pure dataset.
# This is leaf node of the tree.
print("Entropy for the dataset is ", entropy(dataset2,dataset2))
IG = {}
for i in dataset_keys:
    print(i)
    print('-----------------------------------------------------------')
    print("Entropy of",i," is ", entropy_attribute(dataset_column(dataset2, i), i,dataset2))
    val = information_gain(dataset2, i,dataset2)
    IG[i] = val
    print("Information gain of attribute",i," is ", val)
    print('')

Entropy for the dataset is  0
Temperature
-----------------------------------------------------------
Entropy of Temperature  is  0
Information gain of attribute Temperature  is  0

Humidity
-----------------------------------------------------------
Entropy of Humidity  is  0
Information gain of attribute Humidity  is  0

Wind
-----------------------------------------------------------
Entropy of Wind  is  0
Information gain of attribute Wind  is  0



In [68]:
# subdataset3
dataset3

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
9,Rain,Mild,Normal,Weak,Yes
13,Rain,Mild,High,Strong,No


In [69]:
# creating a list of all keys of the dataset except the last one
dataset_keys = dataset3.keys()
dataset_keys = dataset_keys[1:-1]
dataset_keys

Index(['Temperature', 'Humidity', 'Wind'], dtype='object')

In [70]:
# Displaying the result
print("Entropy for the dataset is ", entropy(dataset3,dataset3))
IG = {}
for i in dataset_keys:
    print(i)
    print('-----------------------------------------------------------')
    print("Entropy of",i," is ", entropy_attribute(dataset_column(dataset3, i), i,dataset3))
    val = information_gain(dataset3, i,dataset3)
    IG[i] = val
    print("Information gain of attribute",i," is ", val)
    print('')

Entropy for the dataset is  0.9709505944546686
Temperature
-----------------------------------------------------------
Entropy of Temperature  is  0.9509775004326937
Information gain of attribute Temperature  is  0.01997309402197489

Humidity
-----------------------------------------------------------
Entropy of Humidity  is  0.9509775004326937
Information gain of attribute Humidity  is  0.01997309402197489

Wind
-----------------------------------------------------------
Entropy of Wind  is  0
Information gain of attribute Wind  is  0.9709505944546686



In [71]:
# Splitting the sub dataset3 into 2 small datasets
dataset3_1 = dataset3[dataset3[max(IG, key=IG.get)] == dataset3[max(IG, key=IG.get)].unique()[0]]
dataset3_2 = dataset3[dataset3[max(IG, key=IG.get)] == dataset3[max(IG, key=IG.get)].unique()[1]]

In [72]:
# displaying subsubdataset1 of subdataset3
dataset3_1

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [73]:
# entropy of subsubdataset1 of subdataset3
entropy(dataset3_1,dataset3_1)

0

In [74]:
# entropy of subsubdataset2 of subdataset3
entropy(dataset3_2,dataset3_2)

0

In [75]:
dataset

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [76]:
# Splitting the dataset into training and testing datasets
split_idx = int(0.8*dataset.shape[0])
train_df = dataset[:split_idx]
test_df = dataset[split_idx:]
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)

In [77]:
# Training the decision tree
cnt = 0
decision_tree = []
def training_data(df,key=None):
    dataset_keys = df.keys()
    dataset_keys = dataset_keys[:-1]
    if key in df.keys():
        dataset_keys = dataset_keys.drop(key)
    IG = {}
    ENTROPY = entropy(df,df)
    if ENTROPY == 0:
        global cnt
        global dict1
        print("Class",cnt)
        print(df)
        cnt+=1
        return 
    for i in dataset_keys:
        val = information_gain(df, i,df)
        IG[i] = val
    max_IG = max(IG, key=IG.get) # Get the key with the maximum value
    nextlevel_nodes = df[max_IG].unique()
    if len(nextlevel_nodes) == 1:
        return df['Play Tennis'].unique()[0]
    else:
        for i in nextlevel_nodes:
            new_df = df[df[max_IG] == i]
            global decision_tree
            if max_IG not in decision_tree:
                decision_tree.append(max_IG)
            for j in df[max_IG].unique():
                if j not in decision_tree:
                    decision_tree.append(j)
                    break
            training_data(new_df,max_IG)


In [78]:
decision_tree = []
training_data(train_df,None)

Class 0
  Outlook Temperature Humidity    Wind Play Tennis
0   Sunny         Hot     High    Weak          No
1   Sunny         Hot     High  Strong          No
7   Sunny        Mild     High    Weak          No
Class 1
   Outlook Temperature Humidity    Wind Play Tennis
8    Sunny        Cool   Normal    Weak         Yes
10   Sunny        Mild   Normal  Strong         Yes
Class 2
    Outlook Temperature Humidity    Wind Play Tennis
2  Overcast         Hot     High    Weak         Yes
6  Overcast        Cool   Normal  Strong         Yes
Class 3
  Outlook Temperature Humidity  Wind Play Tennis
3    Rain        Mild     High  Weak         Yes
4    Rain        Cool   Normal  Weak         Yes
9    Rain        Mild   Normal  Weak         Yes
Class 4
  Outlook Temperature Humidity    Wind Play Tennis
5    Rain        Cool   Normal  Strong          No


In [79]:
print(decision_tree)

['Outlook', 'Sunny', 'Humidity', 'High', 'Normal', 'Overcast', 'Rain', 'Wind', 'Weak', 'Strong']


In [80]:
# Removing the last attribute value of test dataset
test_df['Play Tennis'] = np.nan
test_df
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Play Tennis'] = np.nan


Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
11,Overcast,Mild,High,Strong,
12,Overcast,Hot,Normal,Weak,
13,Rain,Mild,High,Strong,


In [81]:
# Testing the decision tree
def testing_data(df,decision_tree):
    for index, row in df.iterrows():
        if row[decision_tree[0]] == decision_tree[1]:
            if row[decision_tree[2]] == decision_tree[3]:
                test_df.at[index, 'Play Tennis'] = 'No'
            elif row[decision_tree[2]] == decision_tree[4]:
                test_df.at[index, 'Play Tennis'] = 'Yes'
        if row[decision_tree[0]] == decision_tree[5]:
            test_df.at[index, 'Play Tennis'] = 'Yes'
        if row[decision_tree[0]] == decision_tree[6]:
            if row[decision_tree[7]] == decision_tree[8]:
                test_df.at[index, 'Play Tennis'] = 'Yes'
            elif row[decision_tree[7]] == decision_tree[9]:
                test_df.at[index, 'Play Tennis'] = 'No'



In [82]:
testing_data(test_df,decision_tree)
test_df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play Tennis
11,Overcast,Mild,High,Strong,Yes
12,Overcast,Hot,Normal,Weak,Yes
13,Rain,Mild,High,Strong,No


In [83]:
# Calculating the accuracy of the decision tree
def accuracy(df,test_df):
    count = 0
    for index, row in df.iterrows():
        if row['Play Tennis'] == test_df['Play Tennis'][index]:
            count+=1
    return count/df.shape[0]

In [84]:
Accuracy = accuracy(test_df,dataset)
print("Accuracy of the model is ", Accuracy*100,"%")

Accuracy of the model is  100.0 %
