In [3]:
import pandas as pd

In [4]:
dataSet = {
    "Outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast", "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain"], 
    "Temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool", "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
    "Humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal", "High", "Normal", "Normal", "Normal", "High", "Normal", "High"], 
    "Wind": ["Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong"], 
    "PlayTennis": ["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]
}

data = pd.DataFrame(dataSet)

data

Unnamed: 0,Outlook,Temperature,Humidity,Wind,PlayTennis
0,Sunny,Hot,High,Weak,No
1,Sunny,Hot,High,Strong,No
2,Overcast,Hot,High,Weak,Yes
3,Rain,Mild,High,Weak,Yes
4,Rain,Cool,Normal,Weak,Yes
5,Rain,Cool,Normal,Strong,No
6,Overcast,Cool,Normal,Strong,Yes
7,Sunny,Mild,High,Weak,No
8,Sunny,Cool,Normal,Weak,Yes
9,Rain,Mild,Normal,Weak,Yes


In [12]:
from sklearn import preprocessing

# Label encoding
outlook, temperature, humidity, wind, play = [], [], [], [], []
# Label encoder can be used to transform non-numerical labels 
# (as long as they are hashable and comparable) to numerical labels.
label_encoder = preprocessing.LabelEncoder()

# Encoding outlook
cur=[]
for index, row in data.iterrows():
  cur.append(row["Outlook"])
  if(index == 13):
    outlook = cur
outlook_encoded = label_encoder.fit_transform(outlook)

# Encoding Temperature
cur=[]
for index, row in data.iterrows():
  cur.append(row["Temperature"])
  if(index == 13):
    temperature = cur
temperature_encoded = label_encoder.fit_transform(temperature)

# Encoding Humidity
cur=[]
for index, row in data.iterrows():
  cur.append(row["Humidity"])
  if(index == 13):
    humidity = cur
humidity_encoded = label_encoder.fit_transform(humidity)

# Encoding Wind
cur=[]
for index, row in data.iterrows():
  cur.append(row["Wind"])
  if(index == 13):
    wind = cur
wind_encoded = label_encoder.fit_transform(wind)

# Encoding PlayTennis
cur=[]
for index, row in data.iterrows():
  cur.append(row["PlayTennis"])
  if(index == 13):
    play = cur
play_encoded = label_encoder.fit_transform(play)

print("Outlook:", outlook_encoded)
print("Temperature: ", temperature_encoded)
print("Humidity: ", humidity_encoded)
print("Wind: ", wind_encoded)
print("Play: ", play_encoded)

0 Outlook        Sunny
Temperature      Hot
Humidity        High
Wind            Weak
PlayTennis        No
Name: 0, dtype: object
1 Outlook         Sunny
Temperature       Hot
Humidity         High
Wind           Strong
PlayTennis         No
Name: 1, dtype: object
2 Outlook        Overcast
Temperature         Hot
Humidity           High
Wind               Weak
PlayTennis          Yes
Name: 2, dtype: object
3 Outlook        Rain
Temperature    Mild
Humidity       High
Wind           Weak
PlayTennis      Yes
Name: 3, dtype: object
4 Outlook          Rain
Temperature      Cool
Humidity       Normal
Wind             Weak
PlayTennis        Yes
Name: 4, dtype: object
5 Outlook          Rain
Temperature      Cool
Humidity       Normal
Wind           Strong
PlayTennis         No
Name: 5, dtype: object
6 Outlook        Overcast
Temperature        Cool
Humidity         Normal
Wind             Strong
PlayTennis          Yes
Name: 6, dtype: object
7 Outlook        Sunny
Temperature     Mild
Humidi

In [13]:
# zip() function returns a zip object, which is an iterator of tuples on which the play 
# column depends
features = list(zip(outlook_encoded, temperature_encoded, humidity_encoded, wind_encoded))

features

[(2, 1, 0, 1),
 (2, 1, 0, 0),
 (0, 1, 0, 1),
 (1, 2, 0, 1),
 (1, 0, 1, 1),
 (1, 0, 1, 0),
 (0, 0, 1, 0),
 (2, 2, 0, 1),
 (2, 0, 1, 1),
 (1, 2, 1, 1),
 (2, 2, 1, 0),
 (0, 2, 0, 0),
 (0, 1, 1, 1),
 (1, 2, 0, 0)]

In [14]:
# Gaussian Naïve Bayes classifier assumes that the data from each label is drawn 
# from a simple Gaussian distribution. The Scikit-learn provides sklearn.naive_bayes.GaussianNB 
# to implement the Gaussian Naïve Bayes algorithm for classification.
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

# Fit Gaussian Naive Bayes according to X, y.
model.fit(features,play_encoded)

# From the encoded data we can decifer the following:
# Outlook:     Sunny-2, Overcast-0, Rain-1
# Temperature: Hot-1, Mild-2, Cool-0
# Humidity:    High-0, Normal-1
# Wind:        Weak-1, Strong-0
# Play:        No-0, Yes-1
predicted_1 = model.predict([[2, 0, 0, 1]]) # Sunny, Cool, High, Weak
predicted_2 = model.predict([[1, 1, 0, 1]]) # Rain, Hot, High, Weak

print(predicted_1)
print(predicted_2)

[0]
[1]


In [15]:
# Total number of entries in the model
tot = 14

# This function will count the number of times
# val appears in the list arr
def count1(arr, val):
  res = 0
  for x in arr:
    res += x==val
  return res

def count2(arr1, val1, arr2, val2):
  res = 0
  for i in range(0, tot):
    res += (arr1[i] == val1 and arr2[i]==val2)
  return res

In [16]:
# Number of times we can play as per the model
play_times = count1(play_encoded, 1)
# Number of times we can't play 
no_play_times = tot - play_times

# P(Play=Yes)
play_prob = play_times/tot
# P(Play=No)
no_play_prob = no_play_times/tot

In [17]:
# Learning Phase

possible_outcomes = ["Yes", "No"]

outlook_items = ["Sunny", "Overcast", "Rain"]
outlook_learning = {}

for outlook_item in outlook_items:
  for outcome in possible_outcomes:
    # This will count the number of times we have a particular outlook for a 
    # specific outcome of Yes/No
    num = count2(data["Outlook"], outlook_item, data["PlayTennis"], outcome)
    # Count the number of times outcome appears in the model to 
    # find the conditional probability based on outcome
    dem = count1(data["PlayTennis"], outcome)
    # Store this in our learning model for outlook
    outlook_learning[(outlook_item, outcome)] = num/dem

temperature_items = ["Hot", "Mild", "Cool"]
temperature_learning = {}

for temperature_item in temperature_items:
  for outcome in possible_outcomes:
    # This will count the number of times we have a particular temperature for a 
    # specific outcome of Yes/No
    num = count2(data["Temperature"], temperature_item, data["PlayTennis"], outcome)
    # Count the number of times outcome appears in the model to 
    # find the conditional probability based on outcome
    dem = count1(data["PlayTennis"], outcome)
    # Store this in our learning model for temperature
    temperature_learning[(temperature_item, outcome)] = num/dem

humidity_items = ["High", "Normal"]
humidity_learning = {}

for humidity_item in humidity_items:
  for outcome in possible_outcomes:
    # This will count the number of times we have a particular humidity for a 
    # specific outcome of Yes/No
    num = count2(data["Humidity"], humidity_item, data["PlayTennis"], outcome)
    # Count the number of times outcome appears in the model to 
    # find the conditional probability based on outcome
    dem = count1(data["PlayTennis"], outcome)
    # Store this in our learning model for humidity
    humidity_learning[(humidity_item, outcome)] = num/dem

wind_items = ["Strong", "Weak"]
wind_learning = {}

for wind_item in wind_items:
  for outcome in possible_outcomes:
    # This will count the number of times we have a particular wind type for a 
    # specific outcome of Yes/No
    num = count2(data["Wind"], wind_item, data["PlayTennis"], outcome)
    # Count the number of times outcome appears in the model to 
    # find the conditional probability based on outcome
    dem = count1(data["PlayTennis"], outcome)
    # Store this in our learning model for humidity
    wind_learning[(wind_item, outcome)] = num/dem

print("Outlook Learning:\n", outlook_learning)
print("Temperature Learning:\n", temperature_learning)
print("Humidity Learning:\n", humidity_learning)
print("Wind Learning:\n", wind_learning)


Outlook Learning:
 {('Sunny', 'Yes'): 0.2222222222222222, ('Sunny', 'No'): 0.6, ('Overcast', 'Yes'): 0.4444444444444444, ('Overcast', 'No'): 0.0, ('Rain', 'Yes'): 0.3333333333333333, ('Rain', 'No'): 0.4}
Temperature Learning:
 {('Hot', 'Yes'): 0.2222222222222222, ('Hot', 'No'): 0.4, ('Mild', 'Yes'): 0.4444444444444444, ('Mild', 'No'): 0.4, ('Cool', 'Yes'): 0.3333333333333333, ('Cool', 'No'): 0.2}
Humidity Learning:
 {('High', 'Yes'): 0.3333333333333333, ('High', 'No'): 0.8, ('Normal', 'Yes'): 0.6666666666666666, ('Normal', 'No'): 0.2}
Wind Learning:
 {('Strong', 'Yes'): 0.3333333333333333, ('Strong', 'No'): 0.6, ('Weak', 'Yes'): 0.6666666666666666, ('Weak', 'No'): 0.4}


In [18]:
# Sunny, Cool, High, Weak

# [P(Sunny|Yes)P(Cool|Yes)P(High|Yes)P(Weak|Yes)]P(Play=Yes) 
predicted_1 = outlook_learning[("Sunny", "Yes")] * temperature_learning[("Cool", "Yes")] * humidity_learning[("High", "Yes")] * wind_learning[("Weak", "Yes")] * play_prob
# [P(Sunny|No)P(Cool|No)P(High|No)P(Weak|No)]P(Play=No) 
predicted_2 = outlook_learning[("Sunny", "No")] * temperature_learning[("Cool", "No")] * humidity_learning[("High", "No")] * wind_learning[("Weak", "No")] * no_play_prob

print("P(Yes|x'):", predicted_1)
print("P(No|x'): ", predicted_2)
if predicted_2 > predicted_1:
  print("Since P(No|x') > P(Yes|x'), resultant is No: ", [0])
else:
  print("Since P(Yes|x') > P(No|x'), resultant is Yes: ", [1])
print()

# Rain, Hot, High, Weak 

# [P(Rain|Yes)P(Hot|Yes)P(High|Yes)P(Weak|Yes)]P(Play=Yes)
predicted_1 = outlook_learning[("Rain", "Yes")] * temperature_learning[("Hot", "Yes")] * humidity_learning[("High", "Yes")] * wind_learning[("Weak", "Yes")] * play_prob
# [P(Rain|No)P(Hot|No)P(High|No)P(Weak|No)]P(Play=No)
predicted_2 = outlook_learning[("Rain", "No")] * temperature_learning[("Hot", "No")] * humidity_learning[("High", "No")] * wind_learning[("Weak", "No")] * play_prob

print("P(Yes|x'):", predicted_1)
print("P(No|x'): ", predicted_2)
if predicted_2 > predicted_1:
  print("Since P(No|x') > P(Yes|x'), resultant is No: ", [0])
else:
  print("Since P(Yes|x') > P(No|x'), resultant is Yes: ", [1])

P(Yes|x'): 0.010582010582010581
P(No|x'):  0.013714285714285715
Since P(No|x') > P(Yes|x'), resultant is No:  [0]

P(Yes|x'): 0.010582010582010581
P(No|x'):  0.03291428571428573
Since P(No|x') > P(Yes|x'), resultant is No:  [0]
