## Module 9 - Bayes

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import random
from sklearn.preprocessing import LabelEncoder

<b>The Monty Hall Problem</b><br>
You are a contestant on <i>Let's Make a Deal</i>. The host (Monty) shows you three doors: behind two of them are goats, and behind one is a new car (all randomly placed). If you pick the door with the new car, you get to keep it. You select a door, then Monty opens one of the remaining two doors to reveal a goat. He then gives you the option of switching your selection to the other closed door. How often will you win if you stay with your original selection? How often will you win if you switch?

In [2]:
# Monte Carlo simulation of the Monty Hall problem.
#
# Each simulated game shuffles the prizes, picks a door at random, lets Monty
# open a goat door among the two doors we did not pick, and then randomly
# either stays with the first pick or switches to the last closed door.
# Wins and losses are tallied per strategy and reported as percentages.
N_GAMES = 10000   # number of simulated games
RANDOM_SEED = 0   # fixed seed so a fresh "Restart & Run All" reproduces the same numbers
random.seed(RANDOM_SEED)

doors = ["goat", "goat", "car"]
wins_when_switching = 0
loses_when_switching = 0
wins_when_staying = 0
loses_when_staying = 0

for game_ct in range(N_GAMES):

    # Randomize the location of the car
    random.shuffle(doors)
    car_location = doors.index("car")  # This variable holds the true location of the car

    # Step 1: Pick our door
    remaining_doors = [0, 1, 2]
    door_selection = random.choice(remaining_doors)  # This variable holds our initial choice
    remaining_doors.remove(door_selection)           # Now the remaining_doors list only has two integers

    # Step 2: Monty opens one of the goat doors.
    # At least one of the two unpicked doors always hides a goat. If both do,
    # Monty picks between them at random; if only one does, it is the only
    # candidate, so a single random.choice covers both cases.
    goat_doors = [door for door in remaining_doors if doors[door] == "goat"]
    opened_door = random.choice(goat_doors)
    remaining_doors.remove(opened_door)  # Now the remaining_doors list only has one integer

    # Step 3: Decide whether to switch or stay. Here we'll make this a random choice
    should_i_stay = random.choice([0, 1])
    if should_i_stay == 1:
        if door_selection == car_location:
            wins_when_staying += 1
        else:
            loses_when_staying += 1
    else:
        door_selection = remaining_doors[0]  # Switch our selection for the one remaining door
        if door_selection == car_location:
            wins_when_switching += 1
        else:
            loses_when_switching += 1

# The 100.0 factor keeps the division in floating point without explicit casts.
win_pct_when_staying = 100.0 * wins_when_staying / (wins_when_staying + loses_when_staying)
win_pct_when_switching = 100.0 * wins_when_switching / (wins_when_switching + loses_when_switching)

print("When staying we win %.2f%% of the time" % win_pct_when_staying)
print("When switching we win %.2f%% of the time" % win_pct_when_switching)

When staying we win 31.90% of the time
When switching we win 66.97% of the time


<b>Naive Bayesian Classifier</b><br>
Here, we'll train a Naive Bayesian classifier to make a prediction as to whether or not we'll play golf.

In [3]:
# Load the golf data set from the CSV file next to this notebook and display it.
golf_df = pd.read_csv(filepath_or_buffer="golf_data.csv")
golf_df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,Sunny,85,85,False,No
1,Sunny,80,90,True,No
2,Overcast,83,78,False,Yes
3,Rain,70,96,False,Yes
4,Rain,68,80,False,Yes
5,Rain,65,70,True,No
6,Overcast,64,65,True,Yes
7,Sunny,72,95,False,No
8,Sunny,69,70,False,Yes
9,Rain,75,80,False,Yes


The first thing we have to do is encode the categorical and binary variables.

In [4]:
# Replace each non-numeric column with integer codes. A fresh LabelEncoder is
# fitted per column, so the codes of one column are independent of the others.
for column_name in ("Outlook", "Wind", "Play"):
    golf_df[column_name] = LabelEncoder().fit_transform(golf_df[column_name])
golf_df

Unnamed: 0,Outlook,Temperature,Humidity,Wind,Play
0,2,85,85,0,0
1,2,80,90,1,0
2,0,83,78,0,1
3,1,70,96,0,1
4,1,68,80,0,1
5,1,65,70,1,0
6,0,64,65,1,1
7,2,72,95,0,0
8,2,69,70,0,1
9,1,75,80,0,1


Next we split the data

In [5]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# same rows land in each split on every run.
feature_columns = ["Outlook", "Temperature", "Humidity", "Wind"]
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    golf_df[feature_columns],
    golf_df["Play"],
    test_size=0.3,
    random_state=0,
)

Then we generate the model

In [6]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split
# (fit returns the fitted estimator, so construction and fitting can be chained).
model = GaussianNB().fit(X_Train, Y_Train)

# Predict the held-out rows.
predictions = model.predict(X_Test)

# Show predicted and actual labels side by side; the frame takes its index from Y_Test.
results_df = pd.DataFrame({"Predictions": predictions, "True Values": Y_Test})
results_df

Unnamed: 0,Predictions,True Values
2,1,1
8,0,1
4,1,1
