# Probability Exercise (July 20th, 2024)

In [None]:
import pandas as pd

## 1. Binary classification - Play tennis

In [None]:
# Load the play-tennis table; the first CSV column is used as the row index.
df = pd.read_csv('play_tennis.csv', index_col=0)
df

Unnamed: 0_level_0,Outlook,Temperature,Humidity,Wind,PlayTennis
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes
D6,Rain,Cool,Normal,Strong,No
D7,Overcast,Cool,Normal,Strong,Yes
D8,Overcast,Mild,High,Weak,No
D9,Sunny,Cool,Normal,Weak,Yes
D10,Rain,Mild,Normal,Weak,Yes


In [None]:
# Prior P(PlayTennis = 'Yes'): fraction of rows labelled 'Yes'.
len(df[df['PlayTennis'] == 'Yes']) / len(df)

0.6

In [None]:
# Prior P(PlayTennis = 'No'): fraction of rows labelled 'No'.
len(df[df['PlayTennis'] == 'No']) / len(df)

0.4

In [None]:
def calculate_conditional_probbility(df, C: str, X: list) -> float:
    """Naive Bayes score of class ``C`` given the evidence ``X``.

    Computes ``P(C) * prod_i P(x_i | C)`` from row counts in ``df``. This is
    the *unnormalised* posterior: it is proportional to ``P(C | X)`` and is
    meant to be compared across classes, not read as a probability.

    Args:
        df: Training table.
        C: ``DataFrame.query`` expression selecting the class,
            e.g. ``"PlayTennis=='Yes'"``.
        X: List of ``DataFrame.query`` expressions, one per observed feature.

    Returns:
        The score as a float. Returns ``0.0`` when ``df`` is empty or when no
        row matches ``C`` (the class was never observed), instead of raising
        ``ZeroDivisionError``.
    """
    n_total = len(df)
    # The class subset does not depend on the feature, so select it once.
    class_rows = df.query(C)
    n_class = len(class_rows)
    if n_total == 0 or n_class == 0:
        return 0.0

    P_X_given_C = 1
    for x in X:
        # Rows matching both x and C, counted within the class subset.
        P_X_given_C *= len(class_rows.query(x)) / n_class
    return P_X_given_C * n_class / n_total

In [None]:
# Score the 'Yes' class for the query day: Sunny, Cool, High humidity, Strong wind.
X = ["Outlook=='Sunny'", "Temperature=='Cool'",
     "Humidity=='High'", "Wind=='Strong'"]
C = "PlayTennis=='Yes'"
calculate_conditional_probbility(df, C, X)

0.0027777777777777775

In [None]:
# Score the 'No' class for the same query day; the larger of the two scores wins.
X = ["Outlook=='Sunny'", "Temperature=='Cool'",
     "Humidity=='High'", "Wind=='Strong'"]
C = "PlayTennis=='No'"
calculate_conditional_probbility(df, C, X)

0.01875

In [None]:
# For every feature column (all but the last), tabulate feature value vs.
# PlayTennis; normalize='columns' makes each class column sum to 1, i.e. the
# entries are P(feature value | class).
crosstabs = []
for col in df.columns[:-1]:
    crosstabs.append(pd.crosstab(
        df[col], df['PlayTennis'], normalize='columns'))

# Cross-check of the 'Yes' score computed above:
# 0.0027 ~ 0.1667 * 0.5 * 0.3333 * 0.1667 * 0.6
# Stack the per-feature tables and show only the 'Yes' column.
pd.concat(crosstabs, axis=0,
          keys=df.columns[:-1])['Yes'].to_frame()

Unnamed: 0,Unnamed: 1,Yes
Outlook,Overcast,0.333333
Outlook,Rain,0.5
Outlook,Sunny,0.166667
Temperature,Cool,0.5
Temperature,Hot,0.166667
Temperature,Mild,0.333333
Humidity,High,0.333333
Humidity,Normal,0.666667
Wind,Strong,0.166667
Wind,Weak,0.833333


## 2. Multi-class classification - Traffic data

In [None]:
# Query evidence for the traffic example, written as DataFrame.query expressions.
X = ['Day=="Weekday"', 'Season=="Winter"', 'Fog=="High"', 'Rain=="Heavy"']
# na_filter=False turns off missing-value detection, so empty cells stay as
# empty strings instead of becoming NaN.
df = pd.read_csv('traffic_data.csv', na_filter=False)
df

Unnamed: 0,Day,Season,Fog,Rain,Class
0,Weekday,Spring,,,On Time
1,Weekday,Winter,,Slight,On Time
2,Weekday,Winter,,,On Time
3,Holiday,Winter,High,Slight,Late
4,Saturday,Summer,Normal,,On Time
5,Weekday,Autumn,Normal,,Very Late
6,Holiday,Summer,High,Slight,On Time
7,Sunday,Summer,Normal,,On Time
8,Weekday,Winter,High,Heavy,Very Late
9,Weekday,Summer,,Slight,On Time


In [None]:
# Number of rows per class.
df['Class'].value_counts()

Class
On Time      14
Very Late     3
Late          2
Cancelled     1
Name: count, dtype: int64

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
import numpy as np

In [None]:
# Score the "On Time" class for the evidence X defined when the data was loaded.
C = 'Class=="On Time"'
calculate_conditional_probbility(df, C, X)

0.0026239067055393583

In [None]:
# Score every class present in the data for the same evidence X and list the
# scores in ascending order (the last row is the predicted class).
res = []
for c in df['Class'].unique():
    res.append(calculate_conditional_probbility(df, f'Class=="{c}"', X))
pd.DataFrame(res, index=df['Class'].unique(), columns=['P']).sort_values('P')

Unnamed: 0,P
Late,0.0
Cancelled,0.0
On Time,0.002624
Very Late,0.022222


In [None]:
# Features are all columns except the last; the target is the 'Class' column.
X_train = df[df.columns[:-1]].to_numpy()
Y_train = df['Class'].to_numpy()
# x_encoder = LabelEncoder()
# y_encoder = LabelEncoder()
# Encode the string categories as numeric codes so they can be fed to the model.
x_encoder = OrdinalEncoder()
y_encoder = OrdinalEncoder()
# NOTE(review): the reshape looks like a no-op, since the encoder output should
# already be (n_rows, n_features) - confirm.
X_train = x_encoder.fit_transform(X_train).reshape(-1, df.columns.size-1)
# The target is reshaped to a single column before encoding.
Y_train = y_encoder.fit_transform(Y_train.reshape(-1, 1))

In [None]:
# NOTE: a stray `np.unravel_index()` scratch call was removed from this cell.
# It was invoked without its required arguments and raised TypeError, which
# stopped the notebook on Restart & Run All.

In [None]:
# Fit Gaussian Naive Bayes on the encoded traffic data and predict the class
# probabilities for one query row.
# NOTE(review): GaussianNB treats the ordinal codes as continuous values;
# scikit-learn's CategoricalNB is the variant intended for categorical
# features - consider switching.
model = GaussianNB().fit(X_train, Y_train.ravel())
X_test = np.array([['Weekday', 'Winter', 'High', 'Heavy']])
# Reuse the encoder fitted on the training data so the codes line up.
X_test = x_encoder.transform(X_test)
y_test = model.predict_proba(X_test)
categories = y_encoder.categories_[0].tolist()  # type: ignore
# Label the result by what it is (posterior over all classes for X_test),
# not by the unrelated global C left over from an earlier cell.
pd.DataFrame(y_test, columns=categories, index=['P(Class | X_test)']).T

Unnamed: 0,"Class==""On Time"""
Cancelled,0.0
Late,0.010129
On Time,1.4e-05
Very Late,0.989857


## 3. Iris classification

In [None]:
# The CSV is read without a header and transposed, so after the transpose the
# first row holds the column names.
df = pd.read_csv('iris.csv', header=None).T
# Promote that first row to the header, then drop it from the data.
df.columns = df.iloc[0]
df = df[1:]
df.reset_index(drop=True, inplace=True)
# Cast explicitly to numeric dtypes.
df = df.astype({'Length': np.float64, 'Class': np.int8})
df

Unnamed: 0,Length,Class
0,1.4,0
1,1.0,0
2,1.3,0
3,1.9,0
4,2.0,0
5,1.8,0
6,3.0,1
7,3.8,1
8,4.1,1
9,3.9,1


In [None]:
# Mean and population variance (divide by n) of Length for class 0, computed
# by hand; the commented lines show pandas alternatives.
df_0 = df.query('Class==0')
# mean = df_0['Length'].mean()
mean_0 = df_0['Length'].sum() / len(df_0)
# var = df_0['Length'].var(ddof=0)
var_0 = ((df_0['Length'] - mean_0)**2).sum() / len(df_0)
print(mean_0, var_0)

1.5666666666666667 0.1288888888888889


In [None]:
# Mean and population variance (divide by n) of Length for class 1, computed
# by hand; the commented lines show pandas alternatives.
df_1 = df.query('Class==1')
# mean = df_1['Length'].mean()
mean_1 = df_1['Length'].sum() / len(df_1)
# var = df_1['Length'].var(ddof=0)
var_1 = ((df_1['Length'] - mean_1)**2).sum() / len(df_1)
print(mean_1, var_1)

3.733333333333333 0.1722222222222222


In [None]:
# Fit Gaussian Naive Bayes on all columns but the last as features and
# 'Class' as the target, then predict class probabilities for Length = 3.4.
X_train = df[df.columns[:-1]]
Y_train = df['Class']
X_test = np.array([[3.4]])
model = GaussianNB().fit(X_train, Y_train)
model.predict_proba(X_test)

array([[3.47019998e-06, 9.99996530e-01]])

In [None]:
from scipy.stats import norm

In [None]:
# Unnormalised score for class 0: Gaussian density of Length = 3.4 under the
# class-0 mean / standard deviation, times a class prior of 0.5.
# NOTE(review): the 0.5 prior is hard-coded, whereas the hand-written formula
# below uses len(df_0)/len(df) - confirm the two agree for this data.
# 1/np.sqrt(2*np.pi*var_0) * np.exp(-0.5*(3.4-mean_0)**2/var_0) * len(df_0)/len(df)
norm.pdf(3.4, mean_0, var_0**0.5) * 0.5

1.2080820590230566e-06

In [None]:
# Unnormalised score for class 1: Gaussian density of Length = 3.4 under the
# class-1 mean / standard deviation, times a hard-coded class prior of 0.5.
norm.pdf(3.4, mean_1, var_1**0.5) * 0.5

0.34812922367906424

## 4. Play tennis classifier implementation

### 4.1 Implement function *create_train_data*

In [None]:
import numpy as np


def create_train_data():
    """Build the play-tennis training set as a 2-D array of strings.

    The table is embedded as whitespace-separated text. The header row and
    the leading ``Day`` column are dropped, leaving one row per day with the
    columns Outlook, Temperature, Humidity, Wind, PlayTennis.
    """
    raw_table = '''Day Outlook Temperature Humidity Wind PlayTennis
              D1 Sunny Hot High Weak No
              D2 Sunny Hot High Strong No
              D3 Overcast Hot High Weak Yes
              D4 Rain Mild High Weak Yes
              D5 Rain Cool Normal Weak Yes
              D6 Rain Cool Normal Strong No
              D7 Overcast Cool Normal Strong Yes
              D8 Overcast Mild High Weak No
              D9 Sunny Cool Normal Weak Yes
              D10 Rain Mild Normal Weak Yes'''
    # str.split() with no argument also discards the indentation on each line.
    cells = [line.split() for line in raw_table.splitlines()]
    table = np.array(cells)
    # Skip the header row (index 0) and the Day column (index 0).
    return table[1:, 1:]


# Build the training array once and display it.
train_data = create_train_data()
print(train_data)

[['Sunny' 'Hot' 'High' 'Weak' 'No']
 ['Sunny' 'Hot' 'High' 'Strong' 'No']
 ['Overcast' 'Hot' 'High' 'Weak' 'Yes']
 ['Rain' 'Mild' 'High' 'Weak' 'Yes']
 ['Rain' 'Cool' 'Normal' 'Weak' 'Yes']
 ['Rain' 'Cool' 'Normal' 'Strong' 'No']
 ['Overcast' 'Cool' 'Normal' 'Strong' 'Yes']
 ['Overcast' 'Mild' 'High' 'Weak' 'No']
 ['Sunny' 'Cool' 'Normal' 'Weak' 'Yes']
 ['Rain' 'Mild' 'Normal' 'Weak' 'Yes']]


### 4.2 Implement function *compute_prior_probability*

In [None]:
def compute_prior_probablity(train_data):
    """Return the class priors ``[P(No), P(Yes)]`` as a float array.

    The class label is read from the last column of ``train_data``; each
    prior is the share of rows carrying that label.
    """
    targets = train_data[:, -1]
    n_rows = len(train_data)
    # Labels are stored capitalised ('No' / 'Yes') in the training array.
    shares = [
        np.count_nonzero(targets == name.title()) / n_rows
        for name in ('no', 'yes')
    ]
    return np.array(shares)


# Index 0 is the 'No' prior, index 1 the 'Yes' prior.
prior_probablity = compute_prior_probablity(train_data)
print('P(play_tennis="No")', prior_probablity[0])
print('P(play_tennis="Yes")', prior_probablity[1])

P(play_tennis="No") 0.4
P(play_tennis="Yes") 0.6


### 4.3 Implement function *compute_conditional_probability*

In [None]:
def compute_conditional_probability(train_data):
    """Estimate P(feature value | class) for every feature column.

    Returns:
        A pair ``(tables, value_names)``. ``value_names[i]`` holds the sorted
        unique values of feature ``i``. ``tables[i]`` is a
        ``(2, len(value_names[i]))`` array whose row 0 is the 'No' class and
        row 1 the 'Yes' class, with columns ordered like ``value_names[i]``.
    """
    class_labels = [name.title() for name in ['no', 'yes']]
    targets = train_data[:, -1]
    tables = []
    value_names = []

    # Every column except the last one (the class label) is a feature.
    for col in range(train_data.shape[1] - 1):
        column = train_data[:, col]
        values = np.unique(column)
        value_names.append(values)

        table = np.zeros((len(class_labels), len(values)))
        for row, label in enumerate(class_labels):
            in_class = targets == label
            for k, value in enumerate(values):
                # Rows with this value and this class, over rows in the class.
                table[row, k] = np.count_nonzero(
                    (column == value) & in_class) / np.count_nonzero(in_class)
        tables.append(table)

    return tables, value_names

In [None]:
# Compute the conditional tables and show the unique values found per feature.
conditional_probability, list_x_name = compute_conditional_probability(
    train_data)
list_x_name

[array(['Overcast', 'Rain', 'Sunny'], dtype='<U11'),
 array(['Cool', 'Hot', 'Mild'], dtype='<U11'),
 array(['High', 'Normal'], dtype='<U11'),
 array(['Strong', 'Weak'], dtype='<U11')]

In [None]:
# Shape of the first feature's table: (number of classes, number of values).
conditional_probability[0].shape

(2, 3)

### 4.4 Implement function *get_index_from_value*

In [None]:
def get_index_from_value(feature_name, list_features):
    """Return the position of the first entry of ``list_features`` equal to ``feature_name``.

    Raises ``IndexError`` when there is no match.
    """
    matches = list_features == feature_name
    positions = np.nonzero(matches)[0]
    return positions[0]

In [None]:
# Look up the position of each Outlook value in the list of unique values.
train_data = create_train_data()
_, list_x_name = compute_conditional_probability(train_data)
outlook = list_x_name[0]

i1 = get_index_from_value("Overcast", outlook)
i2 = get_index_from_value("Rain", outlook)
i3 = get_index_from_value("Sunny", outlook)

print(i1, i2, i3)

0 1 2


In [None]:
train_data = create_train_data()
conditional_probability, list_x_name = compute_conditional_probability(
    train_data)
# Compute P("Outlook"="Sunny" | "Play_Tennis"="Yes"); row 1 of each table is the 'Yes' class.
x1 = get_index_from_value("Sunny", list_x_name[0])
print("P('Outlook'='Sunny’| 'Play Tennis'='Yes’) = ",
      np.round(conditional_probability[0][1, x1], 2))

P('Outlook'='Sunny’| 'Play Tennis'='Yes’) =  0.17


In [None]:
train_data = create_train_data()
conditional_probability, list_x_name = compute_conditional_probability(
    train_data)
# Compute P("Outlook"="Sunny" | "Play_Tennis"="No"); row 0 of each table is the 'No' class.
x1 = get_index_from_value("Sunny", list_x_name[0])
print("P('Outlook'='Sunny’| 'Play Tennis'='No’) = ",
      np.round(conditional_probability[0][0, x1], 2))

P('Outlook'='Sunny’| 'Play Tennis'='No’) =  0.5


### 4.5 Implement function *train_naive_bayes*

In [None]:
# ##########################
# Train Naive Bayes Model
# ##########################
def train_naive_bayes(train_data):
    """Train the play-tennis Naive Bayes model.

    Args:
        train_data: 2-D array of strings whose last column is the class label
            and whose other columns are the features.

    Returns:
        A tuple ``(prior_probability, conditional_probability, list_x_name)``:
        the class priors from ``compute_prior_probablity`` followed by the
        per-feature tables and value names from
        ``compute_conditional_probability``.
    """
    # Step 1: Calculate Prior Probability
    prior_probability = compute_prior_probablity(train_data)

    # Step 2: Calculate Conditional Probability
    conditional_probability, list_x_name = compute_conditional_probability(
        train_data)

    return prior_probability, conditional_probability, list_x_name

### 4.6 Implement function *prediction_play_tennis*

In [None]:
# ###################
# Prediction
# ###################
def prediction_play_tennis(X, list_x_name, prior_probability, conditional_probability):
    """Predict the play-tennis class for one observation.

    Args:
        X: Feature values of the observation, one per feature, in the same
            order as ``list_x_name``.
        list_x_name: Per-feature arrays of known values; a value's position
            is its column in the matching conditional table.
        prior_probability: ``[P(No), P(Yes)]``.
        conditional_probability: Per-feature tables whose row 0 is the 'No'
            class and row 1 the 'Yes' class.

    Returns:
        ``0`` ('No') when the 'No' score is strictly larger, otherwise ``1``
        ('Yes').

    Raises:
        IndexError: if a value of ``X`` is not among the known values of its
            feature, or ``X`` has fewer entries than ``list_x_name``.
    """
    p0 = prior_probability[0]
    p1 = prior_probability[1]

    # Multiply in P(x_i | class) for each feature. Both classes must use the
    # same value index for a given feature; the earlier hand-unrolled version
    # mixed the indices up across features in the 'Yes' product.
    for i, feature_values in enumerate(list_x_name):
        value_index = get_index_from_value(X[i], feature_values)
        p0 = p0 * conditional_probability[i][0, value_index]
        p1 = p1 * conditional_probability[i][1, value_index]

    if p0 > p1:
        y_pred = 0
    else:
        y_pred = 1

    return y_pred

In [None]:
# Train on the embedded data and classify one day: Sunny, Cool, High, Strong.
X = ['Sunny', 'Cool', 'High', 'Strong']
data = create_train_data()
prior_probability, conditional_probability, list_x_name = train_naive_bayes(
    data)
pred = prediction_play_tennis(
    X, list_x_name, prior_probability, conditional_probability)
# pred is 1 for the 'Yes' class and 0 for the 'No' class.
if (pred):
    print("Ad should go!")
else:
    print("Ad should not go!")

Ad should not go!
