### Naive Bayes (Classifier) - Multinomial Naive Bayes



In [1]:
# Import the libraries used for data management.

import numpy as np
import pandas as pd

# Load the weather dataset from a CSV file (relative path, so it is resolved
# against the notebook's working directory) into a DataFrame. Later cells read
# from `data` and add encoded columns to it.
data = pd.read_csv('Weather.csv')

In [2]:
# Preview the first rows to check that the file loaded as expected.
data.head()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,No,no
1,sunny,hot,high,Yes,no
2,overcast,hot,high,No,yes
3,rainy,mild,high,No,yes
4,rainy,cool,normal,No,yes


In [3]:
# Summarise every column; for these string columns the output below reports
# count, number of unique values, most frequent value (top) and its frequency.
data.describe()

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
count,14,14,14,14,14
unique,3,3,2,2,2
top,rainy,mild,normal,No,yes
freq,5,6,7,8,9


### Data Preparation

In [4]:
# Import scikit-learn's preprocessing module, which provides LabelEncoder.
from sklearn import preprocessing
# Create a LabelEncoder, which maps string categories to integer codes.
le = preprocessing.LabelEncoder()

In [5]:
# Convert the string categories into integer codes.
# A separate LabelEncoder is fitted for every column and kept in `encoders`,
# so each column's string -> code mapping stays available afterwards (for
# example to encode a new instance before prediction, or to decode codes with
# inverse_transform). Refitting one shared encoder would retain only the
# mapping of the last column it was fitted on.
encoded_names = {
    'Outlook': 'Outlook_encoded',
    'Temperature': 'Tem_encoded',
    'Humidity': 'Humidity_encoded',
    'Windy': 'Windy_enoded',  # spelling kept as-is: later cells and outputs use this name
    'Play': 'Label',
}
encoders = {}
for source_col, encoded_col in encoded_names.items():
    encoders[source_col] = preprocessing.LabelEncoder()
    data[encoded_col] = encoders[source_col].fit_transform(data[source_col])

# `le` stays bound to the encoder fitted on the target column 'Play'.
le = encoders['Play']

In [6]:
# Display the frame to check the encoded columns next to the original ones.
data

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play,Outlook_encoded,Tem_encoded,Humidity_encoded,Windy_enoded,Label
0,sunny,hot,high,No,no,2,1,0,0,0
1,sunny,hot,high,Yes,no,2,1,0,1,0
2,overcast,hot,high,No,yes,0,1,0,0,1
3,rainy,mild,high,No,yes,1,2,0,0,1
4,rainy,cool,normal,No,yes,1,0,1,0,1
5,rainy,cool,normal,Yes,no,1,0,1,1,0
6,overcast,cool,normal,Yes,yes,0,0,1,1,1
7,sunny,mild,high,No,no,2,2,0,0,0
8,sunny,cool,normal,No,yes,2,0,1,0,1
9,rainy,mild,normal,No,yes,1,2,1,0,1


In [7]:
# Define the independent attributes by name rather than by column position, so
# the selection cannot drift if the frame gains, loses or reorders columns.
features = ['Outlook_encoded', 'Tem_encoded', 'Humidity_encoded', 'Windy_enoded']
# Assign the independent variables (X) and the target variable 'Label' (y).
X = data[features]
y = data['Label']

In [8]:
# Confirm which columns were selected as model inputs.
features

['Outlook_encoded', 'Tem_encoded', 'Humidity_encoded', 'Windy_enoded']

In [9]:
# Inspect the feature matrix that is passed to the model.
X

Unnamed: 0,Outlook_encoded,Tem_encoded,Humidity_encoded,Windy_enoded
0,2,1,0,0
1,2,1,0,1
2,0,1,0,0
3,1,2,0,0
4,1,0,1,0
5,1,0,1,1
6,0,0,1,1
7,2,2,0,0
8,2,0,1,0
9,1,2,1,0


### Model Building

In [10]:
# Import the Multinomial Naive Bayes model from sklearn
from sklearn.naive_bayes import MultinomialNB

# Create a Multinomial Naive Bayes classifier; no arguments are passed, so the
# library defaults are used.
# NOTE(review): MultinomialNB is documented for count-like features, while the
# inputs here are label-encoded categories — sklearn's CategoricalNB may be the
# better fit for this data; confirm before relying on the probabilities.
mnb = MultinomialNB()

#### You may want to look at the documentation for MultinomialNB() to see which parameters you can tune for this model:
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

### Model Evaluation

In [11]:
# import cross validation
from sklearn.model_selection import cross_val_score,cross_val_predict

In [12]:
# Score the classifier with 3-fold cross-validation; `score` holds one value per fold.
score = cross_val_score(estimator=mnb, X=X, y=y, cv=3)

In [13]:
# Average the per-fold scores into a single cross-validated estimate.
score.mean()

0.7999999999999999

#### Again, accuracy is not the only metric we should rely on, especially for datasets with an imbalanced target variable or uneven costs for false positives (FP) and false negatives (FN). You should also look at the classification report.

In [14]:
# import libraries for evaluation measures
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# write code to create the classification report
# (e.g. classification_report(y, pred_y), once pred_y has been computed below)



In [15]:
# Collect cross-validated class predictions (3 folds): one predicted label per sample.
pred_y = cross_val_predict(estimator=mnb, X=X, y=y, cv=3)

In [16]:
# Show the cross-validated predicted label (0 = 'no', 1 = 'yes') for each sample.
pred_y

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

In [17]:
# Same 3-fold procedure, but ask for class probabilities instead of labels.
pred_proba = cross_val_predict(
    estimator=mnb,
    X=X,
    y=y,
    cv=3,
    method='predict_proba',
)

In [18]:
# One row per sample with two columns of class probabilities; comparing with
# pred_y above, the first column belongs to class 0 and the second to class 1.
pred_proba

array([[0.47058824, 0.52941176],
       [0.51612903, 0.48387097],
       [0.33333333, 0.66666667],
       [0.4       , 0.6       ],
       [0.26229508, 0.73770492],
       [0.29158402, 0.70841598],
       [0.19678715, 0.80321285],
       [0.46053515, 0.53946485],
       [0.33062011, 0.66937989],
       [0.15099679, 0.84900321],
       [0.40584561, 0.59415439],
       [0.44956178, 0.55043822],
       [0.19846069, 0.80153931],
       [0.53477075, 0.46522925]])

#### Once you have the predicted probabilities, you can use them to build a ROC curve for further evaluation

In [19]:
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

In [None]:
# write code to construct the ROC curve
# (e.g. fpr, tpr, thresholds = roc_curve(y, pred_proba[:, 1]), then auc(fpr, tpr))



### Fit the model

In [22]:
# Fit the model on the full dataset (X, y); no separate hold-out split is made
# in this cell.
mnb.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Number of samples encountered for each class during fitting
# (index 0 -> Label 0 = 'no', index 1 -> Label 1 = 'yes')
mnb.class_count_

array([5., 9.])

### Make Prediction

In [25]:
# Predict class for new instance with "Outlook = overcast, Temperature = cool, humidity = high, Windy = No"
# predict() needs a 2D input with one row per sample and the same columns the
# model was fitted on; passing a plain dict raises
# "ValueError: Expected 2D array, got scalar array instead".
# Encoded values (see the encoded table above): overcast -> 0, cool -> 0, high -> 0, No -> 0.
new_instance = pd.DataFrame([[0, 0, 0, 0]], columns=X.columns)
mnb.predict(new_instance)

array([1])

In [27]:
# Predict class probability for new instance with "Outlook = overcast, Temperature = cool, humidity = high, Windy = No"
# Encoded values (see the encoded table above): overcast -> 0, cool -> 0, high -> 0, No -> 0.
# The result has one row per instance and one column per class, in the order of
# mnb.classes_: [P(Label = 0, 'no'), P(Label = 1, 'yes')].
mnb.predict_proba(pd.DataFrame([[0, 0, 0, 0]], columns=X.columns))

array([[0.35714286, 0.64285714]])