# **Pattern Recognition and Machine Learning**
>Week 5 Tutorial

## Tutorial 1 - Weather Forcasting

## 1 Dummy Dataset

In [1]:
weather = ['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast',
           'Sunny','Sunny','Rainy','Sunny','Overcast','Overcast','Rainy']
temp = ['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild',
        'Mild','Mild','Hot','Mild']
humid = ['high','high','high','high','normal','normal','normal','high',
         'normal','normal','normal','high','normal','high']
windy = [False,True,False,False,False,True,True,False,False,False,True,
         True,False,True]

play = ['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes',
        'Yes','No']

## 2 Weather Forcasting using Naive Bayes
2.1 Covert categorical data to numeric data

In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

# Convert string variables to numerical levels
weather_encoded = le.fit_transform(weather)
temp_encoded = le.fit_transform(temp)
humid_encoded = le.fit_transform(humid)
windy_encoded = le.fit_transform(windy)
play_encoded = le.fit_transform(play)

2.2 Extracting features and labels

In [44]:
X = list(zip(weather_encoded,temp_encoded,humid_encoded,windy_encoded))
y = play_encoded

print(X)

[(2, 1, 0, 0), (2, 1, 0, 1), (0, 1, 0, 0), (1, 2, 0, 0), (1, 0, 1, 0), (1, 0, 1, 1), (0, 0, 1, 1), (2, 2, 0, 0), (2, 0, 1, 0), (1, 2, 1, 0), (2, 2, 1, 1), (0, 2, 0, 1), (0, 1, 1, 0), (1, 2, 0, 1)]


2.3 Spliting data into train/test sets

In [548]:
from sklearn.model_selection import train_test_split

In [966]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=11)

2.4 Generate model - Gaussian Naive Bayes

In [977]:
from sklearn.naive_bayes import GaussianNB

# Create Gaussian classifier
md = GaussianNB()

In [988]:
# Train the model
md.fit(X_train, y_train)

GaussianNB()

In [999]:
result = md.predict(X_test)
print(f"Model predicts: {result}")

Model predicts: [1 0 1 1 1]


2.5 Evaluate model

In [75]:
from sklearn import metrics
from sklearn.metrics import classification_report

In [1010]:
# Calculate metrics from the predictions
print("Accuracy:",metrics.accuracy_score(y_test, result))
print("Precision:",metrics.precision_score(y_test, result, average = 'weighted'))
print("Recall:",metrics.recall_score(y_test, result, average = 'weighted'))
print("F1-score:",metrics.f1_score(y_test, result, average = 'weighted'))

Accuracy: 0.6
Precision: 0.6
Recall: 0.6
F1-score: 0.6


In [1021]:
# Compute the confusion metrix
metrics.confusion_matrix(y_test, result)

array([[0, 1],
       [1, 3]], dtype=int64)

2.6 Generate model - Multinominal Naive Bayes

In [1032]:
from sklearn.naive_bayes import MultinomialNB

# Create Gaussian classifier
mn = MultinomialNB()

In [1043]:
# Train the model
mn.fit(X_train, y_train)

MultinomialNB()

In [1054]:
# Predict the result
result = mn.predict(X_test)
print(f"Model predicts: {result}")

Model predicts: [1 0 1 1 1]


In [1065]:
# Calculate metrics from the predictions
print("Accuracy:",metrics.accuracy_score(y_test, result))
print("Precision:",metrics.precision_score(y_test, result, average = 'weighted'))
print("Recall:",metrics.recall_score(y_test, result, average = 'weighted'))
print("F1-score:",metrics.f1_score(y_test, result, average = 'weighted'))

Accuracy: 0.6
Precision: 0.6
Recall: 0.6
F1-score: 0.6


In [1076]:
# Compute the confusion metrix
metrics.confusion_matrix(y_test, result)

array([[0, 1],
       [1, 3]], dtype=int64)

## Tutorial 2 - Spam Detection

## 1 Load dataset

In [1780]:
import pandas as pd

spam_data = pd.read_csv("data/spam.csv")

## 2 Data exploration

In [1802]:
spam_data = spam_data.iloc[:, 0:2]

In [1824]:
spam_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5574 entries, 0 to 5573
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Label      5574 non-null   object
 1   EmailText  5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [1835]:
spam_data.head(5)

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [1846]:
spam_data['Label'].value_counts()

ham     4827
spam     747
Name: Label, dtype: int64

In [1857]:
spam_data.groupby('Label').describe()

Unnamed: 0_level_0,EmailText,EmailText,EmailText,EmailText
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4826,4517,"Sorry, I'll call later",30
spam,747,647,Please call our customer service representativ...,4


## 2 Feature extration

In [1442]:
from sklearn.feature_extraction.text import CountVectorizer

In [1868]:
# A sample
sample = [spam_data['EmailText'][81]]

type(sample)
print(sample)

['K. Did you call me just now ah?']


In [1879]:
cv = CountVectorizer()
cv.fit_transform(sample)
cv.vocabulary_

{'did': 2, 'you': 6, 'call': 1, 'me': 4, 'just': 3, 'now': 5, 'ah': 0}

## 3 Build multinominal naive bayes model

In [1934]:
spam_data['EmailText'].describe()

count                       5573
unique                      5164
top       Sorry, I'll call later
freq                          30
Name: EmailText, dtype: object

In [1967]:
spam_text = spam_data['EmailText'].values.astype('unicode')

In [2003]:
# Get the features
X = cv.fit_transform(spam_text)

# Get the targets
y = spam_data['Label']

# print(cv.vocabulary_)

In [2027]:
X.shape

(5574, 8625)

In [2039]:
y.shape

(5574,)

In [2051]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=111)

In [2063]:
from sklearn.naive_bayes import MultinomialNB

mn = MultinomialNB()

In [2075]:
mn.fit(X_train, y_train)

MultinomialNB()

## 4 Make prediction

In [2087]:
result = mn.predict(X_test)

## 5 Evaluation

In [2099]:
score = mn.score(X_train, y_train)
print(f"Score of this model: {score}")

Score of this model: 0.9939448306795245


In [2111]:
# Compute the confusion metrix
metrics.confusion_matrix(y_test, result)

array([[961,  10],
       [  7, 137]], dtype=int64)

## 6 Display misclassified messages

In [2123]:
index = 0
misclassifiedIndexes = []
for label, predict in zip(y_test, result):
    if label != predict: 
        misclassifiedIndexes.append(index)
    index +=1

misclassifiedEmailText = []
for badIndex in misclassifiedIndexes:
    misclassifiedEmailText.append([result[badIndex],
                                  spam_data['EmailText'][badIndex]])

In [2135]:
print(pd.DataFrame(misclassifiedEmailText,
      columns=['Wrong Label', 'Email Text']))

   Wrong Label                                         Email Text
0         spam                               U can call me now...
1          ham  Sorry to be a pain. Is it ok if we meet anothe...
2         spam                  Watching telugu movie..wat abt u?
3         spam  Update_Now - Xmas Offer! Latest Motorola, Sony...
4         spam  Loan for any purpose �500 - �75,000. Homeowner...
5         spam  Sorry man my account's dry or I would, if you ...
6          ham  Maybe westshore or hyde park village, the plac...
7         spam  It's fine, imma get a drink or somethin. Want ...
8         spam  Its a valentine game. . . Send dis msg to all ...
9          ham  <Forwarded from 448712404000>Please CALL 08712...
10         ham  You have an important customer service announc...
11        spam  Thanks for yesterday sir. You have been wonder...
12        spam                           Gibbs unsold.mike hussey
13         ham  I am not sure about night menu. . . I know onl...
14        