# Pattern Recognition and Machine Learning
## Week 5 Tutorial

## Tutorial 1 - Weather Forecasting

## 1 Dummy Dataset

In [None]:
# Toy "play outside?" dataset. Each row is one observation, in column order
# (weather, temp, humid, windy, play).
_observations = [
    ('Sunny',    'Hot',  'high',   False, 'No'),
    ('Sunny',    'Hot',  'high',   True,  'No'),
    ('Overcast', 'Hot',  'high',   False, 'Yes'),
    ('Rainy',    'Mild', 'high',   False, 'Yes'),
    ('Rainy',    'Cool', 'normal', False, 'Yes'),
    ('Rainy',    'Cool', 'normal', True,  'No'),
    ('Overcast', 'Cool', 'normal', True,  'Yes'),
    ('Sunny',    'Mild', 'high',   False, 'No'),
    ('Sunny',    'Cool', 'normal', False, 'Yes'),
    ('Rainy',    'Mild', 'normal', False, 'Yes'),
    ('Sunny',    'Mild', 'normal', True,  'Yes'),
    ('Overcast', 'Mild', 'high',   True,  'Yes'),
    ('Overcast', 'Hot',  'normal', False, 'Yes'),
    ('Rainy',    'Mild', 'high',   True,  'No'),
]

# Transpose the rows into five parallel lists, one per column.
weather, temp, humid, windy, play = (
    list(column) for column in zip(*_observations)
)

## 2 Weather Forecasting using Naive Bayes
2.1 Convert categorical data to numeric data

In [None]:
from sklearn import preprocessing

# One encoder instance is re-fitted for every column in turn, so after this
# cell `le` only remembers the classes of the last column (play).
le = preprocessing.LabelEncoder()

# Map each column's distinct values to integer codes, column by column.
(
    weather_encoded,
    temp_encoded,
    humid_encoded,
    windy_encoded,
    play_encoded,
) = (le.fit_transform(column) for column in (weather, temp, humid, windy, play))

2.2 Extracting features and labels

In [None]:
# Targets: the encoded play / no-play outcome.
y = play_encoded

# Features: one (weather, temp, humid, windy) tuple of codes per observation.
X = [*zip(weather_encoded, temp_encoded, humid_encoded, windy_encoded)]

2.3 Splitting data into train/test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the observations for testing; the fixed random_state keeps
# the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=111
)

2.4 Generate model - Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Create Gaussian classifier
md = GaussianNB()

# Fit the classifier on the training split. As the last bare expression of
# the cell, the call's return value is what the notebook displays.
md.fit(X_train, y_train)

In [None]:
# Predict the encoded play label for every held-out observation and show them.
result = md.predict(X_test)
print(f"Model predicts: {result}")

2.5 Evaluate model

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

In [None]:
# Compare the predictions against the true test labels.
print("Accuracy:", metrics.accuracy_score(y_test, result))

# Precision, recall and F1 all use the same support-weighted class average,
# so they are reported from one loop.
for title, scorer in (
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1-score:", metrics.f1_score),
):
    print(title, scorer(y_test, result, average='weighted'))

In [None]:
# Compute the confusion matrix of true test labels vs. predicted labels
metrics.confusion_matrix(y_test, result)

2.6 Generate model - Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Create Multinomial Naive Bayes classifier.
# Without this rebinding, `md` would still be the GaussianNB from section 2.4
# and this section would just repeat the Gaussian results. The label-encoded
# features are non-negative integers, which MultinomialNB requires.
md = MultinomialNB()

# Fit on the training split; the fitted estimator is displayed by the notebook.
md.fit(X_train, y_train)

In [None]:
# Predict the encoded play label for every held-out observation using the
# model currently bound to `md`, then show the predictions.
result = md.predict(X_test)
print(f"Model predicts: {result}")

In [None]:
# Compare the predictions against the true test labels.
print("Accuracy:", metrics.accuracy_score(y_test, result))

# Precision, recall and F1 all use the same support-weighted class average,
# so they are reported from one loop.
for title, scorer in (
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
    ("F1-score:", metrics.f1_score),
):
    print(title, scorer(y_test, result, average='weighted'))

In [None]:
# Compute the confusion matrix of true test labels vs. predicted labels
metrics.confusion_matrix(y_test, result)

## Tutorial 2 - Spam Detection

## 1 Load dataset

In [None]:
import pandas as pd

# Load the comma-separated spam corpus; the path is relative to the
# notebook's working directory.
spam_data = pd.read_csv(
    "prml/data/spam.csv",
    sep=',',
)

## 2 Data exploration

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
spam_data.info()

In [None]:
# Show the first 15 rows as a quick look at the raw messages.
spam_data.head(15)

In [None]:
# Count how many messages carry each Label value (class balance).
spam_data['Label'].value_counts()

In [None]:
# Summary statistics of the remaining columns, computed per Label group.
spam_data.groupby('Label').describe()

## 2 Feature extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# A sample: the EmailText entry at index 81, used below to demonstrate the
# vectorizer on a single message.
sample = spam_data['EmailText'][81]

In [None]:
cv = CountVectorizer()

# fit_transform expects an iterable of documents. A bare string is rejected
# with ValueError, so the single message is wrapped in a one-element list.
cv.fit_transform([sample])

# Token -> column-index mapping learned from the sample message.
cv.vocabulary_

## 3 Build multinomial Naive Bayes model

In [None]:
# Targets: the label attached to every message.
y = spam_data['Label']

# Features: re-fit the vectorizer on the whole corpus and turn every message
# into a row of token counts.
X = cv.fit_transform(spam_data['EmailText'])

In [None]:
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
# Shows (number of messages, vocabulary size).
X.shape

In [None]:
# `shape` is a tuple attribute, not a method; calling it raises TypeError.
# Shows (number of labels,), which should match the row count of X.
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing; the fixed random_state keeps the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=111
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes over the token-count features.
mn = MultinomialNB()
# Fit on the training split. As the last bare expression of the cell, the
# call's return value is what the notebook displays.
mn.fit(X_train, y_train)

## 4 Make prediction

In [None]:
# Predict a label for every message in the held-out test split.
result = mn.predict(X_test)

## 5 Evaluation

In [None]:
# Report accuracy on the held-out test split. Scoring on the data the model
# was fitted on would give an optimistic figure that says nothing about
# unseen messages.
score = mn.score(X_test, y_test)
print(f"Score of this model: {score}")

In [None]:
# Imported here so Tutorial 2 does not depend on Tutorial 1's cells having
# been run, matching how this tutorial re-imports its other sklearn names.
from sklearn import metrics

# Compute the confusion matrix of true test labels vs. predicted labels
metrics.confusion_matrix(y_test, result)

## 6 Display misclassified messages

In [None]:
# Positions within the test split where the prediction disagrees with the
# true label.
misclassifiedIndexes = [
    position
    for position, (label, predict) in enumerate(zip(y_test, result))
    if label != predict
]

# Pair each wrong prediction with the text of the message it was made for.
# A position in the shuffled test split is NOT a row label of spam_data, so
# it is first mapped back to the original row label through y_test.index
# (the split keeps the pandas index of the rows it selects).
misclassifiedEmailText = [
    [result[position], spam_data.loc[y_test.index[position], 'EmailText']]
    for position in misclassifiedIndexes
]

In [None]:
# Tabulate the collected [predicted label, message text] pairs and print them.
print(pd.DataFrame(misclassifiedEmailText,
      columns=['Wrong Label', 'Email Text']))