# Naive Bayes
Machine Learning project created in 2024

In [103]:
!pip install datasets
!pip install nltk



In [None]:
import numpy as np
import regex as re
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import nltk
from datasets import load_dataset
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tahafaisal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Loading the Datasets


In [None]:
# Read the golf dataset from 'golf_data.csv' (path is relative to the
# notebook's working directory) into a pandas DataFrame.
golf_data = pd.read_csv('golf_data.csv')
# Bernoulli Naive Bayes

In [None]:
# Load the 'emotion' configuration of the 'tweet_eval' dataset; downloaded
# files are cached under the local "datasets" directory. The result is
# indexed by split name further down ('train', 'test', 'validation').
tweet_data = load_dataset('tweet_eval', 'emotion', cache_dir="datasets")
# Multinomial Naive Bayes

## 2. Data Preprocessing

In [None]:
# Work on a copy so the raw golf_data frame is left untouched.
df_golf = golf_data.copy()
lb = LabelBinarizer()

# Re-encode every listed column with the shared binarizer; it is refit on
# each column in turn, in this order.
for column in ('Month', 'Season', 'Temperature', 'Humidity', 'Outlook', 'Crowdedness'):
    df_golf[column] = lb.fit_transform(df_golf[column])

# 'Play' is the target; every other column is a feature.
X = df_golf.drop(columns=['Play'])
y = df_golf['Play']

# Fixed random_state keeps the 70/30 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once.

    The set is cached on the function object so that repeated calls (one per
    row when ``processor`` is used with ``Series.apply``) do not rebuild it.
    """
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def processor(text, stop_words=None):
    """Clean a single piece of text for bag-of-words vectorisation.

    Steps, in order: strip URL-like tokens, drop every character that is not
    an ASCII letter, digit or whitespace, lowercase, collapse whitespace runs,
    and remove stop words.

    Args:
        text: The raw string to clean.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, the NLTK English stop-word list is used (loaded once and
            reused across calls).

    Returns:
        The cleaned text with remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Punctuation is removed before stop-word filtering, so contractions such
    # as "don't" become "dont" at this point.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return ' '.join(word for word in text.split() if word not in stop_words)

# Convert each split of the tweet dataset to a pandas DataFrame.
tweet_train_data, tweet_test_data, tweet_validation_data = (
    tweet_data[split].to_pandas() for split in ('train', 'test', 'validation')
)

# Add a 'cleaned_text' column holding the preprocessed version of 'text'.
for frame in (tweet_train_data, tweet_test_data, tweet_validation_data):
    frame['cleaned_text'] = frame['text'].apply(processor)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tahafaisal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 3. Implementing Naive Bayes from Scratch

## 3.1 Bernoulli Naive Bayes


In [None]:
class BernoulliNaiveBayes:
    """Bernoulli Naive Bayes classifier for 0/1 feature vectors.

    State populated by ``training``:
        classes: sorted array of the distinct labels seen in ``y``.
        class_priors: dict mapping label -> fraction of training rows with
            that label.
        feature_probs: dict mapping label -> array with the smoothed
            probability of each feature being 1 within that class.
    """

    def __init__(self, alpha=1.0):
        """Create an untrained model.

        Args:
            alpha: Additive smoothing strength. The default of 1.0 gives
                Laplace smoothing, ``(count + 1) / (n + 2)``.
        """
        self.alpha = alpha
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = []

    def training(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Args:
            X: 2-D array-like of shape (n_samples, n_features), convertible
                to float.
            y: 1-D array-like of n_samples labels.

        Raises:
            ValueError: If ``X`` is not two-dimensional.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got {X.ndim} dimension(s)"
            )

        # Start from a clean slate so labels from an earlier fit cannot linger.
        self.class_priors = {}
        self.feature_probs = {}
        self.classes = np.unique(y)

        total_count = len(y)
        for label in self.classes:
            X_c = X[y == label]
            class_size = X_c.shape[0]
            self.class_priors[label] = class_size / total_count
            # Two possible outcomes per feature, hence 2 * alpha in the
            # denominator; keeps every probability strictly inside (0, 1).
            self.feature_probs[label] = (X_c.sum(axis=0) + self.alpha) / (
                class_size + 2 * self.alpha
            )

    def predicting(self, X):
        """Predict the most probable class for each row of ``X``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features). A non-empty
                1-D input is treated as a single sample.

        Returns:
            numpy array with one predicted label per sample. Ties go to the
            class that comes first in ``self.classes``.
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return np.array([])
        X = np.atleast_2d(X)

        classes = np.asarray(self.classes)
        log_prior = np.log([self.class_priors[label] for label in classes])
        probs = np.vstack(
            [np.asarray(self.feature_probs[label], dtype=float) for label in classes]
        )

        # The logs depend only on the fitted model, so compute them once
        # instead of once per sample and class.
        log_present = np.log(probs)
        log_absent = np.log(1.0 - probs)

        # scores[s, c] = log P(c) + sum_f [x_f log p_cf + (1 - x_f) log(1 - p_cf)]
        scores = X @ log_present.T + (1.0 - X) @ log_absent.T + log_prior
        return classes[np.argmax(scores, axis=1)]


In [None]:
# Fit the from-scratch Bernoulli model on the training split and score the
# held-out split.
model = BernoulliNaiveBayes()
model.training(X_train.values, y_train.values)
test_predictions = model.predicting(X_test.values)

accuracy_test = accuracy_score(y_test, test_predictions)
# Precision, recall and F1 all use the same binary averaging.
precision_test, recall_test, f1_test = (
    metric(y_test, test_predictions, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_test = confusion_matrix(y_test, test_predictions)

print(
    "Test Set Metrics:",
    f"Accuracy: {accuracy_test:.2f}",
    f"Precision: {precision_test:.2f}",
    f"Recall: {recall_test:.2f}",
    f"F1 Score: {f1_test:.2f}",
    "Confusion Matrix:",
    conf_matrix_test,
    sep="\n",
)

Test Set Metrics:
Accuracy: 0.82
Precision: 0.52
Recall: 0.03
F1 Score: 0.06
Confusion Matrix:
[[1872   13]
 [ 401   14]]


## 3.2 Multinomial Naive Bayes (Manual Implementation)

In [None]:
class BagOfWords:
    """Whitespace-tokenised bag-of-words vectoriser.

    Attributes:
        vocabulary: sorted list of the distinct words seen by ``fit``.
        word_to_index: dict mapping each vocabulary word to its position in
            ``vocabulary`` (and therefore in every produced vector).
    """

    def __init__(self):
        self.vocabulary = []
        self.word_to_index = {}

    def fit(self, corpus):
        """Build the vocabulary and index mapping from an iterable of strings.

        Calling ``fit`` again discards the previous vocabulary and mapping.

        Args:
            corpus: Iterable of sentences; each is split on whitespace.
        """
        unique_words = set()
        for sentence in corpus:
            unique_words.update(sentence.split())

        self.vocabulary = sorted(unique_words)
        # Rebuild the mapping from scratch: keeping entries from an earlier
        # fit would leave indices that no longer match the new vocabulary.
        self.word_to_index = {
            word: idx for idx, word in enumerate(self.vocabulary)
        }

    def vectorize(self, sentence):
        """Return the word-count vector of ``sentence``.

        Args:
            sentence: String to vectorise; split on whitespace.

        Returns:
            A list of ``len(self.vocabulary)`` integers where position ``i``
            counts occurrences of ``self.vocabulary[i]``. Words missing from
            ``word_to_index`` are ignored.
        """
        vector = [0] * len(self.vocabulary)
        for word in sentence.split():
            index = self.word_to_index.get(word)
            if index is not None:
                vector[index] += 1
        return vector

In [None]:
# Sanity check: vectorise one sentence against a hand-written vocabulary.
bow = BagOfWords()
bow.vocabulary = ["the", "cat", "sat", "on", "mat"]
bow.word_to_index.update(
    (word, idx) for idx, word in enumerate(bow.vocabulary)
)
test_sentence = "the cat sat on the mat"
vectorized_sentence = bow.vectorize(test_sentence)
print("Manually set Vocabulary:", bow.vocabulary)
print("Word to Index Mapping:", bow.word_to_index)
print("Vectorized Sentence:", vectorized_sentence)

# Build the real vocabulary from the cleaned training tweets only.
corpus = tweet_train_data['cleaned_text'].tolist()
bow = BagOfWords()
bow.fit(corpus)

# Turn every cleaned tweet of each split into a count vector over that
# vocabulary.
X_train_vectorized = [
    bow.vectorize(sentence) for sentence in tweet_train_data['cleaned_text']
]
X_validation_vectorized = [
    bow.vectorize(sentence) for sentence in tweet_validation_data['cleaned_text']
]
X_test_vectorized = [
    bow.vectorize(sentence) for sentence in tweet_test_data['cleaned_text']
]

# NOTE: y_train / y_test are rebound here to the tweet labels.
y_train, y_validation, y_test = (
    frame['label'].values
    for frame in (tweet_train_data, tweet_validation_data, tweet_test_data)
)


Manually set Vocabulary: ['the', 'cat', 'sat', 'on', 'mat']
Word to Index Mapping: {'the': 0, 'cat': 1, 'sat': 2, 'on': 3, 'mat': 4}
Vectorized Sentence: [2, 1, 1, 1, 1]


### From Scratch

In [None]:
class NaiveBayes:
    """Multinomial Naive Bayes over pre-vectorised word-count rows.

    State populated by ``training``:
        classes: sorted array of the distinct labels seen in ``y``.
        log_prior: dict mapping label -> log of the class frequency.
        log_likelihood: dict mapping label -> array with the log of the
            smoothed probability of each feature index within that class.
        vocabulary: set of the feature indices seen during training.
    """

    def __init__(self, alpha=1.0):
        """Create an untrained model.

        Args:
            alpha: Additive smoothing strength. The default of 1.0 gives
                add-one smoothing, ``(count + 1) / (total + |V|)``.
        """
        self.alpha = alpha
        self.log_prior = {}
        self.log_likelihood = {}
        self.vocabulary = set()
        self.classes = []

    def training(self, X, y):
        """Estimate log priors and per-class log likelihoods.

        Args:
            X: 2-D array-like of shape (n_docs, n_features) holding counts.
            y: 1-D array-like of n_docs labels.

        Raises:
            ValueError: If ``X`` is not two-dimensional.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_docs, n_features); got {X.ndim} dimension(s)"
            )

        n_docs, n_features = X.shape

        # Reset fitted state so a refit cannot keep labels from an earlier fit.
        self.log_prior = {}
        self.log_likelihood = {}
        self.vocabulary = set(range(n_features))
        self.classes = np.unique(y)

        for label in self.classes:
            class_docs = X[y == label]
            self.log_prior[label] = np.log(class_docs.shape[0] / n_docs)

            word_counts = class_docs.sum(axis=0)
            # Additive smoothing: no feature gets zero probability in a class.
            denominator = word_counts.sum() + self.alpha * n_features
            self.log_likelihood[label] = np.log(
                (word_counts + self.alpha) / denominator
            )

    def predicting(self, X):
        """Predict the most probable class for each row of ``X``.

        Args:
            X: 2-D array-like of shape (n_docs, n_features) holding counts.
                A non-empty 1-D input is treated as a single document.
                Columns beyond the number of training features are ignored.

        Returns:
            A list with one predicted label per document. Ties go to the
            class that comes first in ``self.classes``.

        Raises:
            ValueError: If the model has not been trained.
        """
        X = np.asarray(X)
        if X.ndim < 2:
            if X.size == 0:
                return []
            X = X.reshape(1, -1)

        classes = np.asarray(self.classes)
        if classes.size == 0:
            raise ValueError("NaiveBayes.predicting called before training")

        log_prior = np.array([self.log_prior[label] for label in classes])
        log_like = np.vstack([self.log_likelihood[label] for label in classes])

        # Only feature indices known from training contribute to the score.
        n_shared = min(X.shape[1], log_like.shape[1])

        # scores[d, c] = log P(c) + sum_i count_di * log P(i | c), computed
        # for all documents and classes with a single matrix product.
        scores = X[:, :n_shared] @ log_like[:, :n_shared].T + log_prior
        return list(classes[np.argmax(scores, axis=1)])

In [None]:
# Fit the from-scratch multinomial model on the vectorised training tweets
# and score the validation split.
nb = NaiveBayes()
nb.training(X_train_vectorized, y_train)
validation_predictions = nb.predicting(X_validation_vectorized)

accuracy_val = accuracy_score(y_validation, validation_predictions)
# Macro averaging: each class contributes equally to the score.
precision_val, recall_val, f1_val = (
    metric(y_validation, validation_predictions, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_val = confusion_matrix(y_validation, validation_predictions)

print(
    "Validation Set Metrics:",
    f"Accuracy: {accuracy_val:.2f}",
    f"Precision: {precision_val:.2f}",
    f"Recall: {recall_val:.2f}",
    f"F1 Score: {f1_val:.2f}",
    "Confusion Matrix:",
    conf_matrix_val,
    sep="\n",
)


Validation Set Metrics:
Accuracy: 0.65
Precision: 0.75
Recall: 0.52
F1 Score: 0.54
Confusion Matrix:
[[141   7   0  12]
 [ 38  44   0  15]
 [ 15   2   4   7]
 [ 29   6   0  54]]


In [None]:
# Score the already-trained from-scratch model on the test split.
test_predictions = nb.predicting(X_test_vectorized)

accuracy_test = accuracy_score(y_test, test_predictions)
# Macro averaging: each class contributes equally to the score.
precision_test, recall_test, f1_test = (
    metric(y_test, test_predictions, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_test = confusion_matrix(y_test, test_predictions)

print(
    "Test Set Metrics:",
    f"Accuracy: {accuracy_test:.2f}",
    f"Precision: {precision_test:.2f}",
    f"Recall: {recall_test:.2f}",
    f"F1 Score: {f1_test:.2f}",
    "Confusion Matrix:",
    conf_matrix_test,
    sep="\n",
)

Test Set Metrics:
Accuracy: 0.65
Precision: 0.68
Recall: 0.53
F1 Score: 0.54
Confusion Matrix:
[[501  19   2  36]
 [120 173   2  63]
 [ 74  12  14  23]
 [120  20   3 239]]


## 4. Implementing Naive Bayes using sklearn

In [116]:
from sklearn.naive_bayes import MultinomialNB

# Reference run: scikit-learn's MultinomialNB on the same vectorised tweets.
mnb = MultinomialNB()
mnb.fit(X_train_vectorized, y_train)
mnb_test_predictions = mnb.predict(X_test_vectorized)

accuracy_mnb = accuracy_score(y_test, mnb_test_predictions)
# Macro averaging: each class contributes equally to the score.
precision_mnb, recall_mnb, f1_mnb = (
    metric(y_test, mnb_test_predictions, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_mnb = confusion_matrix(y_test, mnb_test_predictions)

print(
    "Multinomial Naive Bayes (Tweet Data) Metrics on Test Set:",
    f"Accuracy: {accuracy_mnb:.2f}",
    f"Precision: {precision_mnb:.2f}",
    f"Recall: {recall_mnb:.2f}",
    f"F1 Score: {f1_mnb:.2f}",
    "Confusion Matrix:",
    conf_matrix_mnb,
    sep="\n",
)


Multinomial Naive Bayes (Tweet Data) Metrics on Test Set:
Accuracy: 0.65
Precision: 0.68
Recall: 0.53
F1 Score: 0.54
Confusion Matrix:
[[501  19   2  36]
 [120 173   2  63]
 [ 74  12  14  23]
 [120  20   3 239]]


In [None]:
from sklearn.naive_bayes import BernoulliNB

# Rebuild the golf split: y_train / y_test were rebound to the tweet labels
# in an earlier cell, so the golf features and target are split again here
# with the same test_size and random_state as before.
X = df_golf.drop(columns=['Play'])
y = df_golf['Play']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Reference run: scikit-learn's BernoulliNB on the golf data.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
bnb_test_predictions = bnb.predict(X_test)

accuracy_bnb = accuracy_score(y_test, bnb_test_predictions)
# Precision, recall and F1 all use the same binary averaging.
precision_bnb, recall_bnb, f1_bnb = (
    metric(y_test, bnb_test_predictions, average='binary')
    for metric in (precision_score, recall_score, f1_score)
)
conf_matrix_bnb = confusion_matrix(y_test, bnb_test_predictions)

print(
    "Bernoulli Naive Bayes (Golf Data) Metrics on Test Set:",
    f"Accuracy: {accuracy_bnb:.2f}",
    f"Precision: {precision_bnb:.2f}",
    f"Recall: {recall_bnb:.2f}",
    f"F1 Score: {f1_bnb:.2f}",
    "Confusion Matrix:",
    conf_matrix_bnb,
    sep="\n",
)

Bernoulli Naive Bayes (Golf Data) Metrics on Test Set:
Accuracy: 0.82
Precision: 0.52
Recall: 0.03
F1 Score: 0.06
Confusion Matrix:
[[1872   13]
 [ 401   14]]
