In [1]:
import numpy as np
from tensorflow import keras

(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=500)

word_index = keras.datasets.imdb.get_word_index()
index2word = dict((i + 3, word) for (word, i) in word_index.items())
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'
x_train = np.array([' '.join([index2word[idx] for idx in text]) for text in x_train])
x_test = np.array([' '.join([index2word[idx] for idx in text]) for text in x_test])

2022-01-07 20:03:50.803601: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-07 20:03:50.803654: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
vocabulary = list()
for text in x_train:
  tokens = text.split()
  vocabulary.extend(tokens)

vocabulary = set(vocabulary)
print(len(vocabulary))

498


In [3]:
from tqdm import tqdm

x_train_binary = list()
x_test_binary = list()

for text in tqdm(x_train):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_train_binary.append(binary_vector)

x_train_binary = np.array(x_train_binary)

for text in tqdm(x_test):
  tokens = text.split()
  binary_vector = list()
  for vocab_token in vocabulary:
    if vocab_token in tokens:
      binary_vector.append(1)
    else:
      binary_vector.append(0)
  x_test_binary.append(binary_vector)

x_test_binary = np.array(x_test_binary)

100%|██████████| 25000/25000 [00:39<00:00, 638.36it/s]
100%|██████████| 25000/25000 [00:37<00:00, 660.66it/s]


### testing

In [4]:
prior_positive = np.sum(y_train)/len(y_train)
prior_negative = 1 - prior_positive

In [5]:
def predict():
    predictions = list()
    for idx1, correct in enumerate(y_test):
        positive_prediction = prior_positive
        negative_prediction = prior_negative
        for idx2, word in enumerate(x_test_binary[idx1]):
            if word:
                positive_prediction *= positive_frequencies[idx2]
                negative_prediction *= negative_frequencies[idx2]
            else:
                positive_prediction *= (1 - positive_frequencies[idx2])
                negative_prediction *= (1 - negative_frequencies[idx2])

        if positive_prediction >= negative_prediction:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions

In [6]:
positive_count = np.zeros(len(x_train_binary[0]))
negative_count = np.zeros(len(x_train_binary[0]))

counter = 0
for review, label in zip(x_train_binary, y_train):
    for idx, word in enumerate(review):
        if label == 1:
            positive_count[idx] += word
        else:
            negative_count[idx] += word

positive_count[:5], negative_count[:5]

(array([1131., 1438.,  987.,  717.,  535.]),
 array([1510., 1497.,  951., 1085.,  420.]))

In [7]:
positive_frequencies = np.array([x/(len(x_train_binary)*prior_positive) for x in positive_count])
negative_frequencies = np.array([x/(len(x_train_binary)*prior_negative) for x in negative_count])

positive_frequencies[:5], negative_frequencies[:5]

(array([0.09048, 0.11504, 0.07896, 0.05736, 0.0428 ]),
 array([0.1208 , 0.11976, 0.07608, 0.0868 , 0.0336 ]))

In [8]:
# predictions = list()
# for review, correct in zip(x_test_binary, y_test):
#     positive_prediction = prior_positive
#     negative_prediction = prior_negative

#     present_indices = set(idx2 for idx2, word in enumerate(review) if word)
#     for idx in range(max(len(positive_frequencies), len(negative_frequencies))):
#         if idx in present_indices:
#             positive_prediction *= positive_frequencies[idx]
#             negative_prediction *= negative_frequencies[idx]
#         else:
#             positive_prediction *= (1 - positive_frequencies[idx])
#             negative_prediction *= (1 - negative_frequencies[idx])

#     if positive_prediction >= negative_prediction:
#         predictions.append(1)
#     else:
#         predictions.append(0)

In [9]:
from sklearn.naive_bayes import BernoulliNB

nb = BernoulliNB()
nb.fit(x_train_binary, y_train)

BernoulliNB()

In [10]:
from sklearn.metrics import classification_report
print(classification_report(y_test, nb.predict(x_test_binary)))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77     12500
           1       0.76      0.82      0.79     12500

    accuracy                           0.78     25000
   macro avg       0.78      0.78      0.78     25000
weighted avg       0.78      0.78      0.78     25000



In [11]:
print(classification_report(y_test, predict()))

              precision    recall  f1-score   support

           0       0.80      0.74      0.77     12500
           1       0.76      0.82      0.79     12500

    accuracy                           0.78     25000
   macro avg       0.78      0.78      0.78     25000
weighted avg       0.78      0.78      0.78     25000

