In [None]:
import numpy as np
from tensorflow import keras

# Vocabulary cap handed to Keras. Per the Keras IMDB loader documentation,
# only the NUM_WORDS most frequent words keep their own id and rarer words
# are loaded as the out-of-vocabulary id.
NUM_WORDS = 2000

(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=NUM_WORDS)

# get_word_index() maps word -> integer rank. The lookup below shifts every
# rank by 3 so that ids 0, 1 and 2 stay free for the special tokens.
word_index = keras.datasets.imdb.get_word_index()
index2word = {rank + 3: word for word, rank in word_index.items()}
index2word[0] = '[pad]'
index2word[1] = '[bos]'
index2word[2] = '[oov]'


def decode_reviews(encoded_reviews):
    """Convert sequences of word ids back into space-separated text.

    Parameters
    ----------
    encoded_reviews : sequence of sequences of int
        One id sequence per review; every id must be a key of ``index2word``.

    Returns
    -------
    numpy.ndarray
        1-D array of dtype ``object`` holding one Python ``str`` per review.

    Raises
    ------
    KeyError
        If a review contains an id that is missing from ``index2word``.
    """
    # An object array stores references to the strings. NumPy's default for a
    # list of str is a fixed-width unicode dtype sized to the longest string,
    # which would pad every review to the length of the longest review.
    decoded = np.empty(len(encoded_reviews), dtype=object)
    for position, ids in enumerate(encoded_reviews):
        decoded[position] = ' '.join(index2word[idx] for idx in ids)
    return decoded


x_train = decode_reviews(x_train)
x_test = decode_reviews(x_test)

In [None]:
# Gather every distinct whitespace-separated token from the training reviews.
# Updating a set directly avoids first building a list with one entry per
# token occurrence in the whole corpus.
vocabulary_tokens = set()
for text in x_train:
    vocabulary_tokens.update(text.split())

# Sort so that iteration order (and therefore any feature-column order derived
# from it) is the same on every run. The iteration order of a set of strings
# can change between interpreter sessions because of hash randomisation.
vocabulary = sorted(vocabulary_tokens)
print(len(vocabulary))

In [None]:
from tqdm import tqdm

def binarize_reviews(texts, vocab):
    """Encode each text as a 0/1 vector marking which vocabulary tokens it contains.

    Parameters
    ----------
    texts : iterable of str
        Reviews; each one is split on whitespace into tokens.
    vocab : iterable of str
        Vocabulary tokens. Column ``j`` of the result corresponds to the
        ``j``-th token produced by iterating over ``vocab``.

    Returns
    -------
    numpy.ndarray
        Array with one row per text and one column per vocabulary token,
        holding 1 where the token occurs in the text and 0 otherwise.
    """
    # Freeze the iteration order once so every row uses the same columns.
    vocab_tokens = list(vocab)
    rows = list()
    for text in tqdm(texts):
        # Set membership is O(1); testing against the raw token list would
        # rescan the whole review once per vocabulary token.
        present = set(text.split())
        rows.append([1 if token in present else 0 for token in vocab_tokens])
    return np.array(rows)


x_train_binary = binarize_reviews(x_train, vocabulary)
x_test_binary = binarize_reviews(x_test, vocabulary)

### testing

In [None]:
# Class priors from the training labels: the sum of the labels divided by the
# number of labels (i.e. the fraction of positive reviews when labels are 0/1),
# and its complement for the other class.
n_training_reviews = len(y_train)
label_total = np.sum(y_train)
prior_positive = label_total / n_training_reviews
prior_negative = 1 - prior_positive

In [None]:
# Per-token totals for each class: entry j is the sum of feature column j over
# the training reviews of that class. Reviews labelled 1 go to the positive
# totals, every other review to the negative totals.
train_features = np.asarray(x_train_binary)
is_positive = np.asarray(y_train) == 1

# Column sums over the masked rows replace an element-by-element Python loop.
# The cast keeps the float dtype the counts had when accumulated into np.zeros.
positive_count = train_features[is_positive].sum(axis=0).astype(float)
negative_count = train_features[~is_positive].sum(axis=0).astype(float)

positive_count[:5], negative_count[:5]

In [None]:
# Additive (Laplace) smoothing strength. With alpha = 1 every token gets one
# pseudo "present" and one pseudo "absent" observation per class, so no
# estimate is exactly 0 or 1 and a single token can never zero out a whole
# class score. 1.0 is also the documented default of scikit-learn's
# BernoulliNB, which this notebook fits for comparison.
SMOOTHING_ALPHA = 1.0

# Number of training reviews in each class (label 1 vs. everything else).
n_positive_docs = np.count_nonzero(np.asarray(y_train) == 1)
n_negative_docs = len(y_train) - n_positive_docs

# Smoothed estimate of P(token present | class), computed for all tokens at
# once. The "+ 2 * alpha" accounts for the two possible outcomes per token and
# keeps the denominator non-zero even for a class with no training reviews.
positive_frequencies = (np.asarray(positive_count) + SMOOTHING_ALPHA) / (n_positive_docs + 2 * SMOOTHING_ALPHA)
negative_frequencies = (np.asarray(negative_count) + SMOOTHING_ALPHA) / (n_negative_docs + 2 * SMOOTHING_ALPHA)

positive_frequencies[:5], negative_frequencies[:5]

In [None]:
# Bernoulli naive Bayes decision rule, evaluated in log space.
#
# Multiplying one probability per vocabulary token underflows float64 for
# long reviews, which would turn both class scores into 0.0 and make the
# ">=" tie-break always predict class 1. Summing logarithms keeps the scores
# comparable. A probability of exactly 0 becomes -inf, which orders the same
# way the zero product did, so the divide-by-zero warning is silenced.
with np.errstate(divide='ignore'):
    log_prior_positive = np.log(prior_positive)
    log_prior_negative = np.log(prior_negative)
    # log P(token present | class) and log P(token absent | class)
    log_present_positive = np.log(positive_frequencies)
    log_absent_positive = np.log1p(-np.asarray(positive_frequencies))
    log_present_negative = np.log(negative_frequencies)
    log_absent_negative = np.log1p(-np.asarray(negative_frequencies))

predictions = list()
for review in x_test_binary:
    # Non-zero entries mark tokens that occur in this review.
    is_present = np.asarray(review, dtype=bool)

    # Pick the "present" or "absent" log-likelihood per token, then sum.
    positive_score = log_prior_positive + np.where(
        is_present, log_present_positive, log_absent_positive).sum()
    negative_score = log_prior_negative + np.where(
        is_present, log_present_negative, log_absent_negative).sum()

    # Ties go to the positive class.
    if positive_score >= negative_score:
        predictions.append(1)
    else:
        predictions.append(0)

In [None]:
# predictions = list()
# for review, correct in zip(x_test_binary, y_test):
#     positive_prediction = prior_positive
#     negative_prediction = prior_negative

#     present_indices = set(idx2 for idx2, word in enumerate(review) if word)
#     for idx in range(max(len(positive_frequencies), len(negative_frequencies))):
#         if idx in present_indices:
#             positive_prediction *= positive_frequencies[idx]
#             negative_prediction *= negative_frequencies[idx]
#         else:
#             positive_prediction *= (1 - positive_frequencies[idx])
#             negative_prediction *= (1 - negative_frequencies[idx])

#     if positive_prediction >= negative_prediction:
#         predictions.append(1)
#     else:
#         predictions.append(0)

In [None]:
# Column indices of the vocabulary tokens that occur in the last test review.
# The only other assignment of this name in the notebook sits inside
# commented-out code, so without this line the cell raises NameError on a
# fresh kernel. The last review is used because that is the value the
# commented-out loop would have left behind.
present_indices = set(np.flatnonzero(x_test_binary[-1]).tolist())
present_indices

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Reference model: scikit-learn's Bernoulli naive Bayes with default settings,
# trained on the same binary features and labels as the hand-written version.
nb = BernoulliNB()
nb.fit(X=x_train_binary, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the scikit-learn model on the test reviews and print its report.
sklearn_predictions = nb.predict(x_test_binary)
sklearn_report = classification_report(y_test, sklearn_predictions)
print(sklearn_report)

In [None]:
# Same report for the predictions of the hand-written classifier.
handwritten_report = classification_report(y_true=y_test, y_pred=predictions)
print(handwritten_report)