# Naive Bayes Classifier

In [1]:
# NLTK supplies the movie-review corpus that the rest of the notebook trains on.
import nltk
from nltk.corpus import movie_reviews
# Fetch the "movie_reviews" package; being the last expression of the cell,
# the call's return value is what the cell displays.
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [2]:
from collections import defaultdict, Counter
import math
import random

# Token lists (X) and polarity labels (Y) for the two splits.
train_X, train_Y = [], []
test_X, test_Y = [], []

# Fixed seed so the random split below is the same on every run.
random.seed(0)
for polarity in movie_reviews.categories():
    for fid in movie_reviews.fileids(polarity):
        # One draw per review: a 0 out of {0, 1, 2, 3, 4} sends it to the test split.
        goes_to_test = random.randrange(5) == 0
        words = list(movie_reviews.words(fid))
        if goes_to_test:
            bucket_X, bucket_Y = test_X, test_Y
        else:
            bucket_X, bucket_Y = train_X, train_Y
        bucket_X.append(words)
        bucket_Y.append(polarity)

# Show the first training example together with its label.
print(train_X[0], train_Y[0])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch', 'the', 'movie', 'and', '"', 'sorta', '"', 'find', 'out', '.', '.', '.', 'critique', ':', 'a', 'mind', '-', 'fuck', 'movie', 'for', 'the', 'teen', 'generation', 'that', 'touches', 'on', 'a', 'very', 'cool', 'idea', ',', 'but', 'presents', 'it', 'in', 'a', 'very', 'bad', 'package', '.', 'which', 'is', 'what', 'makes', 'this', 'review', 'an', 'even', 'harder', 'one', 'to', 'write', ',', 'since', 'i', 'generally', 'applaud', 'films', 'which', 'attempt', 'to', 'break', 'the', 'mold', ',', 'mess', 'with', 'your', 'head', 'and', 'such', '(', 'lost', 'highway', '&', 'memento', ')', ',', 'but', 'there', 'are', 'good', 'and', 'ba

## Model Construction

$\bar{y} = \text{arg}\max_{y \in \mathbf{y}} P(y|x) = \text{arg}\max_{y \in \mathbf{y}} P(y) \prod_{i=1}^n \frac{P(x_i|y)}{P(x_i)} = \text{arg}\max_{y \in \mathbf{y}} P(y) \prod_{i=1}^n P(x_i|y)$

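$P(x_i|y)=\frac{C(x_i, y) + k}{C(y) + 2k}$

Here $C(x_i, y)$ is the number of class-$y$ training documents that contain word $x_i$ and $C(y)$ is the number of class-$y$ training documents. The $2k$ in the denominator adds $k$ pseudo-counts for each of the two outcomes of a binary feature (word present / word absent); with the two classes used in this notebook it coincides with $|\mathbf{y}| \times k$.

$\bar{y} = \textrm{arg} \max_{y \in \mathbf{y}} \log P(y) + \sum_{i=1}^n \log \frac{C(x_i, y) + k}{C(y) + 2k}$

The sum above runs over the vocabulary words that occur in the document; for every vocabulary word that does not occur, the implementation below adds $\log\left(1 - P(x_i|y)\right)$ instead.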

     

In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download the "wordnet" package; the classifier below builds a WordNetLemmatizer.
nltk.download('wordnet')
# spaCy is used by the classifier for its English model and stop-word list.
import spacy


class NaiveBayesClassifier:
    """Bernoulli naive Bayes classifier over word-presence features.

    Every vocabulary word is treated as a binary feature (present / absent in
    a document). ``train`` accumulates per-class document counts and
    ``predict`` scores a document by summing log-probabilities over the whole
    vocabulary, using the "absent" probability for words the document lacks.

    Args:
        k: Additive smoothing constant. It must be positive; with ``k == 0`` a
            probability of exactly 0 or 1 can reach ``math.log`` in
            ``predict`` and raise ``ValueError``.
        min_word_freq: Minimum number of occurrences (summed over all training
            documents) a word needs in order to be kept as a feature.
    """

    def __init__(self, k=0.9, min_word_freq=32):
        # k is the smoothing constant.
        self.k = k
        self.min_word_freq = min_word_freq
        # Vocabulary of words used as features (filled by train()).
        self.features = set()
        # label -> Counter(word -> number of documents of that label containing it).
        # defaultdict creates an empty Counter the first time a label is seen.
        self.class_feature_counts = defaultdict(Counter)
        # label -> number of training documents with that label.
        self.class_counts = Counter()
        # Total number of training documents.
        self.total = 0
        # spaCy English pipeline; not used by any method of this class, kept so
        # that the attribute stays available to existing callers.
        self.nlp = spacy.load('en_core_web_sm')
        # Stop words and punctuation. The spaCy set is copied first: adding the
        # punctuation to STOP_WORDS itself would modify spaCy's shared,
        # module-level set for every other user in the process.
        sign_list = [";", ",", ".", "~", ":", "-", "(", ")", "%", "#", "$", "!", "/", "?", "=", "+", "&", "--", "'", '"', '`']
        self.stop_words = set(spacy.lang.en.stop_words.STOP_WORDS)
        self.stop_words.update(sign_list)
        # Lemmatizer and stemmer.
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        # Word frequency counts over all training documents.
        self.word_counts = Counter()

    def preprocess(self, tokens):
        """Lower-case and lemmatize every token.

        Stop-word removal and stemming are currently not applied, although
        ``self.stop_words`` and ``self.stemmer`` are available.

        Args:
            tokens: Iterable of string tokens.

        Returns:
            A list with one normalized token per input token.
        """
        return [self.lemmatizer.lemmatize(token.lower()) for token in tokens]

    def train(self, train_X, train_Y):
        """Accumulate counts from labelled documents and rebuild the vocabulary.

        Args:
            train_X: Iterable of documents, each an iterable of string tokens.
            train_Y: Iterable of labels (e.g. "neg" / "pos"), aligned with
                ``train_X``.
        """
        for tokens, label in zip(train_X, train_Y):
            tokens = self.preprocess(tokens)
            # One more document for this class and for the overall total.
            self.class_counts[label] += 1
            self.total += 1

            # Raw occurrence counts drive the frequency filter below.
            self.word_counts.update(tokens)
            # Presence counts: each distinct word counts once per document.
            self.class_feature_counts[label].update(set(tokens))

        # Frequency filter: keep only words that occur often enough.
        self.features = {
            word for word, count in self.word_counts.items()
            if count >= self.min_word_freq
        }

    def probabilities(self, token):
        """Return the smoothed P(token present | class) for every known class.

        The denominator adds ``2 * k``: one ``k`` for each outcome of the
        binary feature (present / absent). This keeps every estimate strictly
        between 0 and 1 for ``k > 0`` regardless of how many classes exist,
        and equals ``number_of_classes * k`` in the two-class setting.

        Args:
            token: A (preprocessed) word.

        Returns:
            dict mapping each class label to its smoothed probability.
        """
        smoothing_mass = 2 * self.k
        return {
            cls: (self.class_feature_counts[cls][token] + self.k) / (cls_cnt + smoothing_mass)
            for cls, cls_cnt in self.class_counts.items()
        }

    def predict(self, tokens):
        """Classify a tokenized document.

        Args:
            tokens: Iterable of string tokens.

        Returns:
            Tuple ``(best_label, log_probs)`` where ``log_probs`` is a Counter
            mapping every class to its unnormalized log-score.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if not self.class_counts:
            raise ValueError("predict() called before train(): no classes have been seen")

        # Build the set after preprocessing so the membership test below is a
        # constant-time lookup instead of a list scan per feature.
        present = set(self.preprocess(tokens))

        log_probs = Counter()
        for cls, cls_cnt in self.class_counts.items():
            log_probs[cls] = math.log(cls_cnt / self.total)

        for token in self.features:
            probs = self.probabilities(token)
            is_present = token in present
            for cls, prob in probs.items():
                # Absent words contribute the probability of being absent.
                log_probs[cls] += math.log(prob if is_present else 1.0 - prob)

        # Return the argmax of log_probs and all log_probs
        return max(log_probs, key=log_probs.get), log_probs

[nltk_data] Downloading package wordnet to /root/nltk_data...


## Using the Model

In [4]:
# Build a classifier with its default settings (k=0.9, min_word_freq=32)
# and fit it on the training split.
model = NaiveBayesClassifier()
model.train(train_X, train_Y)

In [7]:
from nltk.tokenize import word_tokenize
# Download the "punkt" package before calling word_tokenize below.
nltk.download('punkt')

# Taken from https://www.imdb.com/review/rw0990793/?ref_=tt_urv
review = """A whimsical, often spectacular view of a future in which advances in technology dominate the world. It is well shot and although slow-moving it is intense and enjoyable throughout. The featuring of classical music to establish atmosphere works brilliantly; it provides a feeling of awe, mystery and intrigue  the same aura that Walt Disney worked in creating 'Fantasia'. The special effects, both sound and visual, are still spellbinding by the standards of today's technology. Aside from the technical pluses of the film, it stands strong as it is one of not many films out there that has something important to say about humankind, and where the human race is heading in terms of our increasing reliance on machines and our unquenchable thirst to discover. Despite an ending that is hard to understand, it is even harder to overlook this film a true cinema classic."""

# Tokenize the raw text and classify it; the cell displays the
# (predicted label, per-class log-scores) tuple returned by predict().
model.predict(word_tokenize(review))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


('pos', Counter({'neg': -433.239532854839, 'pos': -426.4539894608389}))

In [8]:
# One boolean per held-out review: did the predicted label match the gold label?
hits = [model.predict(tokens)[0] == gold for tokens, gold in zip(test_X, test_Y)]
correct, total = sum(hits), len(hits)

# Report accuracy as "correct / total = ratio".
print("%d / %d = %g" % (correct, total, correct / total))

362 / 422 = 0.85782


## Exploring important features

In [6]:
def prob_class_given_feature(feature, cls, model):
    """Return ``cls``'s share of the per-class feature probabilities.

    The value is ``model.probabilities(feature)[cls]`` divided by the sum of
    that mapping over all classes; class priors are not taken into account.

    Args:
        feature: The word to look up.
        cls: The class label whose share is wanted.
        model: Object exposing ``probabilities(feature) -> {label: probability}``.

    Returns:
        float in [0, 1] when the per-class probabilities are non-negative.
    """
    per_class = model.probabilities(feature)
    normaliser = sum(per_class.values())
    return per_class[cls] / normaliser

# For each polarity, list the 30 features whose probability mass leans most towards it.
for target_cls in ("pos", "neg"):
    ranked = sorted(
        model.features,
        key=lambda t: prob_class_given_feature(t, target_cls, model),
        reverse=True,
    )
    print(ranked[:30])

['outstanding', 'gattaca', 'mulan', 'wonderfully', 'refreshing', 'finest', 'guido', 'ordell', 'german', 'damon', 'coen', 'jedi', 'breathtaking', 'beautifully', 'religion', 'obi', 'lebowski', 'fashioned', 'superb', 'anger', 'politics', 'ordinary', 'shrek', 'fargo', 'flawless', 'homer', 'flynt', 'pulp', 'era', 'controversial']
['schumacher', 'ludicrous', 'turkey', 'henstridge', 'welles', 'poorly', 'krippendorf', 'seagal', 'idiotic', 'uninspired', 'inept', 'uninteresting', 'sat', 'awful', 'unfunny', 'waste', 'wasted', 'natasha', 'lame', 'idiot', 'ridiculous', 'worst', 'pointless', 'stupid', 'random', 'bland', 'badly', 'mess', 'alicia', 'jolie']
