# Text Classification using Naive Bayes

In [1]:
%matplotlib inline
import bs4
import nltk
import tqdm
import os
from collections import Counter
from decimal import Decimal

# For Train

## Data Load

In [2]:
TRAIN_DIR = "aclImdb/train"

train_pos = []
train_neg = []
# get positive reviews
for filename in tqdm.tqdm(os.listdir(os.path.join(TRAIN_DIR, "pos"))):
    with open(os.path.join(TRAIN_DIR, "pos", filename), "rt", encoding="utf8") as f:
        train_pos.append(f.read())
        
# get negative reviews
for filename in tqdm.tqdm(os.listdir(os.path.join(TRAIN_DIR, "neg"))):
    with open(os.path.join(TRAIN_DIR, "neg", filename), "rt", encoding="utf8") as f:
        train_neg.append(f.read())

100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 13729.20it/s]
100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 13595.17it/s]


## Tokenize & Count

In [3]:
pos_cnt = Counter()
neg_cnt = Counter()
for txt in tqdm.tqdm(train_pos):
    words = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(txt))]
    pos_cnt.update(words)
    
for txt in tqdm.tqdm(train_neg):
    words = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(txt))]
    neg_cnt.update(words)

100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [02:34<00:00, 80.98it/s]
100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [02:29<00:00, 83.54it/s]


In [4]:
vocab = set()
vocab.update(list(pos_cnt.keys()))
vocab.update(list(neg_cnt.keys()))
V_size = len(vocab)
print("Vocab Size: {:,}".format(V_size))

pos_wc = 0
for k, v in pos_cnt.items():
    pos_wc += v
print("Positive Word Count: {:,}".format(pos_wc))
    
neg_wc = 0
for k, v in neg_cnt.items():
    neg_wc += v
print("Negative Word Count: {:,}".format(neg_wc))

Vocab Size: 134,957
Positive Word Count: 3,556,186
Negative Word Count: 3,500,346


# For Test

## Data Load

In [5]:
TEST_DIR = "aclImdb/test"

test_pos = []
test_neg = []
# get positive reviews
for filename in tqdm.tqdm(os.listdir(os.path.join(TEST_DIR, "pos"))):
    with open(os.path.join(TEST_DIR, "pos", filename), "rt", encoding="utf8") as f:
        test_pos.append(f.read())
        
# get negative reviews
for filename in tqdm.tqdm(os.listdir(os.path.join(TEST_DIR, "neg"))):
    with open(os.path.join(TEST_DIR, "neg", filename), "rt", encoding="utf8") as f:
        test_neg.append(f.read())

100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 13726.67it/s]
100%|█████████████████████████████████████████████████████████████████████████| 12500/12500 [00:00<00:00, 13730.31it/s]


## Calculate Naive Bayes

In [6]:
pos_answer = 0
for txt in tqdm.tqdm(test_pos):
    cnt = Counter()
    cnt.update([word for word, pos in nltk.pos_tag(nltk.word_tokenize(txt))])
    pos_acc = Decimal(1.)
    for k, v in cnt.items():
        pos_acc *= Decimal(((pos_cnt[k] + 1) / (pos_wc + V_size + 1)))
    neg_acc = Decimal(1.)
    for k, v in cnt.items():
        neg_acc *= Decimal((neg_cnt[k] + 1) / (neg_wc + V_size + 1))
    
    if pos_acc > neg_acc:
        pos_answer += 1
print("{} / 12,500 ({}%)".format(pos_answer, round(pos_answer/12500*100, 2)))
  
neg_answer = 0
for txt in tqdm.tqdm(test_neg):
    cnt = Counter()
    cnt.update([word for word, pos in nltk.pos_tag(nltk.word_tokenize(txt))])
    pos_acc = Decimal(1.)
    for k, v in cnt.items():
        pos_acc *= Decimal(((pos_cnt[k] + 1) / (pos_wc + V_size + 1)))
    
    neg_acc = Decimal(1.)
    for k, v in cnt.items():
        neg_acc *= Decimal((neg_cnt[k] + 1) / (neg_wc + V_size + 1))
    
    if pos_acc < neg_acc:
        neg_answer += 1
        
print("{} / 12,500 ({}%)".format(neg_answer, round(neg_answer/12500*100, 2)))

100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [02:39<00:00, 78.61it/s]


9530 / 12,500 (76.24%)


100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [02:37<00:00, 58.94it/s]


11180 / 12,500 (89.44%)
