# Task:
The task is to estimate appropriate parameters using the training data, and use it to predict reviews from the test data, and classify each of them as either positive or negative.

In [65]:
import pandas as pd
import numpy as np
import os
import re
import math
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import itertools
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
def clean(text):
    no_html = BeautifulSoup(text).get_text()
    clean = re.sub("[^a-z\s]+"," ", no_html, flags=re.IGNORECASE)
    return re.sub("(\s+)", " ", clean)

def tokenize(text):
    cleaned = clean(text).lower()
    stopwords_en = set(stopwords.words('english'))
    return [w for w in re.split('\W+', cleaned) if not w in stopwords_en]

In [4]:
path = 'C:\\Users\\Owner\\Dev\\Python\\Data Mining\\ProgrammingQuizData\\aclImdb'

In [5]:
test_path = os.path.join(path, 'test')
train_path = os.path.join(path, 'train')
pos_test = os.path.join(test_path, 'pos')
neg_test = os.path.join(test_path, 'neg')
pos_train = os.path.join(train_path, 'pos')
neg_train = os.path.join(train_path, 'neg')

So the first task is to go through all the files in the ‘train’ folder, and construct the vocabulary V of all unique words. Please ignore all the stop-words. The words from each file (both in training and testing phase) must be extracted by splitting the raw text only with whitespace characters and converting them to lowercase characters. 

In [34]:
pos_train_list = []
for file in os.listdir(pos_train):
    file_path = os.path.join(pos_train, file)
    with open(file_path, 'r', encoding='utf8') as n:
        line = n.read()
        pos_train_list.extend(tokenize(line))

In [31]:
neg_train_list = []
for file in os.listdir(pos_train):
    file_path = os.path.join(pos_train, file)
    with open(file_path, 'r', encoding='utf8') as n:
        line = n.read()
        neg_train_list.extend(tokenize(line))

In [32]:
pos_test_list = []
for file in os.listdir(pos_train):
    file_path = os.path.join(pos_train, file)
    with open(file_path, 'r', encoding='utf8') as n:
        line = n.read()
        pos_test_list.extend(tokenize(line))

In [33]:
neg_test_list = []
for file in os.listdir(pos_train):
    file_path = os.path.join(pos_train, file)
    with open(file_path, 'r', encoding='utf8') as n:
        line = n.read()
        neg_test_list.extend(tokenize(line))

In [37]:
vocab_list = list(itertools.chain(pos_test_list,neg_test_list,pos_train_list,neg_train_list))
unique_vocab_list = set(vocab_list)

In [39]:
unique_list = set(vocab_list)
len(unique_list)

54991

The next step is to get counts of each individual words for the positive and the negative classes
separately, to get P(word|class).

In [73]:
#counts per values
pos_dict_counts = Counter(pos_train_list)

In [74]:
#counts per value
neg_dict_counts = Counter(neg_train_list)

Finding the log-posterior (un-normalized), which is given by log(P(X|Y )P(Y )), for both the classes with laplace smoothing.

In [70]:
pos_logs = {}
for c, data in pos_dict_counts.items():
    logs[c] = math.log(data + 1 / (len(pos_train_list) + 1) +
                       len(pos_train_list)/(len(pos_train_list)+len(neg_train_list)))
neg_logs = {}
for c, data in neg_dict_counts.items():
    logs[c] = math.log(data + 1 / (len(neg_train_list) + 1) + 
                       len(neg_train_list)/(len(pos_train_list)+len(neg_train_list)))