In [1]:
import os.path as op
import numpy as np
import string

from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
# Load data
print("Loading dataset")

from glob import glob
filenames_neg = sorted(glob(op.join('data', 'imdb1', 'neg', '*.txt')))
filenames_pos = sorted(glob(op.join('data', 'imdb1', 'pos', '*.txt')))

texts_neg = [open(f).read() for f in filenames_neg]
texts_pos = [open(f).read() for f in filenames_pos]
texts = texts_neg + texts_pos
y = np.ones(len(texts), dtype=np.int)
y[:len(texts_neg)] = 0.

print("%d documents" % len(texts))

Loading dataset
2000 documents


In [3]:
def count_words(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    words = set()
    vocabulary = {}
    table = str.maketrans({key:" " for key in string.punctuation})
    i = 0
    j = 0

    for text in texts:
        word_list = text.translate(table).lower().split(" ")
        for word in word_list:
            if word not in words:
                words.add(word)
                vocabulary[word] = j
                j += 1

    n_features = len(words)
    counts = np.zeros((len(texts), n_features))

    for text in texts:
        word_list = text.translate(table).lower().split(" ")
        for word in word_list:
            if word in words:
                counts[i][vocabulary[word]] += 1
        i += 1

    return vocabulary, counts

In [4]:
class NB(BaseEstimator, ClassifierMixin):
    def __init__(self, vocabulary):
        self.vocabulary = vocabulary
        self.prior = np.zeros((2))
        self.condprob = None
        self.scores = None

    def fit(self, X, y):
        self.condprob = np.zeros((2, X.shape[1]))
        for c in [0,1]:
            self.prior[c] = X[y==c].shape[0] / X.shape[0]
            self.condprob[c,:] = ((np.sum(X[y==c], axis=0) +1)
                / np.sum(np.sum(X[y==c], axis=1)+1))
        return self.vocabulary, self.prior, self.condprob


    def predict(self, X):
        self.scores = np.zeros((X.shape[0], self.prior.shape[0]))
        self.scores += np.log(self.prior)
        tmp = np.zeros((X.shape[0], X.shape[1], 2))
        for c in [0,1]:
            tmp[:,:,c] = np.multiply(X, self.condprob[c,:])
        tmp[tmp==0] = 1
        self.scores += np.sum(np.log(tmp), axis=1)
        return np.argmax(self.scores, axis=1)

    def score(self, X, y):
        return np.mean(self.predict(X) == y)

In [5]:
# Count words in text
vocabulary, X = count_words(texts)

# Try to fit, predict and score
nb = NB(vocabulary)
nb.fit(X[::2], y[::2])
print (nb.score(X[1::2], y[1::2]))

0.815


In [6]:
# Try to fit, predict and score using cross-validation 5-folds
X_dict = {}
y_dict = {}
for i in range(5):
    X_dict[f"X_{i}"] = X[i::5]
    y_dict[f"y_{i}"] = y[i::5]

score = 0
for i in range(5):
    nb = NB(vocabulary)
    X_val = X_dict[f"X_{i}"]
    y_val = y_dict[f"y_{i}"]
    first = True
    for j in range(5):
        if (j!=i):
            if bool:
                first = False
                X_train = X_dict[f"X_{j}"]
                y_train = y_dict[f"y_{j}"]
            else:
                X_train = np.concatenate((X_train, X_dict[f"X_{j}"]), axis=0)
                y_train = np.concatenate((y_train, y_dict[f"y_{j}"]), axis=0)
    nb.fit(X_train, y_train)
    score_tmp = nb.score(X_val, y_val)
    print(score_tmp)
    score += score_tmp
print(score/5)

0.78
0.8
0.7975
0.7825
0.735
0.779


In [7]:
def count_words_V2(texts):
    """Vectorize text : return count of each word in the text snippets

    Parameters
    ----------
    texts : list of str
        The texts

    Returns
    -------
    vocabulary : dict
        A dictionary that points to an index in counts for each word.
    counts : ndarray, shape (n_samples, n_features)
        The counts of each word in each text.
        n_samples == number of documents.
        n_features == number of words in vocabulary.
    """
    words = set()
    vocabulary = {}
    table = str.maketrans({key:" " for key in string.punctuation})
    i = 0
    j = 0
    english_words = open(op.join('data', 'english.stop')).read()

    for text in texts:
        word_list = text.translate(table).lower().split(" ")
        for word in word_list:
            if (word not in words) and (word not in english_words):
                words.add(word)
                vocabulary[word] = j
                j += 1

    n_features = len(words)
    counts = np.zeros((len(texts), n_features))

    for text in texts:
        word_list = text.translate(table).lower().split(" ")
        for word in word_list:
            if word in words:
                counts[i][vocabulary[word]] += 1
        i += 1

    return vocabulary, counts

In [8]:
# Count words in text
vocabulary, X = count_words_V2(texts)

# Try to fit, predict and score
nb = NB(vocabulary)
nb.fit(X[::2], y[::2])
print (nb.score(X[1::2], y[1::2]))

0.808


In [9]:
# Try to fit, predict and score using cross-validation 5-folds
X_dict = {}
y_dict = {}
for i in range(5):
    X_dict[f"X_{i}"] = X[i::5]
    y_dict[f"y_{i}"] = y[i::5]

score = 0
for i in range(5):
    nb = NB(vocabulary)
    X_val = X_dict[f"X_{i}"]
    y_val = y_dict[f"y_{i}"]
    first = True
    for j in range(5):
        if (j!=i):
            if bool:
                first = False
                X_train = X_dict[f"X_{j}"]
                y_train = y_dict[f"y_{j}"]
            else:
                X_train = np.concatenate((X_train, X_dict[f"X_{j}"]), axis=0)
                y_train = np.concatenate((y_train, y_dict[f"y_{j}"]), axis=0)
    nb.fit(X_train, y_train)
    score_tmp = nb.score(X_val, y_val)
    print(score_tmp)
    score += score_tmp
print(score/5)

0.75
0.79
0.765
0.77
0.72
0.759


In [10]:
# Comparison with sklearn

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [11]:
X_train, y_train = texts[::2], y[::2]
X_test, y_test = texts[1::2], y[1::2]
countVec = CountVectorizer()
clf = MultinomialNB()
cV_mNB = Pipeline([('countVec', countVec), ('multNB', clf)])
cV_mNB.set_params().fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.813

In [12]:
X_train, y_train = texts[::2], y[::2]
X_test, y_test = texts[1::2], y[1::2]
countVec = CountVectorizer()
clf = MultinomialNB()
cV_mNB = Pipeline([('countVec', countVec), ('multNB', clf)])
cV_mNB.set_params(countVec__analyzer = "char").fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.606

In [13]:
X_train, y_train = texts[::2], y[::2]
X_test, y_test = texts[1::2], y[1::2]
countVec = CountVectorizer()
clf = MultinomialNB()
cV_mNB = Pipeline([('countVec', countVec), ('multNB', clf)])
cV_mNB.set_params(countVec__analyzer = "char_wb").fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.605

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from nltk import SnowballStemmer, pos_tag, word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/thibault/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/thibault/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [15]:
X_train, y_train = texts[::2], y[::2]
X_test, y_test = texts[1::2], y[1::2]
countVec = CountVectorizer()
clf = LogisticRegression()
cV_mNB = Pipeline([('countVec', countVec), ('logReg', clf)])
cV_mNB.set_params(logReg__random_state=42, logReg__solver='liblinear').fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.831

In [16]:
X_train, y_train = texts[::2], y[::2]
X_test, y_test = texts[1::2], y[1::2]
countVec = CountVectorizer()
clf = LinearSVC()
cV_mNB = Pipeline([('countVec', countVec), ('linSVC', clf)])
cV_mNB.set_params(linSVC__random_state=42, linSVC__max_iter=10000).fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.81

In [17]:
stemmer = SnowballStemmer("english")
texts_stemmed = []
for text in texts:
    text_stemmed = ""
    for word in text.split(" "):
        text_stemmed += stemmer.stem(word) + " "
    texts_stemmed.append(text_stemmed)

In [18]:
X_train, y_train = texts_stemmed[::2], y[::2]
X_test, y_test = texts_stemmed[1::2], y[1::2]
countVec = CountVectorizer()
clf = LogisticRegression()
cV_mNB = Pipeline([('countVec', countVec), ('logReg', clf)])
cV_mNB.set_params(logReg__random_state=42, logReg__solver='liblinear').fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.82

In [19]:
X_train, y_train = texts_stemmed[::2], y[::2]
X_test, y_test = texts_stemmed[1::2], y[1::2]
countVec = CountVectorizer()
clf = LinearSVC()
cV_mNB = Pipeline([('countVec', countVec), ('linSVC', clf)])
cV_mNB.set_params(linSVC__random_state=42, linSVC__max_iter=20000).fit(X_train, y_train)
cV_mNB.score(X_test, y_test)

0.802

In [27]:
texts_tag = []
k = 0
for text in texts_stemmed:
    txt = []
    sents = sent_tokenize(text)
    bool = True
    for sent in sents:
        if bool:
            txt = pos_tag(word_tokenize(sent))
            bool = False
        else:
            txt += pos_tag(word_tokenize(sent))
    texts_tag.append(txt)

In [28]:
print(texts_tag[0])

[('plot', 'NN'), (':', ':'), ('two', 'CD'), ('teen', 'NN'), ('coupl', 'NNS'), ('go', 'VBP'), ('to', 'TO'), ('a', 'DT'), ('church', 'NN'), ('parti', 'NN'), (',', ','), ('drink', 'NN'), ('and', 'CC'), ('then', 'RB'), ('drive', 'NN'), ('.', '.'), ('they', 'PRP'), ('get', 'VBP'), ('into', 'IN'), ('an', 'DT'), ('accid', 'NN'), ('.', '.'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('guy', 'NN'), ('die', 'NN'), (',', ','), ('but', 'CC'), ('his', 'PRP$'), ('girlfriend', 'NN'), ('continu', 'NN'), ('to', 'TO'), ('see', 'VB'), ('him', 'PRP'), ('in', 'IN'), ('her', 'PRP$'), ('life', 'NN'), (',', ','), ('and', 'CC'), ('has', 'VBZ'), ('nightmar', 'VBN'), ('.', '.'), ('what', 'WP'), ('the', 'DT'), ('deal', 'NN'), ('?', '.'), ('watch', 'VB'), ('the', 'DT'), ('movi', 'NN'), ('and', 'CC'), ('``', '``'), ('sorta', 'JJ'), ('``', '``'), ('find', 'VB'), ('out', 'RP'), ('.', '.'), ('.', '.'), ('.', '.'), ('critiqu', 'NN'), (':', ':'), ('a', 'DT'), ('mind-fuck', 'JJ'), ('movi', 'NN'), ('for', 'IN'), ('the',