<a href="https://colab.research.google.com/github/tfredrick112/sentiment-classification-ml-course-project/blob/master/Doc2Vec_Model_IMDb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Preprocessing Functions

#### Importing Libraries

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import re

import unicodedata
import string
from time import time

import io
import tarfile
import os.path
import smart_open
import gensim.utils

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Converting to Lowercase and Removing Punctuation

In [None]:
from nltk.tokenize import word_tokenize, TweetTokenizer

def my_word_tokenize(doc):
    '''
    Input: doc (a string)
    Output: List of whitespace-separated tokens, obtained after replacing every
    punctuation mark (except the apostrophe) and every newline with a space.
    '''
    separators = string.punctuation.replace("'", "") + "\n"
    to_space = str.maketrans(separators, " " * len(separators))
    return doc.translate(to_space).split()

def basic_preprocess_1(doc):
    '''
    Input: doc (A comment, which is a string)
    Output: List of tokens in the preprocessed document
    Steps, in order:
    a) normalizing to NFKC form and converting to lowercase
    b) replacing HTML tags with a space
    c) replacing every punctuation mark except '@' and the apostrophe with a space
    d) tokenizing with TweetTokenizer
    e) keeping only alphanumeric tokens, plus the literal token '@user'
    '''
    # Normalization and conversion to lowercase
    text = unicodedata.normalize('NFKC', doc).lower()

    # Remove HTML tags
    text = re.sub('<[^>]*>', ' ', text)

    # Map punctuation marks to space; '@' and "'" are excluded from the mapping.
    punctuation = string.punctuation.replace('@', '').replace("'", "")
    text = text.translate(str.maketrans(punctuation, ' ' * len(punctuation)))

    kept = []
    for token in TweetTokenizer().tokenize(text):
        # str.isalnum() is False for '' and ' ', so blank tokens need no separate check.
        # Any token containing a non-alphanumeric character (e.g. an apostrophe)
        # fails this test and is dropped, unless it is exactly '@user'.
        if token.isalnum() or token == '@user':
            kept.append(token)
    return kept

#### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import  wordnet

def get_wordnet_pos(treebank_tag):
    '''
    Convert a Part of Speech tag returned by nltk.pos_tag into the
    corresponding wordnet POS tag.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to the noun tag.
    '''
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return pos
    # Fall back to the noun tag rather than None, because by default
    # the WordNetLemmatizer assumes that the POS tag is noun.
    return wordnet.NOUN
        
def lemmatize(doc_words):
    '''
    Input: doc_words (a comment, given as a list of words)
    Output: A list with the lemma of each word, in the same order
    '''
    lemmatizer = WordNetLemmatizer()
    lem_words = []
    # nltk.pos_tag pairs every word with its Part-of-Speech tag; the tag is
    # converted to a wordnet POS tag before being handed to the lemmatizer.
    for word, tag in nltk.pos_tag(doc_words):
        lem_words.append(lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)))
    return lem_words

#### Remove Stopwords

In [None]:
# Removal of stopwords
from nltk.corpus import stopwords
def remove_stopwords(doc_words):
    '''
    Input: doc_words (a comment, given as a list of words)
    Output: List of the words that are not in NLTK's English stopword list,
    in their original order.
    '''
    english_stopwords = set(stopwords.words('english'))
    return list(filter(lambda word: word not in english_stopwords, doc_words))

#### Preprocessing

In [None]:
def preprocess_1(doc):
    '''
    Input: doc (A comment)
    Output: List of words (after preprocessing)

    Pipeline: replace the typographic apostrophe with a plain one, run
    basic_preprocess_1, lemmatize, then remove stopwords.
    '''
    straightened = doc.replace("’", "'")
    return remove_stopwords(lemmatize(basic_preprocess_1(straightened)))

### Download Dataset

In [None]:
import collections
# Record for one review: its token list, its list of tags, the dataset split
# it belongs to, and its sentiment label.
SentimentDocument = collections.namedtuple(
    'SentimentDocument',
    ['words', 'tags', 'split', 'sentiment'],
)

In [None]:
def download_dataset(url='http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'):
    '''
    Download the archive at `url` into the current working directory.

    Input: url (address of the archive; the last path component is used as
           the local file name)
    Output: the local file name

    If a file with that name already exists, nothing is downloaded and the
    name is returned immediately.
    '''
    fname = url.split('/')[-1]

    if os.path.isfile(fname):
        return fname

    # Stream into a temporary sibling file and rename it only once the copy
    # has finished. Otherwise an interrupted download would leave a truncated
    # file under `fname`, which the check above would accept as a complete
    # archive on the next run.
    partial_fname = fname + '.part'
    try:
        with smart_open.open(url, "rb", ignore_ext=True) as fin:
            with smart_open.open(partial_fname, 'wb', ignore_ext=True) as fout:
                while True:
                    buf = fin.read(io.DEFAULT_BUFFER_SIZE)
                    if not buf:
                        break
                    fout.write(buf)
        os.replace(partial_fname, fname)
    finally:
        # After a successful rename the partial file no longer exists; after a
        # failure it is removed so that no stale fragment is left behind.
        if os.path.isfile(partial_fname):
            os.remove(partial_fname)

    return fname

def create_sentiment_document(name, text, index):
    '''
    Build a SentimentDocument from one archive member.

    Input: name  (member path with exactly four '/'-separated parts; the
                  second is the split and the third is the sentiment folder)
           text  (review text; converted with gensim.utils.to_unicode and
                  split on whitespace)
           index (value stored as the document's only tag)
    Output: SentimentDocument whose sentiment is 1.0 for 'pos', 0.0 for 'neg'
            and None for 'unsup'. A document without a sentiment is assigned
            the split 'extra'.
    '''
    _, split, sentiment_str, _ = name.split('/')

    label_by_folder = {'pos': 1.0, 'neg': 0.0, 'unsup': None}
    sentiment = label_by_folder[sentiment_str]
    split_name = 'extra' if sentiment is None else split

    words = gensim.utils.to_unicode(text).split()
    return SentimentDocument(words=words, tags=[index], split=split_name, sentiment=sentiment)

def extract_documents():
    '''
    Generator that yields one SentimentDocument per review file found in the
    archive returned by download_dataset().

    Only members whose path matches
    aclImdb/(train|test)/(pos|neg|unsup)/<digits>_<digits>.txt are used.
    Each yielded document is tagged with a running index that starts at 0 and
    counts matching members only.

    Raises AssertionError if a review contains a newline character.
    '''
    fname = download_dataset()

    # The dot before 'txt' is escaped so that only a literal '.txt' suffix
    # matches; an unescaped '.' would accept any character in that position.
    review_path = re.compile(r'aclImdb/(train|test)/(pos|neg|unsup)/\d+_\d+\.txt$')

    index = 0

    with tarfile.open(fname, mode='r:gz') as tar:
        for member in tar.getmembers():
            if review_path.match(member.name):
                member_bytes = tar.extractfile(member).read()
                # Undecodable bytes are replaced instead of raising.
                member_text = member_bytes.decode('utf-8', errors='replace')
                assert member_text.count('\n') == 0
                yield create_sentiment_document(member.name, member_text, index)
                index += 1

# Consume the generator completely: this downloads the archive if it is not
# already present and keeps every extracted review in memory as a list.
alldocs = list(extract_documents())

In [None]:

# Select only the review tokens and the integer sentiment label of the labelled
# reviews. doc.words holds the whitespace-split tokens produced by
# create_sentiment_document.
labelled_records = {
    split_name: [
        {'doc': doc.words, 'sentiment_numeric': int(doc.sentiment)}
        for doc in alldocs
        if doc.split == split_name
    ]
    for split_name in ('train', 'test')
}
train_docs = labelled_records['train']
test_docs = labelled_records['test']

# Dataframe consisting of training examples
df_train = pd.DataFrame(train_docs)

# Dataframe consisting of test examples
df_test = pd.DataFrame(test_docs)

# All the docs in the dataset (labelled or not) will be used to train the Doc2Vec model
doc2vec_model_docs = [doc.words for doc in alldocs]

In [None]:
import gensim

def load_reviews_train(processed_reviews):
    '''
    Input: processed_reviews (an iterable of reviews, each a list of tokens)
    Output: generator of TaggedDocument objects, one per review, each tagged
    with the review's position in the input (as a string).
    '''
    for i, review in enumerate(processed_reviews):
        # `tags` must be a list. A bare string is treated as a sequence of
        # one-character tags, so "123" would become the tags '1', '2' and '3'
        # and all documents would end up sharing the ten digit tags.
        yield gensim.models.doc2vec.TaggedDocument(review, [str(i)])

# Keep the tagged corpus as a list (not a generator): it is iterated more than
# once below, first by build_vocab and then by train.
train_corpus = list(load_reviews_train(doc2vec_model_docs))
print("Number of training sequences (for Doc2Vec) = ",len(train_corpus))

Number of training sequences (for Doc2Vec) =  100000


### Model Hyperparameters

In [None]:
import multiprocessing
import gensim.models.doc2vec
# Stop here rather than start a very slow training run.
# NOTE(review): presumably FAST_VERSION is -1 when gensim's optimized routines
# are unavailable — confirm against the gensim documentation.
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from gensim.models.doc2vec import Doc2Vec

# Size of every document vector; also used to allocate the feature matrices.
vector_dimensions = 100

# Keyword arguments passed to every Doc2Vec model built in this notebook.
common_kwargs = {
    'vector_size': vector_dimensions,
    'epochs': 25,
    'min_count': 2,
    'sample': 0,
    'workers': multiprocessing.cpu_count(),
    'negative': 5,
    'hs': 0,
}

### Train Doc2Vec model (PV-DM model)

In [None]:
# PV-DM with default averaging
# Wall-clock timing covers model construction, vocabulary building and training.
start = time()
model = Doc2Vec(dm=1, window=10, alpha=0.05, comment='alpha=0.05', **common_kwargs)
model.build_vocab(train_corpus)
# total_examples and epochs are taken from the model itself, so they match
# what build_vocab counted and what common_kwargs configured.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
end = time()
print("Time taken to train the Doc2Vec model = {} minutes".format((end - start)/60))

# Save the model
# NOTE(review): get_tmpfile comes from gensim's test utilities and presumably
# resolves names relative to the system temp directory — confirm that passing
# an absolute path saves to the Drive location the next cell loads from.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no
# mount call is visible in this notebook — confirm before Run All.
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("/content/drive/My Drive/SentimentClassification/doc2vec_pvdm_large")
model.save(fname)

Time taken to train the Doc2Vec model = 32.177907117207845 minutes


### Infer Document Vectors (PV-DM)

In [None]:
# Infer vectors
from gensim.models import Doc2Vec

# Load the saved PV-DM model
model = Doc2Vec.load("/content/drive/My Drive/SentimentClassification/doc2vec_pvdm_large")

# Sizes of the labelled training and test sets (for binary classification)
M, Mtest = df_train.shape[0], df_test.shape[0]
print("Number of training examples for binary classification =", M)
print("Number of test examples for binary classification =", Mtest)

# Class labels
y_train = np.array(df_train['sentiment_numeric'])
y_test = np.array(df_test['sentiment_numeric'])

# Feature matrix for the training examples: one inferred vector per row
training_reviews = list(df_train['doc'])
X_train = np.zeros((M, vector_dimensions))
for i, review in enumerate(training_reviews):
    X_train[i, :] = model.infer_vector(review)

# Feature matrix for the test examples: one inferred vector per row
test_reviews = list(df_test['doc'])
X_test = np.zeros((Mtest, vector_dimensions))
for i, review in enumerate(test_reviews):
    X_test[i, :] = model.infer_vector(review)

Number of training examples for binary classification = 25000
Number of test examples for binary classification = 25000


### Binary Classification (Logistic Regression, PV-DM)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit a logistic-regression classifier on the inferred PV-DM document vectors
clf = LogisticRegression().fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Accuracies are scaled to a percentage *before* rounding. Rounding first and
# multiplying by 100 afterwards can print floating-point noise such as
# 93.60000000000001.

# Training accuracy
print("Training accuracy =", round(clf.score(X_train, y_train) * 100, 2), "%")

# Test accuracy
print("Test accuracy =", round(clf.score(X_test, y_test) * 100, 2), "%\n")

# F1 score
print("Training F1 score =", round(f1_score(y_train, y_pred_train), 3))
print("Test F1 score =", round(f1_score(y_test, y_pred_test), 3))

Training accuracy = 87.41 %
Test accuracy = 87.13 %

Training F1 score = 0.874
Test F1 score = 0.874


### Train Doc2Vec model (PV-DBOW)

Distributed Bag of Words model

In [None]:
# PV-DBOW
# Only dm=0 and the shared common_kwargs are passed; window and alpha are not
# set here, unlike in the PV-DM cell.
# Wall-clock timing covers model construction, vocabulary building and training.
start = time()
model2 = Doc2Vec(dm=0, **common_kwargs)
model2.build_vocab(train_corpus)
# total_examples and epochs are taken from the model itself, so they match
# what build_vocab counted and what common_kwargs configured.
model2.train(train_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)
end = time()
print("Time taken to train the Doc2Vec model = {} minutes".format((end - start)/60))

# Save the model
# NOTE(review): get_tmpfile comes from gensim's test utilities and presumably
# resolves names relative to the system temp directory — confirm that passing
# an absolute path saves to the Drive location the next cell loads from.
# NOTE(review): the path assumes Google Drive is mounted at /content/drive; no
# mount call is visible in this notebook — confirm before Run All.
from gensim.test.utils import get_tmpfile
fname = get_tmpfile("/content/drive/My Drive/SentimentClassification/doc2vec_pvdbow_large")
model2.save(fname)

Time taken to train the Doc2Vec model = 73.1448712269465 minutes


### Infer Document Vectors (PV-DBOW)

In [None]:
# Infer vectors
from gensim.models import Doc2Vec

# Load the saved PV-DBOW model
model2 = Doc2Vec.load("/content/drive/My Drive/SentimentClassification/doc2vec_pvdbow_large")

# Sizes of the labelled training and test sets (for binary classification)
M, Mtest = df_train.shape[0], df_test.shape[0]
print("Number of training examples for binary classification =", M)
print("Number of test examples for binary classification =", Mtest)

# Class labels
y_train2 = np.array(df_train['sentiment_numeric'])
y_test2 = np.array(df_test['sentiment_numeric'])

# Feature matrix for the training examples: one inferred vector per row
training_reviews = list(df_train['doc'])
X_train2 = np.zeros((M, vector_dimensions))
for i, review in enumerate(training_reviews):
    X_train2[i, :] = model2.infer_vector(review)

# Feature matrix for the test examples: one inferred vector per row
test_reviews = list(df_test['doc'])
X_test2 = np.zeros((Mtest, vector_dimensions))
for i, review in enumerate(test_reviews):
    X_test2[i, :] = model2.infer_vector(review)

Number of training examples for binary classification = 25000
Number of test examples for binary classification = 25000


### Binary Classification (Logistic Regression, PV-DBOW)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Fit a logistic-regression classifier on the inferred PV-DBOW document vectors
clf = LogisticRegression().fit(X_train2, y_train2)
y_pred_train2 = clf.predict(X_train2)
y_pred_test2 = clf.predict(X_test2)

# Accuracies are scaled to a percentage *before* rounding. Rounding first and
# multiplying by 100 afterwards can print floating-point noise such as
# 93.60000000000001.

# Training accuracy
print("Training accuracy =", round(clf.score(X_train2, y_train2) * 100, 2), "%")

# Test accuracy
print("Test accuracy =", round(clf.score(X_test2, y_test2) * 100, 2), "%\n")

# F1 score
print("Training F1 score =", round(f1_score(y_train2, y_pred_train2), 3))
print("Test F1 score =", round(f1_score(y_test2, y_pred_test2), 3))

Training accuracy = 93.60000000000001 %
Test accuracy = 91.72 %

Training F1 score = 0.936
Test F1 score = 0.919
