# Doc2vec using Gensim based on Le and Mikolov 2014

In [28]:
'''
Approaches
1. Bag-of-words Model - Represent documents as fixed length vectors whose elements are the frequencies of the 
corresponding words in the vocabulary
2. Bag-of-ngrams Model - suffers from data sparsity and high dimensionality
3. Word2vec - Continuous Bag of words (CBOW) and Skip-gram model
4. Doc2vec - Distributed memory Paragraph vector (PV-DM) and Distributed Bag of Words Paragraph vector (PV-DBOW)
Word2vec and doc2vec result in more denser vector representation as opposed to sparse and high dimensional
representation in BOW model
'''

'\nApproaches\n1. Bag-of-words Model - Represent documents as fixed length vectors whose elements are the frequencies of the \ncorresponding words in the vocabulary\n2. Bag-of-ngrams Model - suffers from data sparsity and high dimensionality\n3. Word2vec - Continuous Bag of words (CBOW) and Skip-gram model\n4. Doc2vec - Distributed memory Paragraph vector (PV-DM) and Distributed Bag of Words Paragraph vector (PV-DBOW)\nWord2vec and doc2vec result in more denser vector representation as opposed to sparse and high dimensional\nrepresentation in BOW model\n'

In [29]:
import locale
import glob
import os.path
import requests
import tarfile
import sys
import codecs
import smart_open
dirname = 'C:/Users/lenovo/Documents/data/'
filename = 'aclImdb_v1.tar.gz'
os.chdir(dirname)
locale.setlocale(locale.LC_ALL, 'C')

if sys.version > '3':
    control_chars = [chr(0x85)]
else:
    control_chars = [unichr(0x85)]

In [30]:
# Convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text):
    norm_text = text.lower()
    # Replace breaks with spaces
    norm_text = norm_text.replace('<br />', ' ')
    # Pad punctuation with spaces on both sides
    for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')
    return norm_text

In [31]:
if not os.path.isfile(dirname+filename):
    # Download IMDB archive
    print("Downloading IMDB archive...")
    url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename
    r = requests.get(url)
    with smart_open.smart_open(dirname+filename, 'wb') as f:
        f.write(r.content)
tar = tarfile.open(dirname+filename, mode='r')
tar.extractall()
tar.close()

KeyboardInterrupt: 

In [None]:
dirname = dirname + 'aclImdb/'
print(os.path.isdir(dirname))
os.chdir(dirname)
import time
import smart_open
start = time.clock()

print(start)
# Concatenate and normalize test/train data
print("Cleaning up dataset...")
folders = ['train/pos', 'train/neg', 'test/pos', 'test/neg', 'train/unsup']
alldata = u''
for fol in folders:
    temp = u''
    output = fol.replace('/', '-') + '.txt'
    # Is there a better pattern to use?
    txt_files = glob.glob(os.path.join(dirname, fol, '*.txt'))
    for txt in txt_files:
        with smart_open.smart_open(txt, "rb") as t:
            t_clean = t.read().decode("utf-8")
            for c in control_chars:
                t_clean = t_clean.replace(c, ' ')
            temp += t_clean
        temp += "\n"
    temp_norm = normalize_text(temp)
    with smart_open.smart_open(os.path.join(dirname, output), "wb") as n:
        n.write(temp_norm.encode("utf-8"))
    alldata += temp_norm

with smart_open.smart_open(os.path.join(dirname, 'alldata-id.txt'), 'wb') as f:
    for idx, line in enumerate(alldata.splitlines()):
        num_line = u"_*{0} {1}\n".format(idx, line)
        f.write(num_line.encode("utf-8"))

end = time.clock()
print ("Total running time: ", end-start)

In [17]:
import os.path
assert os.path.isfile(os.path.join(dirname, "alldata-id.txt")), "alldata-id.txt unavailable"

In [18]:
import gensim
from gensim.models.doc2vec import TaggedDocument
from collections import namedtuple
from smart_open import smart_open

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')



In [27]:
alldocs = []  # Will hold all docs in original order
with smart_open(dirname+'alldata-id.txt', 'rb') as alldata:
    alldata = alldata.read().decode('utf-8')
    print(alldata)
    for line_no, line in enumerate(alldata):
        tokens = gensim.utils.to_unicode(line).split()
        words = tokens[1:]
        tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
        split = ['train', 'test', 'extra', 'extra'][line_no//25000]  # 25k train, 25k test, 25k extra
        sentiment = [1.0, 0.0, 1.0, 0.0, None, None, None, None][line_no//12500] # [12.5K pos, 12.5K neg]*2 then unknown
        alldocs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
doc_list = alldocs[:]  # For reshuffling per pass

print('%d docs: %d train-sentiment, %d test-sentiment' % (len(doc_list), len(train_docs), len(test_docs)))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



IndexError: list index out of range