In [44]:
import nltk
nltk.download('reuters')
nltk.download('stopwords')
from nltk.corpus import reuters
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

import re, string, unicodedata
# import contractions
import inflect

import matplotlib.pyplot as plt
import numpy as np
import random
import pickle

import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# Marker token strings. They are not referenced in the cells shown here;
# presumably they delimit document/sentence boundaries elsewhere — TODO confirm.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

[nltk_data] Downloading package reuters to /home/rohit/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rohit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
def read_corpus(category="crude"):
    """ Read files from the specified Reuters category.
        Params:
            category (string): category name
        Return:
            list of lists of strings: one inner list per file in the category,
            holding that file's words lower-cased, in corpus order
    """
    # reuters.words() is iterated directly; copying it into a list first
    # only duplicated every document in memory.
    return [
        [token.lower() for token in reuters.words(file_id)]
        for file_id in reuters.fileids(category)
    ]

In [46]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token in a list of tokenized words.

    Each token is NFKD-normalized before encoding, so a decomposable
    character such as an accented letter keeps its ASCII base letter, while
    characters with no ASCII form are dropped. The output has one entry per
    input token; a token that loses all its characters becomes ''.
    """
    def to_ascii(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [to_ascii(token) for token in words]

def remove_punctuation(words):
    """Delete punctuation characters from each token in a list of tokenized words.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed. Tokens that become empty after
    stripping are left out of the result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Spell out every all-digit token in a list of tokenized words.

    Tokens for which ``str.isdigit()`` is true are replaced by the text
    produced by inflect's ``number_to_words``; all other tokens are passed
    through unchanged. Order and length of the list are preserved.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Params:
        words (iterable of strings): tokens to filter
    Return:
        list of strings: the tokens that are not in NLTK's English stop-word
        list, in their original order
    """
    # Fetch the stop-word list once and keep it as a set: the corpus lookup
    # returns a list, so testing each token against a fresh call repeated the
    # lookup and did a linear scan for every single token.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of each token in a list of tokenized words.

    The result has one stem per input token, in the same order.
    """
    stem = LancasterStemmer().stem
    return [stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize each token in a list of tokenized words, treating it as a verb.

    Every token is passed to the WordNet lemmatizer with ``pos='v'``; the
    result has one lemma per input token, in the same order.
    """
    wordnet = WordNetLemmatizer()
    return [wordnet.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run a list of tokenized words through the cleaning pipeline.

    The steps are applied in this order, each consuming the previous step's
    output: strip non-ASCII characters, strip punctuation, spell out
    all-digit tokens, drop English stop words. Stemming and lemmatization
    are not part of this pipeline.
    """
    pipeline = (
        remove_non_ascii,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    # De-duplicate through a set: checking membership against a growing list
    # made this quadratic in the number of tokens. Entries are passed through
    # str() first, so values with the same string form count as one word.
    corpus_words = sorted({str(word) for document in corpus for word in document})
    return corpus_words, len(corpus_words)

In [47]:
# Load the documents of the default category, then run each document's
# token list through the normalize() cleaning pipeline.
reuters_corpus = read_corpus()
reuters_corpus_cleaned = list(map(normalize, reuters_corpus))

In [58]:
# Build the sorted list of distinct words over the cleaned corpus (and its
# size), then persist the word list to 'vocabulary.pickle'.
vocabulary, num_vocabulary = distinct_words(reuters_corpus_cleaned)
with open('vocabulary.pickle', 'wb') as f:
    pickle.dump(vocabulary, f)

In [51]:
# Build [word, neighbour] training pairs: every word is paired with each of
# the next `window_size` words of the same document, and each pair is stored
# in both directions.
dataset = []
window_size = 2
for sentence in reuters_corpus_cleaned:
    for i, word in enumerate(sentence):
        for win in range(1, 1 + window_size):
            # Stop at the end of the document. An explicit bounds check
            # replaces the former bare `except: pass`, which would also have
            # hidden unrelated errors (and KeyboardInterrupt).
            if i + win >= len(sentence):
                break
            context = sentence[i + win]
            dataset.append([word, context])
            dataset.append([context, word])

In [57]:
# Persist the list of word pairs built above to 'bigrams_dataset.pickle'.
with open('bigrams_dataset.pickle', 'wb') as handle:
    pickle.dump(dataset, handle)