In [2]:
import os
import json
import time
import gzip
import bcolz
import numpy as np
import re
import copy
from multiprocessing import Pool
from tqdm import tqdm

# Prefer the C-accelerated pickle on Python 2; fall back to the standard module
# elsewhere. Only an import failure should trigger the fallback — a bare
# `except:` would also swallow KeyboardInterrupt and unrelated errors.
try:
    import cPickle as pickle
except ImportError:
    import pickle

## Loading the Dataset

In [None]:
pickleFile = '../Datasets/Reviews/dataset.pkl'
# time.clock() was removed in Python 3.8; perf_counter() is the supported timer.
start = time.perf_counter()
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
# The context manager guarantees the file handle is closed after loading.
with open(pickleFile, "rb") as datasetHandle:
    dataset = pickle.load(datasetHandle)
duration = time.perf_counter() - start
print(duration, "seconds")

In [None]:
print(len(dataset))

## Loading Glove Words

In [3]:
# def loadGlove(glove_path, dim=50):
#     acceptedDimensions = [50, 100, 200, 300]
#     if dim not in acceptedDimensions:
#         print("You didn't choose a right dimension.")
#         print("Try one of these:", acceptedDimensions)
#         return None
#     pickleWordFile = f'{glove_path}/6B.'+str(dim)+'_words.pkl'
    
#     if os.path.isfile(pickleWordFile):
#         # check if we've made the outputs before
#         words = pickle.load(open(pickleWordFile, 'rb'))
#         return words
#     else:
#         print("Doesn't work")
#         return -1
    
def loadGlove(glove_path, dim=50):
    """
    Load cached GloVe 6B vectors as a ``{word: vector}`` dictionary.

    The function reads three artefacts from ``glove_path``: a bcolz array of
    vectors (``glove.6B.<dim>.dat``), a pickled word list
    (``6B.<dim>_words.pkl``) and a pickled word-to-row-index mapping
    (``6B.<dim>_idx.pkl``). It does not build them; they must already exist.

    Parameters:
        glove_path: directory containing the cached artefacts.
        dim: embedding dimension; one of 50, 100, 200 or 300.

    Returns:
        dict mapping each word to its vector, or ``None`` when ``dim`` is not
        supported or the word cache file is missing (a message is printed in
        both cases).
    """
    acceptedDimensions = [50, 100, 200, 300]
    if dim not in acceptedDimensions:
        print("You didn't choose a right dimension.")
        print("Try one of these:", acceptedDimensions)
        return None
    pickleWordFile = f'{glove_path}/6B.{dim}_words.pkl'
    pickleIdFile   = f'{glove_path}/6B.{dim}_idx.pkl'
    pickleDatFile  = f'{glove_path}/glove.6B.{dim}.dat'

    # The word list is used as the marker that the cache has been generated.
    if not os.path.isfile(pickleWordFile):
        print("Doesn't work. Missing cache file:", pickleWordFile)
        return None

    print("Preloading files..", end=" ")
    # [:] materialises the on-disk bcolz array in memory.
    vectors = bcolz.open(pickleDatFile)[:]
    # NOTE: pickle.load can execute arbitrary code — only load trusted caches.
    with open(pickleWordFile, 'rb') as wordFile:
        words = pickle.load(wordFile)
    with open(pickleIdFile, 'rb') as idFile:
        word2idx = pickle.load(idFile)
    glove = {w: vectors[word2idx[w]] for w in words}
    print("Done.")
    return glove


gloveDimension = 50
glovePath = "/media/data/Datasets/glove"
glove = loadGlove(glovePath, dim=gloveDimension)
# loadGlove returns None when the dimension is unsupported or the cache is
# missing; stop here with a clear message instead of failing on `.keys()`.
if glove is None:
    raise RuntimeError(
        f"Could not load GloVe vectors (dim={gloveDimension}) from {glovePath}"
    )
# A dict keys view supports constant-time membership tests.
gloveWords = glove.keys()

Preloading files.. Done.


In [4]:
print(len(gloveWords))

400000


## Preprocessing Data

In [None]:
def preprocess(paragraph):
    """
    Lower-case a piece of text and split it into word and punctuation tokens.

    Parameters:
        paragraph: the raw text (str).

    Returns:
        list of str tokens; selected punctuation marks become their own
        tokens and empty strings are removed.
    """
    paragraph = paragraph.lower()
    # Surround punctuation with spaces so each mark becomes a separate token.
    # NOTE(review): inside the character class `)-+` is a *range* covering
    # `)`, `*` and `+`, so a hyphen is NOT isolated — confirm this is intended.
    paragraph = re.sub("([,!?()-+&£$.%*'])", r' \1 ', paragraph)
    # Collapse runs of whitespace (raw string avoids the invalid '\s' escape).
    paragraph = re.sub(r'\s{2,}', ' ', paragraph)
    # Leading/trailing spaces produce empty strings when splitting on " ";
    # drop them so callers only ever see real tokens.
    return [token for token in paragraph.split(" ") if token]

def padSentence(words, maxLength, padString="<pad>"):
    """
    Force a token list to exactly ``maxLength`` entries.

    Longer lists are cut down to their first ``maxLength`` tokens; shorter
    ones are extended on the right with ``padString``.
    """
    shortfall = maxLength - len(words)
    if shortfall < 0:
        return words[:maxLength]
    return words + [padString] * shortfall
    
def discretise(value, word):
    """Return the token ``<word>_<value>``, with ``value`` rendered via ``str``."""
    return "_".join((word, str(value)))

In [None]:
def handle(itemID, dataset=dataset, 
           wordbase=gloveWords, 
           minWords=6, 
           minEntries=5, 
           maxSummaryLength=10, 
           maxReviewLength=60,
           printDebug=False):
    """
    Build fixed-length token rows for every usable review of one item.

    Each review's text and summary are tokenised with ``preprocess``, reduced
    to tokens found in ``wordbase``, truncated, and padded with
    ``padSentence``. A kept review becomes one row laid out as::

        [itemID] + summary tokens + [rating token] + [polarity token] + review tokens

    Parameters:
        itemID: key into ``dataset``.
        dataset: mapping from item id to a sequence of review entries. Each
            entry is read through the keys 'reviewText', 'summary', 'overall'
            and 'helpful' (the first two elements of 'helpful' are used).
        wordbase: container of accepted tokens (membership test only).
        minWords: a review is kept only if it has more than this many
            accepted tokens after truncation.
        minEntries: the item needs more than this many raw reviews, and more
            than this many kept reviews, to be returned.
        maxSummaryLength: number of summary tokens per row.
        maxReviewLength: number of review-text tokens per row.
        printDebug: when True, print each raw entry and its filtered tokens.

    Returns:
        list: ``[]`` when the item is rejected, otherwise a one-element list
        whose single element is the list of rows for this item.
    """
    reviews = []
    entries = dataset[itemID]

    # Items with too few raw reviews are rejected before any processing.
    if len(entries) <= minEntries:
        return reviews

    review = []
    for entry in entries:
        # Keep only tokens known to the word base, then truncate.
        words = [w for w in preprocess(entry['reviewText']) if w in wordbase]
        words = words[:maxReviewLength]
        summary = [w for w in preprocess(entry['summary']) if w in wordbase]
        summary = summary[:maxSummaryLength]

        if printDebug:
            print(entry)
            print(len(words), words, summary, "\n")

        # Skip reviews that are too short once filtered.
        if len(words) <= minWords:
            continue

        # Pad both parts so every row has the same length.
        summary = padSentence(summary, maxSummaryLength)
        words = padSentence(words, maxReviewLength)
        rating = [discretise(entry['overall'], "rating")]
        # tanh squashes helpful[0] - helpful[1] into (-1, 1); rounding to one
        # decimal keeps the number of distinct polarity tokens small.
        polarity = np.round(np.tanh(entry['helpful'][0] - entry['helpful'][1]), 1)
        polarity = [discretise(polarity, "polarity")]
        review.append([itemID] + summary + rating + polarity + words)

    # Only return the item if enough reviews survived the filtering.
    if len(review) > minEntries:
        reviews.append(review)
    return reviews

In [None]:
datasetKeys = list(dataset.keys())

In [None]:
print(len(datasetKeys))

In [None]:
def imap_unordered_bar(func, args, n_processes = 8):
    """
    Apply ``func`` to every element of ``args`` in a process pool, showing a
    tqdm progress bar.

    Parameters:
        func: callable applied to each element; it is dispatched through
            ``Pool.imap_unordered``.
        args: sized collection of inputs (``len(args)`` sets the bar total).
        n_processes: number of worker processes in the pool.

    Returns:
        list of results in completion order, which may differ from the order
        of ``args``.
    """
    res_list = []
    pool = Pool(n_processes)
    try:
        # The `with` block closes the progress bar on exit.
        with tqdm(total = len(args)) as pbar:
            for res in pool.imap_unordered(func, args):
                pbar.update()
                res_list.append(res)
        pool.close()
    except BaseException:
        # Stop outstanding work so join() below does not wait on it.
        pool.terminate()
        raise
    finally:
        pool.join()
    return res_list

In [None]:
reviews = imap_unordered_bar(handle,datasetKeys)

## Save Filtered Dataset

In [5]:
pickleFile = '../Datasets/Reviews/dataset_filtered.pkl'

In [None]:
# save the dataset to a pickle file.
# The context manager closes (and flushes) the file even if dumping fails.
with open(pickleFile, 'wb') as output:
    pickle.dump(reviews, output)
print("Saved!")

In [6]:
# Here we'll load the pickled file again so we can clear the memory.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open(pickleFile, 'rb') as filteredHandle:
    reviews = pickle.load(filteredHandle)
print("Loaded Reviews", len(reviews))

Loaded Reviews 63001


## Getting IDs of Words

To improve the throughput of the model, we should reduce the embedding size. Here we'll look at all the words in the reviews and keep track of the ones that actually occur. We'll then build a reduced word2id mapping based on this set.

In [7]:
wordcounts = {}

In [8]:
# here we reduce the size of the dataset so we can debug our model.
# Keeps every 4th element of `reviews`.
# NOTE(review): this rebinds `reviews`, so re-running the cell subsamples the
# already-reduced list again; reload the pickle first to get back the full set.
reviews = reviews[::4]
len(reviews)

15751

In [9]:
# Tally how often every token occurs across all rows of all items.
for item in tqdm(reviews):
    for section in item:
        for row in section:
            for token in row:
                wordcounts[token] = wordcounts.get(token, 0) + 1

100%|██████████| 15751/15751 [00:06<00:00, 2442.36it/s]


In [10]:
# Partition the counted tokens by whether GloVe has a vector for them.
knowns, unknowns = [], []
for word in wordcounts:
    if word in glove:
        knowns.append(word)
    else:
        unknowns.append(word)
# Most frequent known words first (the sort is stable, so ties keep their order).
wordOrder = sorted(knowns, key=wordcounts.get, reverse=True)

In [11]:
weights = [glove[word] for word in tqdm(wordOrder)]

100%|██████████| 50252/50252 [00:00<00:00, 1396498.78it/s]


In [12]:
# Give every token without a GloVe vector an embedding and add it to the vocabulary.
for word in unknowns:
    part = word.split("_")
    isTagged = ("rating" in word) or ("polarity" in word)
    # Compose "<tag>_<value>" tokens from the vectors of both halves, but only
    # when both halves really exist in GloVe; otherwise indexing part[1] or
    # glove[...] would raise and abort the whole cell.
    if isTagged and len(part) >= 2 and part[0] in glove and part[1] in glove:
        weight = glove[part[0]] + glove[part[1]]
    else:
        # generate a random weight
        # NOTE: drawn from the unseeded global NumPy RNG, so values differ per run.
        weight = np.random.normal(0,0.5,gloveDimension)
    wordOrder.append(word)
    weights.append(weight)

In [13]:
# Build index <-> word lookup tables from the final vocabulary order.
id2word = dict(enumerate(wordOrder))
word2id = {w: idx for idx, w in id2word.items()}

In [14]:
# Replace each token in every row with its vocabulary id (modifies `reviews` in place).
for part in tqdm(reviews):
    for section in part:
        for k, review in enumerate(section):
            section[k] = [word2id[w] for w in review]

100%|██████████| 15751/15751 [00:03<00:00, 4379.09it/s]


In [15]:
# Collapse the item -> section -> row nesting into one flat list of rows.
flat = [
    row
    for item in tqdm(reviews)
    for section in item
    for row in section
]

100%|██████████| 15751/15751 [00:00<00:00, 779595.51it/s]


In [19]:
# create container ready for use in dataset
container = {
    'id2word' : id2word,
    'word2id' : word2id,
    'reviews' : flat,
    'weights' : np.array(weights)
}

In [21]:
datasetFile = '../Datasets/Reviews/dataset_ready.pkl'
# save the dataset to a pickle file.
# The context manager closes (and flushes) the file even if dumping fails.
with open(datasetFile, 'wb') as output:
    pickle.dump(container, output)
print("Saved!")

Saved!


In [22]:
container.keys()

dict_keys(['id2word', 'word2id', 'reviews', 'weights'])