In [25]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing.pool import Pool
import re
# Files whose lines are loaded as the positive / negative label sets.
positive_file = 'data/rt-polarity.pos'
negative_file = 'data/rt-polarity.neg'
# Directory prefix for the sentence file ('datasetSentences.txt') read below.
data_root = 'data/stanfordSentimentTreebank/'
# GloVe file-name template; '<size>' is replaced by the embedding dimension.
glove_pattern = 'data/glove.6B.<size>d.txt'
# Embedding dimension passed to get_glove().
glove_size = 50

In [23]:
def get_glove(size = 50, pattern = None):
    """Load pre-trained GloVe vectors into a ``{word: vector}`` dictionary.

    Parameters
    ----------
    size : int
        Embedding dimensionality; substituted for the ``<size>`` placeholder
        in the file-name pattern.
    pattern : str, optional
        File-name pattern containing the literal ``<size>``. When omitted,
        the notebook-level ``glove_pattern`` is used.

    Returns
    -------
    dict
        Maps each token (first column of the file) to a 1-D NumPy array
        holding the remaining columns of that row. If a token occurs more
        than once, the last row wins.
    """
    if pattern is None:
        pattern = glove_pattern
    file = pattern.replace('<size>', str(size))
    table = pd.read_csv(
        file, sep = " ", header = None, index_col = 0,
        # 3 == csv.QUOTE_NONE: quote characters are ordinary tokens here.
        quoting = 3,
        # Without this, pandas turns tokens such as "null", "nan" or "NA"
        # into float NaN index labels and the corresponding words are lost.
        keep_default_na = False,
    )
    # Pair each index label with its row directly instead of transposing the
    # whole table just to iterate over columns.
    return dict(zip(table.index, table.values))

In [10]:
# Load the embedding table once and keep its vocabulary as a set.
glove = get_glove(size = glove_size)
glove_words = set(glove)

In [117]:
def preprocess_sentence(line):
    """Normalise one raw text line so it can be compared by exact match.

    Surrounding whitespace is trimmed, every run of non-ASCII characters is
    deleted, and the result is trimmed again and lower-cased.
    """
    trimmed = line.strip()
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', trimmed)
    # Deleting characters can expose whitespace at either end, hence the
    # second strip.
    normalised = ascii_only.strip()
    return normalised.lower()
    
def get_labelset(file):
    """Read a one-sentence-per-line file into a set of normalised sentences.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    set
        The distinct results of ``preprocess_sentence`` over all lines.
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # Undecodable bytes are dropped rather than raising; preprocess_sentence
    # removes every non-ASCII character anyway, so the ASCII content that
    # survives is the same.
    with open(file, encoding = 'utf-8', errors = 'ignore') as f:
        # Iterate the file lazily and build the set directly, without the
        # intermediate lists that readlines() and set([...]) create.
        return {preprocess_sentence(line) for line in f}

In [118]:
# Normalised sentence sets for the two polarity files (positive first).
positive_labelset, negative_labelset = map(
    get_labelset, (positive_file, negative_file)
)

In [119]:
# Build the labelled corpus and the token vocabulary.
#
# Every numbered line of datasetSentences.txt is normalised and looked up in
# the positive / negative label sets; sentences found in neither are counted
# in n_skipped and dropped.
#   sentences : normalised sentence -> {'position': <leading digits>, 'y': 0 or 1}
#   word2ind  : token -> integer id, assigned in order of first appearance

# Id 0 is reserved for padding so that the zero-filled slots of a padded
# index matrix cannot be mistaken for a real word. The default
# text_to_word_sequence filters strip '<' and '>', so no real token can
# collide with this key.
PAD_TOKEN = '<pad>'

sentences = {}
word2ind = {PAD_TOKEN: 0}
ttws = tf.keras.preprocessing.text.text_to_word_sequence
n_skipped = 0
# Explicit decoding: non-ASCII characters are removed by preprocess_sentence
# anyway, so undecodable bytes are simply ignored instead of raising.
with open(data_root + 'datasetSentences.txt', encoding = 'utf-8', errors = 'ignore') as data:
    for line in data:
        # Lines that do not start with a number (e.g. a header) are not
        # sentences.
        index = re.search(r'^\d+', line)
        if index is None:
            continue
        entry = {'position': index.group()}
        line = preprocess_sentence(line)
        line = re.sub(r'^\d+\s+', '', line)
        if line in positive_labelset:
            entry['y'] = 1
        elif line in negative_labelset:
            entry['y'] = 0
        else:
            n_skipped += 1
            continue
        sentences[line] = entry
        for token in ttws(line):
            # len(word2ind) is evaluated before insertion, i.e. it is the
            # next free id.
            word2ind.setdefault(token, len(word2ind))

# Next free vocabulary id (kept for cells that may still read `idx`).
idx = len(word2ind)

print(n_skipped, ' skipped')
print(len(sentences), ' kept')

6791  skipped
5063  kept


In [87]:
# Words without a GloVe entry fall back to the mean of all GloVe vectors.
default_vector = np.asarray(list(glove.values())).mean(axis = 0)

# One row per vocabulary id; every row is filled in the loop below, so the
# uninitialised contents of np.empty never survive.
vocab_size, embedding_dim = len(word2ind), default_vector.shape[0]
embedding_matrix = np.empty((vocab_size, embedding_dim))
for word, position in word2ind.items():
    vector = glove[word] if word in glove else default_vector
    embedding_matrix[position, :] = vector

In [108]:
# Encode every kept sentence as a fixed-length row of vocabulary ids.
# Sentences longer than sequence_length are truncated; shorter ones keep 0
# in their unused trailing slots.
# NOTE(review): this relies on id 0 in word2ind not belonging to a real
# word - otherwise padding is indistinguishable from that word.
sequence_length = 10
# Ids are integers, so store them as such instead of the float64 default.
words_x = np.zeros((len(sentences), sequence_length), dtype = np.int32)
labels = np.zeros((len(sentences),))
for position, (line, entry) in enumerate(sentences.items()):
    token_ids = [word2ind[token] for token in ttws(line)[:sequence_length]]
    words_x[position, :len(token_ids)] = token_ids
    labels[position] = entry['y']

In [111]:
# Sanity check: one row per kept sentence, one column per token slot.
words_x.shape

(5063, 10)

In [112]:
# Re-display the number of dataset sentences found in neither label set.
n_skipped

6791