In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from multiprocessing.pool import Pool
import re
positive_file = 'data/rt-polarity.pos'
negative_file = 'data/rt-polarity.neg'
data_root = 'data/stanfordSentimentTreebank/'
glove_pattern = 'data/glove.6B.<size>d.txt'
glove_size = 50

In [2]:
def get_glove(size = 50):
    file = re.sub('<size>', str(size), glove_pattern)
    glove = pd.read_csv(file, sep = " ", header = None, index_col = 0, quoting = 3)
    glove = {key: val.values for key, val in glove.T.items()}
    return glove

In [3]:
glove = get_glove(glove_size)
glove_words = set(glove.keys())

In [24]:
def preprocess_sentence(line):
    line = re.sub(r'[^\x00-\x7F]+', '', line.strip())
    return line.strip().lower()

def preprocess_for_labels(line):
    line = preprocess_sentence(line)
    line = re.sub('\W+', '', line.strip())
    return line
    
def get_labelset(file):
    with open(file, "rt", encoding="utf-8") as f:
        sentences = set([preprocess_for_labels(line) for line in f.readlines()])
    return sentences

In [25]:
positive_labelset = get_labelset(positive_file)
negative_labelset = get_labelset(negative_file)

In [61]:
data_split_dict = {'train': [], 'test': [], 'val': []}
data_split_map = {'1': 'train', '2': 'test', '3':'val'}
with open(data_root + 'datasetSplit.txt', 'r') as f:
    for line in f.readlines():
        [key, value] = line.strip().split(',')
        try:
            key = int(key)
        except:
            continue
        if value in data_split_map.keys():
            data_split_dict[data_split_map[value]].append(key)
data_split_dict

{'train': [1,
  2,
  61,
  62,
  63,
  64,
  68,
  72,
  82,
  131,
  132,
  133,
  134,
  135,
  136,
  214,
  228,
  340,
  383,
  387,
  388,
  428,
  429,
  446,
  467,
  468,
  473,
  474,
  486,
  487,
  488,
  514,
  584,
  670,
  692,
  702,
  703,
  704,
  705,
  742,
  748,
  749,
  840,
  893,
  894,
  896,
  897,
  1112,
  1153,
  1154,
  1215,
  1241,
  1256,
  1274,
  1482,
  1489,
  1588,
  1671,
  1672,
  1673,
  1674,
  1675,
  1676,
  1677,
  1678,
  1679,
  1680,
  1681,
  1682,
  1683,
  1684,
  1685,
  1686,
  1687,
  1688,
  1689,
  1690,
  1691,
  1692,
  1693,
  1694,
  1695,
  1696,
  1697,
  1698,
  1699,
  1700,
  1701,
  1702,
  1703,
  1704,
  1705,
  1706,
  1707,
  1708,
  1709,
  1710,
  1711,
  1712,
  1713,
  1714,
  1715,
  1716,
  1717,
  1718,
  1719,
  1720,
  1721,
  1722,
  1723,
  1724,
  1725,
  1726,
  1727,
  1728,
  1729,
  1730,
  1731,
  1732,
  1733,
  1734,
  1735,
  1736,
  1737,
  1738,
  1739,
  1740,
  1741,
  1742,
  1743,
  1744,
 

In [31]:
def check_sentiment(sentence):
    s = preprocess_for_labels(sentence)
    for y, labelset in enumerate([negative_labelset, positive_labelset]):
        for entry in labelset:
            if s in labelset:
                return y
    return -1

sentences = {}
word2ind = {}
ttws = tf.keras.preprocessing.text.text_to_word_sequence
n_skipped = 0
idx = 0
with open(data_root + 'datasetSentences.txt') as data:
    lines = data.readlines()
    for line in lines:
        index = re.search('^\d+', line)
        if index is None:
            continue
        entry = {'position': index.group()}
        line = preprocess_sentence(line)
        line = re.sub(r'^\d+\s+', '', line)
        sentiment = check_sentiment(line)
        if sentiment >= 0:
            entry['y'] = sentiment
        else:
            n_skipped += 1
            continue
        sentences[line] = entry
        tokens = ttws(line)
        for token in tokens:
            if token not in word2ind:
                word2ind[token] = idx
                idx = idx + 1
            
print(n_skipped, ' skipped')
print(len(sentences), ' kept')

2775  skipped
9077  kept


In [48]:
default_vector = np.mean(list(glove.values()), axis = 0)
embedding_matrix = np.empty((len(word2ind), default_vector.shape[0]))
excluded = []
for word, position in word2ind.items():
    if word not in glove:
        excluded.append(word)
    embedding_matrix[position,:] = glove.get(word, default_vector)
print(len(excluded))
print(excluded)

398
['wisegirls', 'enrapturing', 'suspenser', 'obviation', 'gorefests', 'waydowntown', 'makmalbaf', 'exhilarate', 'nuttgens', 'shapelessly', 'mnch', 'addessi', 'seldahl', 'wollter', 'mullinski', 'precollegiate', 'sparklingly', 'superlarge', 'destinees', 'almodvar', 'dominatrixes', 'scuzbag', 'idoosyncratic', 'watstein', 'sappier', 'exporing', 'cadness', 'shagster', 'powaqqatsi', 'kaputschnik', 'travil', 'splittingly', 'aborbing', 'monkeyfun', 'bierbichler', 'crummles', 'bustingly', 'deutchland', 'datedness', 'inhospitability', 'hastier', 'garca', 'talancn', 'montias', 'hotdogging', 'stumblings', 'birot', 'alientation', 'amlie', 'sogginess', 'involvingly', 'wifty', 'gerbosi', 'stuffiest', 'timewaster', 'naivet', 'strafings', 'debuter', 'soaringly', 'outgag', 'pulpiness', 'haphazardness', 'kibbitzes', 'cineasts', 'intacto', 'unconned', 'overmanipulative', 'schtte', 'crappola', 'fizzability', 'sytle', 'stoppingly', 'choquart', 'captivatingly', 'fillm', 'unreligious', 'anteing', 'marcken',

In [43]:
words_x = []
labels = []
for line, entry in sentences.items():
    words_x.append([word2ind[token] for token in ttws(line)])
    labels.append(entry['y'])

['sentence_index', 'splitset_label']
['1', '1']
['2', '1']
['3', '2']
['4', '2']
['5', '2']
['6', '2']
['7', '2']
['8', '2']
['9', '2']
['10', '2']
['11', '2']
['12', '2']
['13', '2']
['14', '2']
['15', '2']
['16', '2']
['17', '2']
['18', '2']
['19', '2']
['20', '2']
['21', '2']
['22', '2']
['23', '2']
['24', '2']
['25', '2']
['26', '2']
['27', '2']
['28', '2']
['29', '2']
['30', '2']
['31', '2']
['32', '2']
['33', '2']
['34', '2']
['35', '2']
['36', '2']
['37', '2']
['38', '2']
['39', '2']
['40', '2']
['41', '2']
['42', '2']
['43', '2']
['44', '2']
['45', '2']
['46', '2']
['47', '2']
['48', '2']
['49', '2']
['50', '2']
['51', '2']
['52', '2']
['53', '2']
['54', '2']
['55', '2']
['56', '2']
['57', '2']
['58', '2']
['59', '2']
['60', '2']
['61', '1']
['62', '1']
['63', '1']
['64', '1']
['65', '2']
['66', '2']
['67', '2']
['68', '1']
['69', '2']
['70', '2']
['71', '2']
['72', '1']
['73', '2']
['74', '2']
['75', '2']
['76', '2']
['77', '2']
['78', '2']
['79', '2']
['80', '2']
['81', '2']


['4882', '1']
['4883', '1']
['4884', '1']
['4885', '1']
['4886', '1']
['4887', '1']
['4888', '1']
['4889', '1']
['4890', '1']
['4891', '1']
['4892', '1']
['4893', '1']
['4894', '1']
['4895', '1']
['4896', '1']
['4897', '1']
['4898', '1']
['4899', '1']
['4900', '1']
['4901', '1']
['4902', '1']
['4903', '1']
['4904', '1']
['4905', '1']
['4906', '1']
['4907', '1']
['4908', '1']
['4909', '1']
['4910', '1']
['4911', '1']
['4912', '1']
['4913', '1']
['4914', '1']
['4915', '1']
['4916', '1']
['4917', '1']
['4918', '1']
['4919', '1']
['4920', '1']
['4921', '1']
['4922', '1']
['4923', '1']
['4924', '1']
['4925', '1']
['4926', '1']
['4927', '1']
['4928', '1']
['4929', '1']
['4930', '1']
['4931', '1']
['4932', '1']
['4933', '1']
['4934', '1']
['4935', '1']
['4936', '1']
['4937', '1']
['4938', '1']
['4939', '1']
['4940', '1']
['4941', '1']
['4942', '1']
['4943', '1']
['4944', '1']
['4945', '1']
['4946', '1']
['4947', '1']
['4948', '1']
['4949', '1']
['4950', '1']
['4951', '1']
['4952', '1']
['4953

['9806', '1']
['9807', '1']
['9808', '1']
['9809', '1']
['9810', '1']
['9811', '1']
['9812', '1']
['9813', '1']
['9814', '1']
['9815', '1']
['9816', '1']
['9817', '1']
['9818', '1']
['9819', '1']
['9820', '1']
['9821', '1']
['9822', '1']
['9823', '1']
['9824', '1']
['9825', '1']
['9826', '1']
['9827', '1']
['9828', '1']
['9829', '1']
['9830', '1']
['9831', '1']
['9832', '1']
['9833', '1']
['9834', '1']
['9835', '1']
['9836', '1']
['9837', '1']
['9838', '1']
['9839', '1']
['9840', '1']
['9841', '1']
['9842', '1']
['9843', '1']
['9844', '1']
['9845', '1']
['9846', '1']
['9847', '1']
['9848', '1']
['9849', '1']
['9850', '1']
['9851', '1']
['9852', '1']
['9853', '1']
['9854', '1']
['9855', '1']
['9856', '1']
['9857', '1']
['9858', '1']
['9859', '1']
['9860', '1']
['9861', '1']
['9862', '1']
['9863', '1']
['9864', '1']
['9865', '1']
['9866', '1']
['9867', '1']
['9868', '1']
['9869', '1']
['9870', '1']
['9871', '1']
['9872', '1']
['9873', '1']
['9874', '1']
['9875', '1']
['9876', '1']
['9877

In [112]:
n_skipped

6791