In [4]:
import numpy as np
import sys
import string
import json

In [5]:
def load_data(data_file=None):
    if data_file == None:
        return
    with open(data_file) as f:
        data = json.load(f)
    return data

In [6]:
folder = './data/large_files/stanford_sentiment/parsed_data/'
word2idx = load_data(folder + "sentiment_word2idx.json")
sentiment_binary_train = load_data(folder + "sentiment_binary_train.json")
sentiment_train = load_data(folder + "sentiment_train.json")
sentiment_binary_test = load_data(folder + "sentiment_binary_test.json")
sentiment_test = load_data(folder + "sentiment_test.json")

In [7]:
# the loaded samples has three type of labels -1,0,1, in which -1 indicates neutral sentiment.
# We exclude samples with neutral sentiment.
def exclude_neutral_sample(samples:dict):
    ssamples = {}
    for k, v in samples.items():
        if v[3][-1] != -1:
            ssamples[k] = v
    return ssamples
        
train_b = exclude_neutral_sample(sentiment_binary_train)
test_b = exclude_neutral_sample(sentiment_binary_test)

print("After filtering: # of training samples and # of test samples")
print("# of traing samples: ", len(train_b))
print("# of test samples: ", len(test_b))

After filtering: # of training samples and # of test samples
# of traing samples:  6920
# of test samples:  1821


In [8]:
def get_comment(wordidx, idx2word:dict):
    wordlist = []
    for idx in wordidx:
        if idx != -1:
            token = idx2word[idx]
            if token not in string.punctuation:
                wordlist.append(token)
    return wordlist

In [9]:
def get_comments_samples(samples:dict, idx2word:dict):
    comments = []
    targets = []
    for _, v in samples.items():
        if v[3][-1] != -1:
            comment = " ".join(get_comment(v[0], idx2word))
            label = v[3][-1]
            comments.append(comment)
            targets.append(label) 
    return comments, targets

In [10]:
idx2word = {v:k for k, v in word2idx.items()}
train_comments_0, train_targets = get_comments_samples(train_b, idx2word)
test_comments_0, test_targets = get_comments_samples(test_b, idx2word)

count0 = 0
count1 = 0
count2 = 0
for i in range(len(train_comments_0)):

    if train_targets[i] == 0:
        count0 += 1
    elif train_targets[i] == 1:
        count1 += 1
    else:
        count2 += 1
#     print(i, comments[i], targets[i])
    
print("0", count0)
print("1", count1)
print("-1", count2)

0 3310
1 3610
-1 0


In [15]:
print(type(train_comments_0))
print(train_comments_0[1])
print(train_targets[1])

<class 'list'>
an undeniably gorgeous terminally smitten document of a troubadour his acolytes and the triumph of his band
1


In [16]:
vocabulary_size = len(idx2word)
print('vocabulary_size' , vocabulary_size)

vocabulary_size 18647


In [17]:
all_text1 = ' '.join(train_comments_0)
all_text2 = ' '.join(test_comments_0)
words1 = all_text1.split()
words2 = all_text2.split()
words = words1 + words2

In [18]:
# Create your dictionary that maps vocab words to integers here
from collections import Counter
counts = Counter(words)
print(type(counts))

vocab = sorted(counts, key=counts.get, reverse=True)

# Note that index start from 1
vocab_to_int = {word:i for i, word in enumerate(vocab, 1)}

<class 'collections.Counter'>


In [19]:
print("type of vocab", type(vocab))
print('total # of words: ', len(vocab_to_int))
print("first word:", vocab[0])
print("last word:", vocab[-1])
print("first word index:", vocab_to_int[vocab[0]])
print("last word index", vocab_to_int[vocab[-1]])

type of vocab <class 'list'>
total # of words:  16750
first word: the
last word: infantilized
first word index: 1
last word index 16750


In [33]:
index2word = {idx:word for word, idx in vocab_to_int.items()}

In [20]:
def convert_to_int(reviews, vocab_to_int):
    # Convert the reviews to integers, same shape as reviews list, but with integers
    print('# of reviews before index: ', len(reviews))
    reviews_ints = []
    for review in reviews:
        reviews_ints.append([vocab_to_int[word] for word in review.split()])

    print('# of reviews after index: ', len(reviews_ints))
    return reviews_ints

In [30]:
x_train = convert_to_int(train_comments_0, vocab_to_int)

# of reviews before index:  6920
# of reviews after index:  6920


In [34]:
print(x_train[7])
print(train_comments_0[7])
text = [index2word[idx] for idx in x_train[7]]
print(text)

[137, 2, 352, 6, 2294, 16, 37, 935, 4, 3194, 14, 1305, 3, 3985, 537, 239, 363, 4, 191, 8, 105, 208]
such a premise is ripe for all manner of lunacy but kaufman and gondry rarely seem sure of where it should go
['such', 'a', 'premise', 'is', 'ripe', 'for', 'all', 'manner', 'of', 'lunacy', 'but', 'kaufman', 'and', 'gondry', 'rarely', 'seem', 'sure', 'of', 'where', 'it', 'should', 'go']


In [35]:
x_test = convert_to_int(test_comments_0, vocab_to_int)

# of reviews before index:  1821
# of reviews after index:  1821


In [None]:
from collections import Counter

# Create length to frequency map
review_lens = Counter([len(x) for x in x_train])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))