# Parse Raw Data

In this notebook, we will parse sentiment analysis raw data in the form of serialized tree structure into integer sequences, which will be used for training our models later.

* An example of a raw data point:

```
(4 (4 (2 Enormously) (3 (4 entertaining) (4 (2 for) (3 (2 moviegoers) (2 (2 of) (3 (2 any) (2 age))))))) (2 .))
```

* After being parsed, the above raw data point is transformed into four integer sequences
    * `word index sequence`: [0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, 7, -1],
    * `parent index sequence`: [12, 11, 10, 9, 8, 7, 7, 8, 9, 10, 11, 12, 14, 14, -1],
    * `relation indicator sequence`: [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, -1], 
    * `label sequence`: [-1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1]
    
    
* Note that all four sequences are used only when training the Recursive Neural Network. For training the Recurrent Neural Network and the Benchmark Model, we use only the `word index sequence` and the `label sequence`, because only the Recursive Neural Network exploits the parent-child relationships during training.

In [5]:
import json
import os
import string
import sys

import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## Define data parsing class

**Define class `TreeNode` that transforms each serialized training/test example into a tree structure**

In [14]:
class TreeNode:
    """A node of a tree read from a serialized, parenthesised string.

    A serialized node looks like ``(<label digit> <word>)`` for a leaf or
    ``(<label digit> <child> <child>)`` for an inner node, e.g.
    ``(4 (2 good) (2 .))``.
    """

    def __init__(self):
        self.wordidx = -1   # index of this node's word in word2idx; stays -1 if the node has no word
        self.label = -1     # label digit read by parse(); -1 until parsed
        self.idx = -1       # node index; not assigned by this class
        self.children = []  # child nodes in order of appearance; defined here so an unparsed node is still usable

    def parse(self, line, startIdx, word2idx):
        """
         Parses a segment of the character array into this tree node. The parse starts from the
         specified index and returns the end index when the parsing completes.

         @param line the character array (string) to parse
         @param startIdx index of this node's label digit, i.e. the position right after its '('
         @param word2idx dict mapping lower-cased word -> integer index; unseen words are added
                in place and receive the next free index (the current size of the dict)
         @return the index of the ')' that closes this node, or -1 if the end of the line is
                 reached before the node is closed (unbalanced input)
        """
        # The label is a single character; a non-digit here raises ValueError.
        self.label = int(line[startIdx])

        self.children = []
        cword = []
        idx = startIdx + 1
        while idx < len(line):
            if line[idx] == '(':
                tn = TreeNode()
                self.children.append(tn)
                child_end = tn.parse(line, idx + 1, word2idx)
                if child_end < 0:
                    # The child ran off the end of the line. Propagate the failure instead of
                    # continuing from index 0, which would re-parse the line forever.
                    return -1
                idx = child_end + 1
            elif line[idx] == ')':
                # Everything collected outside nested parentheses is this node's word;
                # inner nodes only collect whitespace, which strips to an empty string.
                word = ''.join(cword).strip().lower()
                if word:
                    self.wordidx = word2idx.setdefault(word, len(word2idx))
                return idx
            else:
                cword.append(line[idx])
                idx += 1

        return -1
        

**Define function `add_idx_to_tree` that adds an index to each node of a given tree**

In [15]:
def add_idx_to_tree(tree, current_idx):
    """Number the nodes of `tree` in post-order, starting from `current_idx`.

    Both children of a node are numbered before the node itself, so a node's
    `idx` is always larger than the `idx` of anything below it. A node that has
    children is expected to have two of them.

    Returns the next unused index (unchanged when `tree` is None).
    """
    if tree is None:
        return current_idx

    next_idx = current_idx
    if len(tree.children) > 0:
        # first child, then second child
        for position in (0, 1):
            next_idx = add_idx_to_tree(tree.children[position], next_idx)

    tree.idx = next_idx
    return next_idx + 1

In [16]:
# Build the sample tree used by the demo cells, so the notebook runs on a fresh kernel
# instead of relying on a `root` left over from an earlier session.
root = TreeNode()
root.parse("(4 (3 (2 A) (4 (4 (2 (2 deep) (2 and)) (3 meaningful)) (2 film))) (2 .))", 1, {})

# The return value is the next unused index, i.e. the number of nodes in the tree.
add_idx_to_tree(root, 0)

11

**Define function `tree2list` that transforms each tree into 4 integer sequences**
* The 4 sequences are: words, parents, relations, labels
* Together, the 4 sequences represent one training/test example

In [17]:
def tree2list(tree, parent_idx, is_binary=False, is_left=False, is_right=False):
    """Flatten `tree` into four parallel lists in post-order (left, right, node).

    Returns a tuple ``(words, parents, relations, labels)`` with one entry per node:

    * words     -- the node's `wordidx`
    * parents   -- the `idx` of the node's parent (`parent_idx` for `tree` itself)
    * relations -- 0 if the node is a left child, 1 if it is a right child, -1 otherwise
    * labels    -- the node's `label`; when `is_binary` is true it is mapped to
                   1 (label > 2), 0 (label < 2) or -1 (label == 2)

    A None `tree` yields four empty lists. A node that has children is expected
    to have two of them.
    """
    if tree is None:
        return [], [], [], []

    word = tree.wordidx
    relation = 0 if is_left else (1 if is_right else -1)

    if len(tree.children) > 0:
        left_child, right_child = tree.children[0], tree.children[1]
    else:
        left_child = right_child = None

    left_seqs = tree2list(left_child, tree.idx, is_binary, is_left=True)
    right_seqs = tree2list(right_child, tree.idx, is_binary, is_right=True)

    if not is_binary:
        node_label = tree.label
    elif tree.label > 2:
        node_label = 1
    elif tree.label < 2:
        node_label = 0
    else:
        node_label = -1

    # Append this node's own entry after both subtrees, one sequence at a time.
    own_entries = (word, parent_idx, relation, node_label)
    return tuple(
        from_left + from_right + [own]
        for from_left, from_right, own in zip(left_seqs, right_seqs, own_entries)
    )

In [18]:
# Test: flatten the sample tree with binary labels and show the four sequences
wordidx, parents, relations, labels = tree2list(root, -1, is_binary=True)
for sequence in (wordidx, parents, relations, labels):
    print(sequence)

[0, 1, 2, -1, 3, -1, 4, -1, -1, 5, -1]
[8, 3, 3, 5, 5, 7, 7, 8, 10, 10, -1]
[0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1]
[-1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1]


In [19]:
def save_data(data=None, data_file=None):
    """Write `data` to `data_file` as JSON.

    Does nothing when either argument is None. Parent directories of
    `data_file` that do not exist yet are created first.

    Note that JSON has no integer keys or tuples: integer dict keys are written
    as strings and tuples as arrays, so the data read back later differs in
    those respects from what was passed in.
    """
    # Identity checks: `== None` would invoke the argument's own __eq__.
    if data is None or data_file is None:
        return

    parent_dir = os.path.dirname(data_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(data_file, 'w') as f:
        json.dump(data, f)

**Test Saving Data**

In [20]:
# Test: run the whole pipeline (parse -> index -> flatten -> save) on two sample sentences

input1 = "(4 (4 (2 Enormously) (3 (4 entertaining) (4 (2 for) (3 (2 moviegoers) (2 (2 of) (3 (2 any) (2 age))))))) (2 .))"
input2 = "(4 (3 (2 A) (4 (4 (2 (2 deep) (2 and)) (3 meaningful)) (2 film))) (2 .))"

# Parse both sentences with one shared vocabulary
word2idx = {}
t1 = TreeNode()
t1.parse(input1, 1, word2idx)
t2 = TreeNode()
t2.parse(input2, 1, word2idx)
train = [t1, t2]

# Node indices restart at 0 for every tree
for t in train:
    add_idx_to_tree(t, 0)

# Binary-label variant first: `train` is replaced by its flattened form right after
train_b = {idx: tree2list(t, -1, is_binary=True) for idx, t in enumerate(train)}
train = {idx: tree2list(t, -1, is_binary=False) for idx, t in enumerate(train)}

print(word2idx)
print(train_b)
print(train)

save_data(word2idx, "./data/test_saving/sentiment_word2idx.json")
save_data(train_b, "./data/test_saving/sentiment_binary_train.json")
save_data(train, "./data/test_saving/sentiment_train.json")
# print(train[0])
print("Saved")

{'enormously': 0, 'entertaining': 1, 'for': 2, 'moviegoers': 3, 'of': 4, 'any': 5, 'age': 6, '.': 7, 'a': 8, 'deep': 9, 'and': 10, 'meaningful': 11, 'film': 12}
{0: ([0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, 7, -1], [12, 11, 10, 9, 8, 7, 7, 8, 9, 10, 11, 12, 14, 14, -1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, -1], [-1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1]), 1: ([8, 9, 10, -1, 11, -1, 12, -1, -1, 7, -1], [8, 3, 3, 5, 5, 7, 7, 8, 10, 10, -1], [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1], [-1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1])}
{0: ([0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, 7, -1], [12, 11, 10, 9, 8, 7, 7, 8, 9, 10, 11, 12, 14, 14, -1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, -1], [2, 4, 2, 2, 2, 2, 2, 3, 2, 3, 4, 3, 4, 2, 4]), 1: ([8, 9, 10, -1, 11, -1, 12, -1, -1, 7, -1], [8, 3, 3, 5, 5, 7, 7, 8, 10, 10, -1], [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1], [2, 2, 2, 2, 3, 4, 2, 4, 3, 2, 4])}
Saved


## Parse the raw data

In [21]:
word2idx = {}
folder_read = './data/large_files/stanford_sentiment/trees/'
folder_write = './data/large_files/stanford_sentiment/parsed_data/'


def load_trees(path, word2idx):
    """Parse every non-empty line of the file at `path` into a TreeNode.

    New words are added to `word2idx` in place. Returns the list of parsed
    trees in file order.
    """
    trees = []
    # The context manager closes the file even if parsing a line fails.
    with open(path) as f:
        for line in f:
            line = line.rstrip()
            if line:
                t = TreeNode()
                t.parse(line, 1, word2idx)
                trees.append(t)
    return trees


# Parse raw data and store parsed data.
# Train is read before test so that word indices are assigned in that order.
train = load_trees(folder_read + 'train.txt', word2idx)
test = load_trees(folder_read + 'test.txt', word2idx)

for t in train:
    add_idx_to_tree(t, 0)
# The binary variant is built first because `train` is replaced by its flattened form next.
train_b = {idx: tree2list(t, -1, is_binary=True) for idx, t in enumerate(train)}
train = {idx: tree2list(t, -1, is_binary=False) for idx, t in enumerate(train)}

for t in test:
    add_idx_to_tree(t, 0)
test_b = {idx: tree2list(t, -1, is_binary=True) for idx, t in enumerate(test)}
test = {idx: tree2list(t, -1, is_binary=False) for idx, t in enumerate(test)}

print(len(train_b))
print(len(test_b))

print("finished")

8544
2210
finished


## Save data for future use

In [22]:
print("Start Saving Data RNTN ...")
# Each parsed object together with the file name it is written to inside folder_write
outputs = [
    (word2idx, "sentiment_word2idx.json"),
    (train_b, "sentiment_binary_train.json"),
    (train, "sentiment_train.json"),
    (test_b, "sentiment_binary_test.json"),
    (test, "sentiment_test.json"),
]
for payload, file_name in outputs:
    save_data(payload, folder_write + file_name)
print("Data Saving Finished")

Start Saving Data RNTN ...
Data Saving Finished
