In [131]:
import numpy as np
import sys
import string
import json

In [144]:
class TreeNode:
    def __init__(self):
        self.wordidx = -1
        self.label = -1
        self.idx = -1
        
    def parse(self, line, startIdx, word2idx):
        """
         Parses segment of the character array to a tree node. The parse starts from the specified
         index and returns the end index when the parsing completes.
         
         @param line the character array to parse
         @param startIdx start index of the array to parse
         @param word2idx 
         @return the end index of the array when completing the array
        """
        self.label = int(line[startIdx])
        
        self.children = []
        cword = []
        idx = startIdx + 1
        while idx < len(line):
            if line[idx] == '(':
                tn = TreeNode()
                self.children.append(tn)
                idx = tn.parse(line, idx + 1, word2idx) + 1
            elif line[idx] ==')':
                temp = ''.join(cword).strip()
                word = temp if len(temp) > 0 else None
                if word != None:
                    word = word.lower()
                    if word not in word2idx:
                        word2idx[word] = len(word2idx)
                    self.wordidx = word2idx[word]
                return idx
            else:
                cword.append(line[idx])
                idx += 1
        
        return -1
        

In [145]:
import queue
from queue import Queue

def bfs(root):
    
    queue = Queue()
    queue.put(root)
    curr_num = 1
    next_num = 0
    
    level_tracker = []
    while not queue.empty():
        t = queue.get()
        level_tracker.append(t)
        curr_num -= 1 
        num_children = len(t.children)
        if num_children > 0 :
            left = t.children[0]
            right = t.children[1]
            queue.put(left)
            queue.put(right)
            next_num+=2
        
        if curr_num == 0:
            curr_num = next_num
            next_num = 0
            for e in level_tracker:
                print(e.wordidx, e.label, e.idx, end='   ')
            print()
            level_tracker = []


In [146]:
input = "(4 (3 (2 A) (4 (4 (2 (2 deep) (2 and)) (3 meaningful)) (2 film))) (2 .))"

root = TreeNode()
word2idx = {}
root.parse(input, 1, word2idx)
bfs(root)
print(word2idx)

-1 4 -1   
-1 3 -1   5 2 -1   
0 2 -1   -1 4 -1   
-1 4 -1   4 2 -1   
-1 2 -1   3 3 -1   
1 2 -1   2 2 -1   
{'.': 5, 'film': 4, 'a': 0, 'and': 2, 'meaningful': 3, 'deep': 1}


In [147]:
def add_idx_to_tree(tree, current_idx):
    if tree is None:
        return current_idx
    
    num_children = len(tree.children)
    if num_children > 0 :
        current_idx = add_idx_to_tree(tree.children[0], current_idx)
        current_idx = add_idx_to_tree(tree.children[1], current_idx)
    
    tree.idx = current_idx
    current_idx+=1
    return current_idx


In [148]:
add_idx_to_tree(root, 0)
bfs(root)

-1 4 10   
-1 3 8   5 2 9   
0 2 0   -1 4 7   
-1 4 5   4 2 6   
-1 2 3   3 3 4   
1 2 1   2 2 2   


In [149]:
def tree2list(tree, parent_idx, is_binary=False, is_left=False, is_right=False):
    if tree is None:
        return [], [], [], []
    
    w = tree.wordidx
    if is_left:
        r = 0
    elif is_right:
        r = 1
    else:
        r = -1
     
    num_children = len(tree.children)
    if num_children > 0 :
        left = tree.children[0]
        right = tree.children[1] 
    else:
        left = None
        right = None
        
    words_left, parents_left, relations_left, labels_left = tree2list(left, tree.idx, is_binary, is_left=True)
    words_right, parents_right, relations_right, labels_right = tree2list(right, tree.idx, is_binary, is_right=True)
        
    words = words_left + words_right + [w]
    parents = parents_left + parents_right + [parent_idx]
    relations = relations_left + relations_right + [r]
    if is_binary:
        if tree.label > 2:
            label = 1
        elif tree.label < 2:
            label = 0
        else:
            label = -1
    else:
        label = tree.label
    labels = labels_left + labels_right + [label]
    
    return words, parents, relations, labels

In [150]:
wordidx, parents, relations, labels = tree2list(root, -1, is_binary=True)
print(wordidx)
print(parents)
print(relations)
print(labels)


[0, 1, 2, -1, 3, -1, 4, -1, -1, 5, -1]
[8, 3, 3, 5, 5, 7, 7, 8, 10, 10, -1]
[0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1]
[-1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1]


In [151]:
def save_data(data=None, data_file=None):
    if data == None or data_file == None:
        return
    with open(data_file, 'w') as f:
        json.dump(data, f)

**Test Saving Data**

In [152]:
input1 = "(4 (4 (2 Enormously) (3 (4 entertaining) (4 (2 for) (3 (2 moviegoers) (2 (2 of) (3 (2 any) (2 age))))))) (2 .))"
input2 = "(4 (3 (2 A) (4 (4 (2 (2 deep) (2 and)) (3 meaningful)) (2 film))) (2 .))"

word2idx = {}
train = []
t1 = TreeNode();
t1.parse(input1, 1, word2idx)
train.append(t1)

t2 = TreeNode();
t2.parse(input2, 1, word2idx)
train.append(t2)

for t in train:
    add_idx_to_tree(t, 0)
    
train_b = {idx : tree2list(t, -1, is_binary=True) for t, idx in zip(train, range(len(train)))}
train = {idx : tree2list(t, -1, is_binary=False) for t, idx in zip(train, range(len(train)))}

print(word2idx)
print(train_b)
print(train)

save_data(word2idx, "sentiment_word2idx.json")
save_data(train_b, "sentiment_binary_train.json")
save_data(train, "sentiment_train.json")
# print(train[0])
print("Saved")

{'a': 8, 'moviegoers': 3, 'entertaining': 1, 'meaningful': 11, '.': 7, 'of': 4, 'any': 5, 'film': 12, 'age': 6, 'enormously': 0, 'and': 10, 'for': 2, 'deep': 9}
{0: ([0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, 7, -1], [12, 11, 10, 9, 8, 7, 7, 8, 9, 10, 11, 12, 14, 14, -1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, -1], [-1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, 1]), 1: ([8, 9, 10, -1, 11, -1, 12, -1, -1, 7, -1], [8, 3, 3, 5, 5, 7, 7, 8, 10, 10, -1], [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1], [-1, -1, -1, -1, 1, 1, -1, 1, 1, -1, 1])}
{0: ([0, 1, 2, 3, 4, 5, 6, -1, -1, -1, -1, -1, -1, 7, -1], [12, 11, 10, 9, 8, 7, 7, 8, 9, 10, 11, 12, 14, 14, -1], [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, -1], [2, 4, 2, 2, 2, 2, 2, 3, 2, 3, 4, 3, 4, 2, 4]), 1: ([8, 9, 10, -1, 11, -1, 12, -1, -1, 7, -1], [8, 3, 3, 5, 5, 7, 7, 8, 10, 10, -1], [0, 0, 1, 0, 1, 0, 1, 1, 0, 1, -1], [2, 2, 2, 2, 3, 4, 2, 4, 3, 2, 4])}
Saved


****

In [153]:
word2idx = {}
train = []
test = []
folder_read = '../data/large_files/stanford_sentiment/trees/'
folder_write = '../data/large_files/stanford_sentiment/parsed_data/'

for line in open(folder_read + 'train.txt'):
    line = line.rstrip()
    if line:
        t = TreeNode();
        t.parse(line, 1, word2idx)
        train.append(t)

for line in open(folder_read + 'test.txt'):
    line = line.rstrip()
    if line:
        t = TreeNode();
        t.parse(line, 1, word2idx)
        test.append(t)

for t in train:
    add_idx_to_tree(t, 0)
train_b = {idx : tree2list(t, -1, is_binary=True) for t, idx in zip(train, range(len(train)))}
train = {idx : tree2list(t, -1, is_binary=False) for t, idx in zip(train, range(len(train)))}

for t in test:
    add_idx_to_tree(t, 0)
test_b = {idx : tree2list(t, -1, is_binary=True) for t, idx in zip(test, range(len(test)))}
test = {idx : tree2list(t, -1, is_binary=False) for t, idx in zip(test, range(len(test)))}

print(len(train_b))
print(len(test_b))

# print(len(word2idx)
print("finished")

8544
2210
finished


In [142]:
print("Start Saving Data RNTN ...")
save_data(word2idx, folder_write + "sentiment_word2idx.json")
save_data(train_b, folder_write + "sentiment_binary_train.json")
save_data(train, folder_write + "sentiment_train.json")
save_data(test_b, folder_write + "sentiment_binary_test.json")
save_data(test, folder_write + "sentiment_test.json")
print("Data Saving Finished")

Start Saving Data RNTN ...
Data Saving Finished


In [154]:
def get_binary_sample(samples:dict):
    ssamples = {}
    for k, v in samples.items():
        if v[3][-1] != -1:
            ssamples[k] = v
    return ssamples
        
train_b = get_binary_sample(train_b)
test_b = get_binary_sample(test_b)

print("After filtering: # of training samples and # of test samples")
print(len(train_b))
print(len(test_b))

After filtering: # of training samples and # of test samples
6920
1821


In [155]:
def remove_punctuation(s):
    return s.translate(str.maketrans('','',string.punctuation))

sentence = "how ! are you --"
print(remove_punctuation(sentence))

how  are you 


In [156]:
def get_comment(wordidx, idx2word:dict):
    wordlist = []
    for idx in wordidx:
        if idx != -1:
            token = idx2word[idx]
            if token not in string.punctuation:
                wordlist.append(token)
    return wordlist

In [157]:
def get_comments_samples(samples:dict, idx2word:dict):
    comments = []
    targets = []
    for _, v in samples.items():
        if v[3][-1] != -1:
            comment = " ".join(get_comment(v[0], idx2word))
            label = v[3][-1]
            comments.append(comment)
            targets.append(label) 
    return comments, targets

In [158]:
idx2word = {v:k for k, v in word2idx.items()}
train_comments, train_targets = get_comments_samples(train_b, idx2word)
test_comments, test_targets = get_comments_samples(test_b, idx2word)

count0 = 0
count1 = 0
count2 = 0
for i in range(len(train_comments)):

    if train_targets[i] == 0:
        count0 += 1
    elif train_targets[i] == 1:
        count1 += 1
    else:
        count2 += 1
#     print(i, comments[i], targets[i])
    
print("0", count0)
print("1", count1)
print("-1", count2)

0 3310
1 3610
-1 0


In [159]:
import nltk

print(string.punctuation)
print(nltk.word_tokenize(comment))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
['in', 'this', 'case', 'zero']


In [161]:
from sklearn.feature_extraction.text import CountVectorizer

all_comments = train_comments + test_comments

print("total # of comments:", len(all_comments))
# Initialize a CoutVectorizer to use NLTK's tokenizer instead of its 
# default one (which ignores punctuation and stopwords). 
# Minimum document frequency set to 1. 
foovec = CountVectorizer()
# sents turned into sparse vector of word frequency counts
foovec = foovec.fit(all_comments)

train_comments = foovec.transform(train_comments)
test_comments = foovec.transform(test_comments)

# foovec now contains vocab dictionary which maps unique words to indexes
print(foovec.vocabulary_)
# print(foovec.stop_words_ )
# sents_counts has a dimension of 3 (document count) by 19 (# of unique words)
print(sents_counts.shape)

# print("Start Saving Data")
# print("Data Saving Finished")

ValueError: inconsistent shapes

In [123]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train = tfidf_transformer.fit_transform(train_comments)
X_test = tfidf_transformer.fit_transform(test_comments)
# print(sents_tfidf.toarray()[0, 200:600])

print(X_train.shape[0])
print(X_test.shape[0])

6920
1821


In [124]:
# Now ready to build a classifier. 
# We will use Multinominal Naive Bayes as our model
from sklearn.naive_bayes import MultinomialNB

# Train a Multimoda Naive Bayes classifier
clf = MultinomialNB().fit(X_train, train_targets)

In [125]:
# Predicting the Test set results, find accuracy
from sklearn.metrics  import accuracy_score
y_pred = clf.predict(X_test)
accuracy_score(test_targets, y_pred)

0.81274025260845684

In [127]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test_targets, y_pred)
cm

array([[679, 233],
       [108, 801]])

In [128]:
from sklearn.svm import SVC
classifier_rbf = SVC(kernel='linear').fit(X_train, train_targets)
y_pred = classifier_rbf.predict(X_test)
accuracy_score(test_targets, y_pred)

0.81219110378912684

In [129]:
cm = confusion_matrix(test_targets, y_pred)
cm

array([[727, 185],
       [157, 752]])

In [101]:
from sklearn.model_selection import GridSearchCV
from time import time

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e3, 5e3, 1e4, 5e4, 1e5],
              'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], }
clf = GridSearchCV(SVC(kernel='rbf', class_weight='balanced'), param_grid)
clf = clf.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
done in 453.360s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.0005, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [102]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.77962427745664742