In [0]:
import os
import sys
import codecs
import operator
import numpy as np
import re
from time import time

In [0]:
# Colab-only step: mount Google Drive under /drive so the dataset paths below resolve.
# This prompts for an authorization code interactively (see the cell output).
from google.colab import drive
drive.mount('/drive',force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /drive


In [0]:
# Directory with the aspect-level '<domain>_<phase>_{sentence,polarity,term}.txt' files;
# the pickled outputs at the end of the notebook are written here as well.
aspect_path = '/drive/My Drive/Deep Learning Course/practice-5-data/aspect_level-sentiment/aspect_level/'

In [0]:
# Directory with the document-level text/label files
# (yelp14_text.txt / yelp14_label.txt, amazon_electronics_text.txt / amazon_electronics_label.txt).
doc_path = '/drive/My Drive/Deep Learning Course/practice-5-data/doc_level-sentiment/doc_level'

## Vocabulary Indexing

### Note: 

Pay attention to how you represent your sequences as input to the RNN model.
With fixed-length input vectors, you will need to pad the shorter sequences with "0".
Consequently, your vocabulary indexing must reserve index "0" for padding.

### Function to create vocabulary index

### Returns:

A Python dictionary that maps each vocabulary token to its integer index.

In [0]:
# Matches an optionally signed integer or decimal literal, e.g. "42", "-3.5" or "+7.".
# Written as a raw string so "\." reaches the regex engine unchanged; in a normal
# string literal "\." is an invalid escape sequence that Python warns about.
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')


def is_number(token):
    """Return True when ``token`` is a plain numeric literal.

    A token counts as a number when it is an optional sign followed by at least
    one digit, an optional dot and optional further digits. Forms such as
    ".5", "1e5" or "1,000" are not treated as numbers.

    Args:
        token: The string to test.

    Returns:
        bool: True if ``token`` matches ``num_regex``, False otherwise.
    """
    return bool(num_regex.match(token))


def create_vocab(domain, aspect_path, doc_path, maxlen=0, vocab_size=0):
    """Build a word -> index vocabulary from the aspect-level and document-level text.

    Word frequencies are counted over the domain's train and test sentence files
    plus the matching document-level text file ('amazon_electronics_text.txt' for
    'lt_14', 'yelp14_text.txt' otherwise). Numeric tokens are not counted.
    Indices 0, 1 and 2 are reserved for '<pad>', '<unk>' and '<num>'; the
    remaining words receive indices 3, 4, ... in order of decreasing frequency.

    Args:
        domain: One of 'res_14', 'lt_14', 'res_15', 'res_16'.
        aspect_path: Directory holding '<domain>_train_sentence.txt' and
            '<domain>_test_sentence.txt'.
        doc_path: Directory holding the document-level text file.
        maxlen: If > 0, lines with more than ``maxlen`` tokens are ignored.
        vocab_size: If > 0, keep only the ``vocab_size`` most frequent words
            (in addition to the three reserved tokens).

    Returns:
        dict: Mapping from token to integer index.

    Raises:
        AssertionError: If ``domain`` is not one of the supported values.
    """
    assert domain in ['res_14', 'lt_14', 'res_15', 'res_16']

    file_list = [os.path.join(aspect_path,'%s_train_sentence.txt'%(domain)),
                 os.path.join(aspect_path,'%s_test_sentence.txt'%(domain))]

    if domain in ['lt_14']:
        file_list.append(os.path.join(doc_path,'amazon_electronics_text.txt'))
    else:
        file_list.append(os.path.join(doc_path,'yelp14_text.txt'))

    print ('Creating vocab ...')

    total_words = 0
    word_freqs = {}

    for path in file_list:
        # The context manager closes each corpus file even if decoding fails midway.
        with codecs.open(path, 'r', 'utf-8') as fin:
            for line in fin:
                words = line.split()
                if maxlen > 0 and len(words) > maxlen:
                    continue
                for w in words:
                    if not is_number(w):
                        word_freqs[w] = word_freqs.get(w, 0) + 1
                        total_words += 1

    unique_words = len(word_freqs)
    print ('  %i total words, %i unique words' % (total_words, unique_words))
    # sorted() is stable, so words with equal counts keep first-seen order.
    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)

    vocab = {'<pad>':0, '<unk>':1, '<num>':2}
    index = len(vocab)
    for word, _ in sorted_word_freqs:
        if word in vocab:
            # A corpus token spelled like a reserved symbol must not steal its index
            # (index 0 in particular has to stay the padding value).
            continue
        vocab[word] = index
        index += 1
        # index started at 3, so this fires once exactly vocab_size words were added.
        if vocab_size > 0 and index > vocab_size + 2:
            break
    if vocab_size > 0:
        print (' keep the top %i words' % vocab_size)

    return vocab

## Sequence Preprocessing

In [0]:
def read_dataset_aspect(domain, aspect_path, phase, vocab, maxlen):
    """Read one aspect-level split and convert sentences and aspect terms to index lists.

    Three parallel files are read line by line:
    '<domain>_<phase>_sentence.txt', '<domain>_<phase>_polarity.txt' and
    '<domain>_<phase>_term.txt'. Numeric tokens map to '<num>', tokens missing
    from ``vocab`` map to '<unk>', and an empty sentence or aspect term becomes
    a single '<unk>'.

    Args:
        domain: One of 'res_14', 'lt_14', 'res_15', 'res_16'.
        aspect_path: Directory containing the three files.
        phase: 'train' or 'test'.
        vocab: Token -> index mapping containing at least '<unk>' and '<num>'.
        maxlen: If > 0, sentences with more than ``maxlen`` tokens are skipped.

    Returns:
        tuple: ``(data_x, data_y, aspect, maxlen_x, maxlen_aspect)`` where
        ``data_x`` and ``aspect`` are lists of index lists, ``data_y`` is a list
        of labels (positive=0, negative=1, neutral=2) and the last two values are
        the longest sentence / aspect length that was kept.

    Raises:
        AssertionError: If ``domain`` or ``phase`` is not a supported value.
        KeyError: If a polarity line is not 'positive', 'negative' or 'neutral'.
    """
    assert domain in ['res_14', 'lt_14', 'res_15', 'res_16']
    assert phase in ['train', 'test']

    print ('Preparing dataset ...')

    data_x, data_y, aspect = [], [], []
    polarity_category = {'positive': 0, 'negative': 1, 'neutral': 2}

    # Both phases use the same naming scheme, so no per-phase branching is needed.
    sentence_file, polarity_file, term_file = [
        os.path.join(aspect_path, '%s_%s_%s.txt' % (domain, phase, kind))
        for kind in ('sentence', 'polarity', 'term')
    ]

    num_hit, unk_hit, total = 0., 0., 0.
    maxlen_x = 0
    maxlen_aspect = 0

    # Decode as UTF-8 (the encoding create_vocab uses) and close all three
    # handles even when a malformed line raises.
    with open(sentence_file, 'r', encoding='utf-8') as f_sentence, \
         open(polarity_file, 'r', encoding='utf-8') as f_polarity, \
         open(term_file, 'r', encoding='utf-8') as f_term:
        # zip stops at the shortest file; the three files are expected to be line-aligned.
        for sentence_line, polarity_line, term_line in zip(f_sentence, f_polarity, f_term):
            content = sentence_line.strip().split()
            polarity = polarity_line.strip()
            aspect_content = term_line.strip().split()

            if maxlen > 0 and len(content) > maxlen:
                continue

            content_indices = []
            if len(content) == 0:
                content_indices.append(vocab['<unk>'])
                unk_hit += 1
            for word in content:
                if is_number(word):
                    content_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    content_indices.append(vocab[word])
                else:
                    content_indices.append(vocab['<unk>'])
                    unk_hit += 1
                total += 1

            data_x.append(content_indices)
            data_y.append(polarity_category[polarity])

            aspect_indices = []
            if len(aspect_content) == 0:
                aspect_indices.append(vocab['<unk>'])
                unk_hit += 1
            for word in aspect_content:
                if is_number(word):
                    aspect_indices.append(vocab['<num>'])
                elif word in vocab:
                    aspect_indices.append(vocab[word])
                else:
                    aspect_indices.append(vocab['<unk>'])
            aspect.append(aspect_indices)

            maxlen_x = max(maxlen_x, len(content_indices))
            maxlen_aspect = max(maxlen_aspect, len(aspect_indices))

    # Guard the statistics against an empty split (no sentence tokens counted).
    denominator = total if total else 1.
    print ('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (100*num_hit/denominator, 100*unk_hit/denominator))
    return data_x, data_y, aspect, maxlen_x, maxlen_aspect


In [0]:
def get_data_aspect(vocab, domain, aspect_path, maxlen=0):
    """Load the aspect-level train and test splits and report their maximum lengths.

    Both splits are read with ``read_dataset_aspect`` using the same ``vocab``
    and ``maxlen``. The larger of the two per-split maxima is printed and
    returned for sentences and for aspect terms.

    Returns:
        tuple: ``(train_x, train_y, train_aspect, test_x, test_y, test_aspect,
        overal_maxlen, overal_maxlen_aspect)``.
    """
    assert domain in ['res_14', 'lt_14', 'res_15', 'res_16']

    train_split = read_dataset_aspect(domain, aspect_path, 'train', vocab, maxlen)
    test_split = read_dataset_aspect(domain, aspect_path, 'test', vocab, maxlen)

    train_x, train_y, train_aspect, train_maxlen, train_maxlen_aspect = train_split
    test_x, test_y, test_aspect, test_maxlen, test_maxlen_aspect = test_split

    # Longest sequence across both splits, for sentences and for aspect terms.
    overal_maxlen = train_maxlen if train_maxlen >= test_maxlen else test_maxlen
    if train_maxlen_aspect >= test_maxlen_aspect:
        overal_maxlen_aspect = train_maxlen_aspect
    else:
        overal_maxlen_aspect = test_maxlen_aspect

    print (' Overal_maxlen: %s' % overal_maxlen)
    print (' Overal_maxlen_aspect:%s '% overal_maxlen_aspect)

    return (train_x, train_y, train_aspect,
            test_x, test_y, test_aspect,
            overal_maxlen, overal_maxlen_aspect)


In [0]:
def create_data(vocab, text_path, label_path, skip_top, skip_len, replace_non_vocab):
    """Convert a document-level corpus and its scores into index sequences and labels.

    ``text_path`` and ``label_path`` are read in parallel, one document and one
    score per line. Scores below 3 become label 1 (negative), scores above 3
    become label 0 (positive) and a score of exactly 3 becomes label 2 (neutral).

    Args:
        vocab: Token -> index mapping containing at least '<unk>' and '<num>'.
        text_path: UTF-8 text file with one document per line.
        label_path: UTF-8 file with one numeric score per line.
        skip_top: If > 0, in-vocabulary words whose index is below
            ``skip_top + 3`` are dropped from the sequence.
        skip_len: If > 0, documents with more than ``skip_len`` tokens are skipped.
        replace_non_vocab: If truthy, out-of-vocabulary words are replaced by
            '<unk>'; otherwise they are dropped. They count towards the reported
            <unk> rate in both cases.

    Returns:
        tuple: ``(data, label, max_len)`` where ``data`` is a 1-D object array
        whose elements are the per-document index lists, ``label`` is an integer
        array of class ids and ``max_len`` is the longest index list produced.

    Raises:
        ValueError: If a score line cannot be parsed as a float.
    """
    data = []
    label = [] # {pos: 0, neg: 1, neu: 2}
    num_hit, unk_hit, total = 0., 0., 0.
    max_len = 0

    # Context managers close both files even if a score line fails to parse.
    with codecs.open(text_path, 'r', 'utf-8') as f, \
         codecs.open(label_path, 'r', 'utf-8') as f_l:
        for line, score in zip(f, f_l):
            word_indices = []
            words = line.split()
            if skip_len > 0 and len(words) > skip_len:
                continue

            score = float(score.strip())
            if score < 3:
                label.append(1)
            elif score > 3:
                label.append(0)
            else:
                label.append(2)

            for word in words:
                # Same number test as the aspect-level reader.
                if is_number(word):
                    word_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    word_ind = vocab[word]
                    # Indices 0-2 are reserved, so the skip_top most frequent
                    # words occupy indices 3 .. skip_top + 2.
                    if not (skip_top > 0 and word_ind < skip_top + 3):
                        word_indices.append(word_ind)
                else:
                    if replace_non_vocab:
                        word_indices.append(vocab['<unk>'])
                    unk_hit += 1
                total += 1

            if len(word_indices) > max_len:
                max_len = len(word_indices)

            data.append(word_indices)

    # Guard the statistics against an empty corpus.
    denominator = total if total else 1.
    print('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' %(100*num_hit/denominator, 100*unk_hit/denominator))

    # The sequences have different lengths; recent NumPy refuses to build an
    # array from ragged nested lists implicitly, so fill an object array
    # element by element.
    data_array = np.empty(len(data), dtype=object)
    for position, indices in enumerate(data):
        data_array[position] = indices

    return data_array, np.array(label), max_len



In [0]:
def prepare_data_doc(vocab, domain, doc_path, skip_top=0, skip_len=0, replace_non_vocab=1):
    """Load the document-level corpus that matches ``domain``.

    The 'lt_14' domain uses the amazon_electronics files; every other domain
    uses the yelp14 files. The chosen text/label pair is handed to
    ``create_data`` together with the filtering options.

    Returns:
        tuple: ``(data, label, max_len)`` exactly as produced by ``create_data``.
    """
    if domain in ['lt_14']:
        file_names = ('amazon_electronics_text.txt', 'amazon_electronics_label.txt')
    else:
        file_names = ('yelp14_text.txt', 'yelp14_label.txt')

    text_path, score_path = [os.path.join(doc_path, name) for name in file_names]

    data, label, max_len = create_data(
        vocab, text_path, score_path, skip_top, skip_len, replace_non_vocab)
    return data, label, max_len


In [0]:
def prepare_data(domain, aspect_path, doc_path, vocab_size, maxlen=0):
    """Build the vocabulary and load both the aspect-level and document-level data.

    Note that ``maxlen`` is passed to ``create_vocab`` only; ``get_data_aspect``
    and ``prepare_data_doc`` are called with their default length settings.

    Returns:
        tuple: ``(train_x, train_y, train_aspect, test_x, test_y, test_aspect,
        vocab, overal_maxlen, overal_maxlen_aspect, pretrain_data,
        pretrain_label, pretrain_maxlen)``.
    """
    vocab = create_vocab(domain, aspect_path, doc_path, maxlen, vocab_size)

    # Aspect-level train/test splits plus their maximum sequence lengths.
    aspect_bundle = get_data_aspect(vocab, domain, aspect_path)
    (train_x, train_y, train_aspect,
     test_x, test_y, test_aspect,
     overal_maxlen, overal_maxlen_aspect) = aspect_bundle

    # Document-level corpus for the same domain.
    doc_bundle = prepare_data_doc(vocab, domain, doc_path)
    pretrain_data, pretrain_label, pretrain_maxlen = doc_bundle

    return (train_x, train_y, train_aspect,
            test_x, test_y, test_aspect,
            vocab, overal_maxlen, overal_maxlen_aspect,
            pretrain_data, pretrain_label, pretrain_maxlen)



### Sequence preprocessing (for model inputs - outputs)

In [0]:
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [0]:
# Build a 10000-word vocabulary for the 'lt_14' domain and load every split in one call.
(train_x, train_y, train_aspect,
 test_x, test_y, test_aspect,
 vocab, overal_maxlen, overal_maxlen_aspect,
 pretrain_data, pretrain_label, pretrain_maxlen) = prepare_data(
    'lt_14', aspect_path, doc_path, 10000)

Creating vocab ...
  3498349 total words, 39278 unique words
 keep the top 10000 words
Preparing dataset ...
  <num> hit rate: 0.99%, <unk> hit rate: 1.07%
Preparing dataset ...
  <num> hit rate: 1.18%, <unk> hit rate: 1.07%
 Overal_maxlen: 82
 Overal_maxlen_aspect:7 
  <num> hit rate: 1.04%, <unk> hit rate: 1.56%


In [0]:
train_x[0]

[4, 530, 8, 32, 653, 7, 1379, 551, 3, 209, 16, 56, 84, 10, 3, 42, 133, 356]

In [0]:
# NOTE: this cell rebinds train_x / train_y / train_aspect (and the test / pretrain
# arrays) to their processed versions, so re-run the data-loading cell before
# running it a second time.

# Bring every aspect-level sequence to a common length for mini-batch processing
train_aspect = sequence.pad_sequences(train_aspect, maxlen=overal_maxlen_aspect)
test_aspect = sequence.pad_sequences(test_aspect, maxlen=overal_maxlen_aspect)
train_x = sequence.pad_sequences(train_x, maxlen=overal_maxlen)
test_x = sequence.pad_sequences(test_x, maxlen=overal_maxlen)

# Document-level sequences use a fixed length of 300 instead of the corpus maximum
#maxlen_pretrain = np.max([len(d) for d in pretrain_data])
maxlen_pretrain = 300
pretrain_data = sequence.pad_sequences(pretrain_data, maxlen=maxlen_pretrain)

# One-hot encode the three sentiment classes
pretrain_label = to_categorical(pretrain_label, 3)
test_y = to_categorical(test_y, 3)
train_y = to_categorical(train_y, 3)

# Hold out the leading 20% of the training rows as a validation (dev) set
validation_ratio = 0.2
validation_size = int(len(train_x) * validation_ratio)
print ('Validation size: %s' % validation_size)

dev_x, train_x = train_x[:validation_size], train_x[validation_size:]
dev_y, train_y = train_y[:validation_size], train_y[validation_size:]
dev_aspect, train_aspect = train_aspect[:validation_size], train_aspect[validation_size:]

Validation size: 462


In [0]:
len(vocab)

10003

In [0]:
print(list(vocab.items())[:5])

[('<pad>', 0), ('<unk>', 1), ('<num>', 2), ('the', 3), ('i', 4)]


In [0]:
overal_maxlen

82

In [0]:
overal_maxlen_aspect

7

In [0]:
pretrain_data.shape # data from doc-level domain (input sequence)

(30000, 300)

In [0]:
pretrain_label.shape # data from doc-level domain (output labels)

(30000, 3)

In [0]:
pretrain_maxlen # max sequence length of input from doc-level domain

1016

In [0]:
train_x.shape # data from aspect-level domain (training set)

(1851, 82)

In [0]:
train_y.shape # data from aspect-level domain (training set)

(1851, 3)

In [0]:
dev_x.shape

(462, 82)

In [0]:
dev_y.shape

(462, 3)

In [0]:
test_x.shape # data from aspect-level domain (test set)

(638, 82)

In [0]:
test_y.shape # data from aspect-level domain (test set)

(638, 3)

In [0]:
 train_aspect.shape # aspect words (training set)

(1851, 7)

In [0]:
test_aspect.shape # aspect words (test set)

(638, 7)

### Store all preprocessed data

In [0]:
import _pickle as cPickle

In [0]:
def read_pickle(data_path, file_name):
    """Load and return the object pickled at ``data_path/file_name``.

    Args:
        data_path: Directory containing the pickle file.
        file_name: Name of the pickle file inside ``data_path``.

    Returns:
        The unpickled Python object.

    Raises:
        OSError: If the file cannot be opened.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files you created yourself.
    # The context manager closes the file even if unpickling fails.
    with open(os.path.join(data_path, file_name), 'rb') as f:
        return cPickle.load(f)

def save_pickle(data_path, file_name, data):
    """Pickle ``data`` to ``data_path/file_name`` and print the destination path.

    Args:
        data_path: Directory to write into.
        file_name: Name of the pickle file to create or overwrite.
        data: Any picklable Python object.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    target = os.path.join(data_path, file_name)
    # The context manager closes the file even if pickling fails.
    with open(target, 'wb') as f:
        cPickle.dump(data, f)
    # Report success only after the file has been written and closed.
    print(" file saved to: %s"%(target))

In [0]:
# Persist every preprocessed artifact next to the aspect-level data (aspect_path).
# Vocabulary (token -> index dictionary)
save_pickle(aspect_path, 'all_vocab.pkl', vocab)

# Padded sentence inputs and one-hot labels for the train / dev / test splits
save_pickle(aspect_path, 'train_x.pkl', train_x)
save_pickle(aspect_path, 'train_y.pkl', train_y)
save_pickle(aspect_path, 'dev_x.pkl', dev_x)
save_pickle(aspect_path, 'dev_y.pkl', dev_y)
save_pickle(aspect_path, 'test_x.pkl', test_x)
save_pickle(aspect_path, 'test_y.pkl', test_y)

# Padded aspect-term inputs for the same three splits
save_pickle(aspect_path, 'train_aspect.pkl', train_aspect)
save_pickle(aspect_path, 'dev_aspect.pkl', dev_aspect)
save_pickle(aspect_path, 'test_aspect.pkl', test_aspect)


# Document-level inputs and labels
save_pickle(aspect_path, 'pretrain_data.pkl', pretrain_data)
save_pickle(aspect_path, 'pretrain_label.pkl', pretrain_label)

 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/all_vocab.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/train_x.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/train_y.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/dev_x.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/dev_y.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/test_x.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/test_y.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/aspect_level-sentiment/aspect_level/train_aspect.pkl
 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/asp