In [0]:
import os
import sys
import codecs
import operator
import numpy as np
import re
from time import time

In [0]:
# Mount Google Drive at /drive; the dataset path configured below lives under
# this mount point. This cell only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/drive', force_remount=True)

Mounted at /drive


In [0]:
# Directory that holds the '<domain>_text.txt' and '<domain>_label.txt' files
# read by the preprocessing functions below.
# Alternative relative location, kept for reference:
#data_path = 'data/doc_level-sentiment/doc_level'
data_path=r"/drive/My Drive/Deep Learing Course/practice-5-data/doc_level-sentiment/doc_level/"

## Vocabulary Indexing

### Note: 

Pay attention to how you represent your sequences as input to the RNN/LSTM model.
Because the model expects fixed-length vectors, shorter sequences must be padded with "0".
Consequently, the vocabulary index must reserve "0" for padding and never assign it to a real word.

### Function to create vocabulary index

### Returns:

A Python dictionary that maps each word to its integer index.

In [0]:
# Matches tokens that are purely numeric: optional sign, digits, optional
# decimal part. A raw string keeps the `\.` escape valid for the regex engine
# without triggering Python's invalid-escape-sequence warning.
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')

def create_vocab(domain, data_path, maxlen=0, vocab_size=0):
    """Build a word -> index vocabulary from '<domain>_text.txt'.

    Parameters
    ----------
    domain : str
        File prefix; the corpus is read from ``<data_path>/<domain>_text.txt``.
    data_path : str
        Directory containing the corpus file.
    maxlen : int, optional
        If > 0, lines with more than ``maxlen`` whitespace-separated tokens
        are ignored when counting word frequencies.
    vocab_size : int, optional
        If > 0, only the ``vocab_size`` most frequent words are kept.

    Returns
    -------
    dict
        Maps ``'<pad>'`` to 0, ``'<unk>'`` to 1, ``'<num>'`` to 2, and every
        kept word to an index starting at 3, in descending frequency order
        (ties keep first-seen order). Numeric tokens are never added as words.
    """
    print('Creating vocab ...')

    f = os.path.join(data_path,'%s_text.txt'%(domain))

    total_words = 0
    word_freqs = {}

    # The context manager guarantees the file is closed even if decoding fails.
    with codecs.open(f, 'r', 'utf-8') as fin:
        for line in fin:
            words = line.split()
            if maxlen > 0 and len(words) > maxlen:
                continue

            for w in words:
                # Numeric tokens are represented by '<num>' and not counted.
                if num_regex.match(w):
                    continue
                word_freqs[w] = word_freqs.get(w, 0) + 1
                total_words += 1

    unique_words = len(word_freqs)
    print ('  %i total words, %i unique words' % (total_words, unique_words))

    # Stable sort: words with equal frequency keep their first-seen order.
    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
    if vocab_size > 0:
        sorted_word_freqs = sorted_word_freqs[:vocab_size]

    # Indices 0-2 are reserved; real words start right after them.
    vocab = {'<pad>':0, '<unk>':1, '<num>':2}
    for index, (word, _) in enumerate(sorted_word_freqs, start=len(vocab)):
        vocab[word] = index

    if vocab_size > 0:
        print (' keep the top %i words' % vocab_size)

    return vocab

## Sequence preprocessing

### Function to transform word sequence -> integer sequence

### Note:

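The raw data set has 5 class labels (review scores). Here, we only consider 3 sentiment classes: scores above 3 are positive (label 0), scores below 3 are negative (label 1), and a score of exactly 3 is neutral (label 2).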

### Returns:

The integer sequences of the texts, the corresponding integer labels, and maxlen (the length of the longest sequence).

In [0]:
def create_data(vocab, text_path, label_path, domain, skip_top, skip_len, replace_non_vocab):
    """Encode each text line as a list of vocabulary indices with a class label.

    Parameters
    ----------
    vocab : dict
        Word -> index mapping; must contain the keys ``'<num>'`` and ``'<unk>'``.
    text_path : str
        UTF-8 file with one document per line.
    label_path : str
        UTF-8 file with one numeric score per line, aligned with ``text_path``.
    domain : str
        Name printed alongside the class counts.
    skip_top : int
        If > 0, in-vocabulary words whose index is below ``skip_top + 3`` are
        dropped from the sequences.
    skip_len : int
        If > 0, documents with more than ``skip_len`` tokens are skipped
        entirely (both the text and its label).
    replace_non_vocab : bool or int
        If truthy, out-of-vocabulary words are encoded as ``'<unk>'``;
        otherwise they are dropped.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray, int)
        The encoded sequences, the labels (0 = positive, 1 = negative,
        2 = neutral) and the length of the longest encoded sequence.
        When the sequences have different lengths the first array is a 1-D
        object array of Python lists; when they all share one length it is
        whatever ``np.array`` builds from the nested lists.
    """
    data = []
    label = [] # {pos: 0, neg: 1, neu: 2}

    num_hit, unk_hit, total = 0., 0., 0.
    pos_count, neg_count, neu_count = 0, 0, 0
    max_len = 0

    # Context managers close both files even if a score fails to parse.
    with codecs.open(text_path, 'r', 'utf-8') as f, \
            codecs.open(label_path, 'r', 'utf-8') as f_l:
        for line, score in zip(f, f_l):
            words = line.split()
            # Skipping happens before the label is recorded, so data and
            # label stay aligned.
            if skip_len > 0 and len(words) > skip_len:
                continue

            score = float(score.strip())
            if score < 3:
                neg_count += 1
                label.append(1)
            elif score > 3:
                pos_count += 1
                label.append(0)
            else:
                neu_count += 1
                label.append(2)

            word_indices = []
            for word in words:
                total += 1
                if num_regex.match(word):
                    word_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    word_ind = vocab[word]
                    # Drop words whose index falls below skip_top + 3.
                    if not (skip_top > 0 and word_ind < skip_top + 3):
                        word_indices.append(word_ind)
                else:
                    unk_hit += 1
                    if replace_non_vocab:
                        word_indices.append(vocab['<unk>'])

            max_len = max(max_len, len(word_indices))
            data.append(word_indices)

    # Guard against empty input: report 0% instead of dividing by zero.
    denom = total if total else 1.
    print('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (100*num_hit/denom, 100*unk_hit/denom))

    print (domain)
    print( 'pos count: ', pos_count )
    print( 'neg count: ', neg_count )
    print( 'neu count: ', neu_count )

    # NumPy >= 1.24 refuses to build an array from ragged nested lists unless
    # dtype=object is given explicitly; only request it when actually ragged
    # so same-length input keeps producing the array it always did.
    if len({len(seq) for seq in data}) > 1:
        data_array = np.array(data, dtype=object)
    else:
        data_array = np.array(data)

    return data_array, np.array(label), max_len


### Main Preprocessing Function

### Call : 

- create_vocab()
- create_data()

### Return

- vocabulary index
- integer sequence (model input)
- label (model output)
- maximum sequence length -> as parameter for RNN / LSTM

In [0]:
def prepare_data(domain, data_path, vocab_size, skip_top=0, skip_len=0, replace_non_vocab=1):
    """Run the full preprocessing pipeline for one domain.

    Builds the vocabulary from '<domain>_text.txt', then encodes that text
    file and its '<domain>_label.txt' companion into integer sequences and
    labels.

    Returns a tuple ``(vocab, data, label, max_len)``: the vocabulary
    produced by ``create_vocab`` followed by the three values produced by
    ``create_data``.
    """
    print(domain)

    supported_domains = ['amazon_electronics', 'yelp14']
    assert domain in supported_domains

    # skip_len doubles as the maximum document length for vocabulary counting.
    word_index = create_vocab(domain, data_path, skip_len, vocab_size)

    reviews_file = os.path.join(data_path, '%s_text.txt' % (domain))
    scores_file = os.path.join(data_path, '%s_label.txt' % (domain))

    sequences, labels, longest = create_data(
        word_index,
        reviews_file,
        scores_file,
        domain,
        skip_top,
        skip_len,
        replace_non_vocab,
    )

    return word_index, sequences, labels, longest

## Run Preprocessing

In [0]:
# choose domain data to train
domain_name = 'amazon_electronics'

In [0]:
# Build the vocabulary (keeping the 10000 most frequent words) and encode every
# review of the chosen domain as a list of word indices with a class label.
vocab, data_list, label_list, overall_maxlen = prepare_data(domain_name, data_path, 10000)

amazon_electronics
Creating vocab ...
  3440972 total words, 39122 unique words
 keep the top 10000 words
  <num> hit rate: 1.04%, <unk> hit rate: 1.56%
amazon_electronics
pos count:  10000
neg count:  10000
neu count:  10000


In [0]:
# Length of the longest encoded sequence returned by prepare_data.
overall_maxlen

1016

In [0]:
# 1-D array with one entry per review; each entry is a variable-length list of indices.
data_list.shape

(30000,)

In [0]:
print(len(data_list[2]))

846


In [0]:
label_list.shape

(30000,)

In [0]:
print(label_list[0])

0


### Example of how to access the stored vocabulary indexing

In [0]:
print(list(vocab.items())[:5])

[('<pad>', 0), ('<unk>', 1), ('<num>', 2), ('the', 3), ('i', 4)]


In [0]:
vocab['love']

301

In [0]:
# Reverse lookup table: integer index -> word.
idx_words = {index: word for word, index in vocab.items()}

In [0]:
print(list(idx_words.items())[:5])

[(0, '<pad>'), (1, '<unk>'), (2, '<num>'), (3, 'the'), (4, 'i')]


### Storing all preprocessing data

Here, we store the vocabulary, the reverse index, the encoded sequences, and the labels in pickle format.

In [0]:
import _pickle as cPickle

In [0]:
def read_pickle(data_path, file_name):
    """Load and return the object pickled at ``<data_path>/<file_name>``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were produced by a trusted source (e.g. ``save_pickle`` in this notebook).
    """
    # The context manager closes the file even if unpickling raises.
    with open(os.path.join(data_path, file_name), 'rb') as f:
        return cPickle.load(f)

def save_pickle(data_path, file_name, data):
    """Pickle ``data`` to ``<data_path>/<file_name>`` and print the target path.

    The confirmation message is printed only after the dump has succeeded.
    """
    target = os.path.join(data_path, file_name)
    # The context manager flushes and closes the file even if dumping raises.
    with open(target, 'wb') as f:
        cPickle.dump(data, f)
    print(" file saved to: %s"%(target))

In [0]:
save_pickle(data_path, 'words_idx.pkl', vocab)

 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/doc_level-sentiment/doc_level/words_idx.pkl


In [0]:
save_pickle(data_path, 'idx_words.pkl', idx_words)

 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/doc_level-sentiment/doc_level/idx_words.pkl


In [0]:
save_pickle(data_path, 'data.pkl', data_list)

 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/doc_level-sentiment/doc_level/data.pkl


In [0]:
save_pickle(data_path, 'label.pkl', label_list)

 file saved to: /drive/My Drive/Deep Learing Course/practice-5-data/doc_level-sentiment/doc_level/label.pkl
