In [None]:
import re
import codecs
import pandas as pd
import operator

In [None]:
# Show full cell contents instead of truncating long strings. `None` is the
# supported "no limit" value; the old `-1` sentinel is rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

In [None]:
def create_vocab(data_file, maxlen=0, vocab_size=0):
    """Build a token-to-index vocabulary from the ``review`` column of a CSV.

    Reads ``../data_clean/<data_file>.csv``, counts whitespace-separated
    tokens over all reviews and assigns indices in descending frequency
    order, starting after the reserved tokens ``<pad>`` (0), ``<unk>`` (1)
    and ``<num>`` (2).

    Side effect: writes ``../vocab`` (UTF-8), one ``token<TAB>count`` line per
    entry in index order. Reserved tokens are written with a count of 0.

    Parameters
    ----------
    data_file : str
        Base name (without ``.csv``) of the file under ``../data_clean/``.
    maxlen : int, optional
        Currently unused; accepted so existing callers keep working.
    vocab_size : int, optional
        Currently unused; accepted so existing callers keep working.

    Returns
    -------
    dict
        Mapping of token to integer index, with contiguous indices from 0.
    """
    path_to_csv = '../data_clean/' + data_file + '.csv'
    df = pd.read_csv(path_to_csv)

    word_freqs = {}
    for row in df['review']:
        # Empty CSV cells come back as NaN (not str), which has no .split().
        if not isinstance(row, str):
            continue
        for w in row.split():
            word_freqs[w] = word_freqs.get(w, 0) + 1

    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)

    vocab = {'<pad>': 0, '<unk>': 1, '<num>': 2}
    num_reserved = len(vocab)
    index = num_reserved
    for word, _ in sorted_word_freqs:
        # A corpus token that is literally "<pad>"/"<unk>"/"<num>" must not
        # overwrite the reserved index, or the index space would get a hole.
        if word in vocab:
            continue
        vocab[word] = index
        index += 1

    # `with` guarantees the file is closed even if a write fails.
    with codecs.open('../vocab', mode='w', encoding='utf8') as vocab_file:
        for word, index in sorted(vocab.items(), key=operator.itemgetter(1)):
            count = 0 if index < num_reserved else word_freqs[word]
            vocab_file.write(word + '\t' + str(count) + '\n')

    return vocab

In [None]:
# Optional sign, one or more digits, then an optional decimal point and
# optional fractional digits (e.g. "42", "-3.14", "+7."). No exponents and no
# leading-dot forms such as ".5". Raw string so `\.` is not an invalid escape.
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')

def is_number(token):
    """Return True if ``token`` is a plain integer or decimal literal."""
    return bool(num_regex.match(token))

In [None]:
def read_dataset(data_file, vocab, maxlen):
    """Convert each review in a CSV into a list of vocabulary indices.

    Reads the ``review`` column of ``../data_clean/<data_file>.csv`` and maps
    every whitespace-separated token as follows: numeric tokens (per
    ``is_number``) become ``vocab['<num>']``, known tokens become their own
    index, and everything else becomes ``vocab['<unk>']``.

    Parameters
    ----------
    data_file : str
        Base name (without ``.csv``) of the file under ``../data_clean/``.
    vocab : dict
        Token-to-index mapping; must contain ``<num>`` and ``<unk>`` whenever
        a numeric or unknown token is encountered.
    maxlen : int
        Currently unused; accepted so existing callers keep working.

    Returns
    -------
    tuple
        ``(data_x, maxlen_x)``: one index list per CSV row, in row order, and
        the length of the longest list (0 if there are no rows).
    """
    path_to_csv = '../data_clean/' + data_file + '.csv'
    df = pd.read_csv(path_to_csv)

    data_x = []
    maxlen_x = 0
    for row in df['review']:
        # Empty CSV cells come back as NaN (not str). Emit an empty sequence
        # for them so data_x stays aligned with the rows of the CSV.
        words = row.strip().split() if isinstance(row, str) else []

        indices = []
        for word in words:
            if is_number(word):
                indices.append(vocab['<num>'])
            elif word in vocab:
                indices.append(vocab[word])
            else:
                indices.append(vocab['<unk>'])

        data_x.append(indices)
        maxlen_x = max(maxlen_x, len(indices))

    return data_x, maxlen_x

In [None]:
def get_data(data_file, vocab_size=0, maxlen=0):
    """Build the vocabulary for ``data_file`` and index its reviews.

    Parameters
    ----------
    data_file : str
        Base name (without ``.csv``) of the file under ``../data_clean/``.
    vocab_size : int, optional
        Forwarded to ``create_vocab``.
    maxlen : int, optional
        Forwarded to ``create_vocab`` and ``read_dataset``.

    Returns
    -------
    tuple
        ``(vocab, train_x, train_maxlen)``: the token-to-index mapping, the
        per-review index lists, and the length of the longest review.
    """
    # Pass both limits through instead of silently dropping vocab_size.
    vocab = create_vocab(data_file, maxlen=maxlen, vocab_size=vocab_size)
    train_x, train_maxlen = read_dataset(data_file, vocab, maxlen)
    return vocab, train_x, train_maxlen