This script produces 1) word-index matrices based on GloVec for LSTM, 2) tf-idf vectorizers.

In [1]:
import numpy as np
import pandas as pd
import nltk
import time
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# read in processed data
train = pd.read_csv('../data/trainpp.csv')
test = pd.read_csv('../data/testpp.csv')

### top words

In [3]:
words = [w for line in train.reviewText for w in line.split()]
vocab = nltk.FreqDist(words)
print 'number of unique words in training data: %d' % len(vocab.keys())

number of unique words in training data: 64836


In [4]:
# top n words

ntop=10000
topwords = [p[0] for p in list(vocab.most_common(ntop))]

### GloVec word embedding

In [5]:
# read in Global Vectors: https://nlp.stanford.edu/projects/glove/
# using top words only
word_vec_dict = {}
with open('../data/glove.6B.100d.txt', 'rb') as f:
    for line in f:
        vec = line.strip().split()
        if vec[0] in topwords:
            word_vec_dict[vec[0]] = np.array(vec[1:], dtype=np.float64)
        
# word-index map
word2index={}
index2word={}
i=1
for word in sorted(word_vec_dict.keys()):
    word2index[word]=i
    index2word[i]=word
    i+=1
    
print 'number of words considered: %d' % len(word_vec_dict)

number of words considered: 9959


In [6]:
# transform input text to word index

def text2index(texts, maxlen):
    res = np.zeros((len(texts), maxlen))
    for i in range( len(texts) ):
        text = texts[i].split()
        k=0
        for w in text:
            if w in word2index.keys():
                res[i,k] = word2index[w]
                k+=1
    return res

In [7]:
# max text length
maxlen=max(map(lambda x: len(x.split()), train.reviewText)+map(lambda x: len(x.split()), test.reviewText))
print 'max length of review text: %d' % maxlen

# transform input text to word index
t0=time.time()
train_x = text2index(train.reviewText, maxlen)
print 'time to transform training set: %d min' % np.floor( (time.time()-t0)/60 )

t0=time.time()
test_x = text2index(test.reviewText, maxlen)
print 'time to transform test set: %d min' % np.floor( (time.time()-t0)/60 )

max length of review text: 1429
time to transform training set: 11 min
time to transform test set: 11 min


In [8]:
# remove columns with all zero
maxlen = np.argwhere(np.mean(np.concatenate((train_x, test_x))==0, axis=0)==1)[0][0]
trainr_x = train_x[:,:maxlen]
testr_x = test_x[:,:maxlen]

In [9]:
# save
np.save('../data/trainIndex', trainr_x)
np.save('../data/testIndex', testr_x)

### tf-idf vectorizer

* bag-of-words

In [5]:
tfidf1 = TfidfVectorizer(max_features = ntop)
train_features1 = tfidf1.fit_transform(train['reviewText'])
print 'feature size from bag-of-words: ',train_features1.shape

feature size from bag-of-words:  (25000, 10000)


* 2-gram

In [6]:
tfidf2 = TfidfVectorizer(max_features = 2*ntop, ngram_range=(1,2))
train_features2 = tfidf2.fit_transform(train['reviewText'])
print 'feature size from 2-gram: ',train_features2.shape

feature size from 2-gram:  (25000, 20000)


In [7]:
# save
with open('../data/tfidf.pkl', 'wb') as f:
    pickle.dump([tfidf1, tfidf2], f)