In [3]:
import csv

# Names of the raw Yelp CSV files to read, in order.
split = ['train', 'valid', 'test']

# Collect every CSV row from data/yelp/<name>.csv into one flat list.
Reviews = []
for source in split:
    csv_path = 'data/yelp/' + source + '.csv'
    with open(csv_path, 'r') as handle:
        rows = csv.reader(handle, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        Reviews.extend(rows)

In [4]:
# Tokenization and Lemmatization
# Pad punctuation with a trailing space, tokenize, then lemmatize each token
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk
import re

# Build `Data`: one [lemmatized text, helpfulness, token count] record per review.
# Punctuation is kept in the text; each mark just gets a space inserted after it
# before tokenizing.
wordnet_lemmatizer = WordNetLemmatizer()
punctuations = list( string.punctuation )

# Non-greedy on purpose: the greedy form `<a.*/a>` spans from the first `<a`
# to the LAST `/a>` and therefore deletes all review text lying between two
# separate hyperlinks. Compiled once instead of once per review.
hyperlink_re = re.compile( r'<a.*?/a>' )

Data = []
for review in Reviews:
    # Column 0 is decoded as UTF-8 review text; column 1 is kept as `helpfulness`.
    raw_text = review[ 0 ].decode( 'utf-8' )
    helpfulness = review[ 1 ]

    # Replace each hyperlink anchor in the review with a single space.
    text = hyperlink_re.sub( ' ', raw_text )
    # Insert a space after every punctuation character.
    for ch in punctuations:
        text = text.replace( ch, ch + ' ' )

    tokens = [ word for sent in sent_tokenize( text ) for word in word_tokenize( sent ) ]
    tagged_tokens = nltk.pos_tag( tokens )

    stemmed = []
    for token, tag in tagged_tokens:
        if tag.startswith( 'VB' ):
            # Verb tags (VB, VBD, VBG, ...) are lemmatized as verbs.
            stemmed.append( wordnet_lemmatizer.lemmatize( token, 'v' ) )
        else:
            # Everything else uses the lemmatizer's default part of speech.
            stemmed.append( wordnet_lemmatizer.lemmatize( token ) )

    # The token count is stored as a string alongside the joined text.
    record = [ ' '.join( stemmed ), helpfulness, str( len( stemmed ) ) ]
    Data.append( record )

In [5]:
import numpy as np

# Fixed seed so that Restart & Run All reproduces the same train/valid/test
# split instead of silently overwriting the CSVs with a different one each run.
# A private RandomState is used so numpy's global RNG state is left untouched.
SPLIT_SEED = 1234

data_size = len( Data )
np.random.RandomState( SPLIT_SEED ).shuffle( Data )

# Token counts were stored as strings; turn them back into ints for sorting.
Data = [ [ record[0], record[1], int( record[2] ) ] for record in Data ]

# Split into train, valid, test - 60%, 20%, 20%
# (test takes whatever remains after the two truncated sizes).
train_size = int( data_size * 0.6 )
valid_size = int( data_size * 0.2 )

train = Data[ :train_size ]
valid = Data[ train_size:train_size + valid_size ]
test = Data[ train_size + valid_size: ]

# Sort each partition by token count, longest review first.
for partition in ( train, valid, test ):
    partition.sort( key=lambda x: x[2], reverse=True )

# Save each partition to data/<name>.csv
partitions = { 'train': train, 'valid': valid, 'test': test }
split = [ 'train', 'valid', 'test' ]
for data_src in split:
    with open( 'data/' + data_src + '.csv', 'w+' ) as f:
        Writer = csv.writer( f, delimiter=',', quoting=csv.QUOTE_MINIMAL )

        for text, label, length in partitions[ data_src ]:
            # Every field is written as UTF-8 encoded text; the count goes back to str first.
            row = [ text, label, str( length ) ]
            Writer.writerow( [ item.encode( 'utf-8' ) for item in row ] )

## GloVe Word Vector Configuration

In [6]:
# Read the 200-dimensional pretrained GloVe file into a dict: word -> list of floats.
Word2vec_dic = {}
with open('word2vecs/glove.6B/glove.6B.200d.txt', 'r') as glove_file:
    for raw_line in glove_file:
        # First whitespace-separated field is the word, the rest are its components.
        fields = raw_line.split()
        Word2vec_dic[fields[0]] = [float(component) for component in fields[1:]]

In [7]:
# NOTE(review): wildcard import. `Corpus` below is not defined or imported in
# any other visible cell, so it presumably comes from here; the same goes for
# `torch`, which the next cell uses without importing it -- confirm before
# narrowing this to explicit names.
from data import *

# './data' is the directory the train/valid/test CSVs were written to above.
# How Corpus consumes it is not visible from this notebook -- TODO confirm.
path = './data'
corpus = Corpus( path )

In [8]:
# Build the embedding matrix: one row of size `emsize` per dictionary entry.
# Words with a pretrained vector in Word2vec_dic use it; the others get a
# uniform random row in [-initrange, initrange] and their index is recorded.
cuda = True
emsize = 200
initrange = 0.1
ntokens = len( corpus.dictionary )

# Allocated without initialisation; every row is written in the loop below.
emb_matrix = torch.FloatTensor( ntokens, emsize )
word_idx_list = []
for idx in range( ntokens ):
    word = corpus.dictionary.idx2word[ idx ]
    # Explicit lookup instead of a bare `except:` so that only a missing word
    # triggers the random fallback; a malformed vector or any other error now
    # surfaces instead of being silently replaced by noise.
    vec = Word2vec_dic.get( word )
    if vec is not None:
        emb_matrix[ idx ] = torch.FloatTensor( vec )
    else:
        word_idx_list.append( idx )
        # Fill the row in place (it is a view into emb_matrix).
        emb_matrix[ idx ].uniform_( -initrange, initrange )

# Indices of the words that had no pretrained vector (per the original note,
# these are the embeddings that need to be updated during training).
# Fall back to a CPU tensor when CUDA is requested but not available, so this
# preprocessing cell still runs on a CPU-only machine.
if cuda and torch.cuda.is_available():
    word_idx_list = torch.cuda.LongTensor( word_idx_list )
else:
    word_idx_list = torch.LongTensor( word_idx_list )

torch.save( emb_matrix, 'data/emb_matrix.pt' )
torch.save( word_idx_list, 'data/word_idx_list.pt' )