In [35]:
# import libraries & set seed
from gensim.models import Doc2Vec, doc2vec, Word2Vec
import string
import numpy as np
import sys
from io import StringIO
import os
from keras import callbacks
from keras.layers import Dense, LSTM, Dropout
from keras.models import Sequential
from keras.models import model_from_json, load_model

np.random.seed(223)

In [36]:
# Set parameters

# This is the number of words expected in each sentence.
timesteps = 200  

# The number of dimensions of each word when it is transformed to
# a vector/tensor
dimensions = 300

batch_size = 64
epochs_number = 4

In [37]:
# This is a readymade model from Google. It contains vector 
# representation for around 3 million words.
def load_w2v():
    fname = "/home/test/word2vec/GoogleNews-vectors-negative300.bin"
    w2vmodel = Doc2Vec.load_word2vec_format(fname,binary=True)
    return w2vmodel

w2vmodel = load_w2v()


In [38]:
# Play around with above word2vec model

print '--- (Tyson + Cricket - Sachin) ... top answer is Boxing... --- '
print w2vmodel.most_similar(['tyson','cricket'],['sachin'])

print '\n ---(Reptile + Dog - Doberman).. top answer Alligator...'
print w2vmodel.most_similar(['reptile','dog'],['doberman'])

print '\n ---- players closest to zidane  ----'
print w2vmodel.most_similar('zidane',topn=5)

print '\n --- similarity between nouns ---'
print w2vmodel.similarity('woman','man')
print w2vmodel.similarity('king','queen')
#os.chdir("/home/test/Sentiment-Analysis")
#w2vmodel.save('w2vmodel.h5')

--- (Tyson + Cricket - Sachin) ... top answer is Boxing... --- 
[(u'boxing', 0.5501577258110046), (u'Razor_Ruddock', 0.4727470278739929), (u'Richie_Woodhall', 0.4665651023387909), (u'Krence', 0.4611070156097412), (u'cricketing', 0.4610179662704468), (u'Anthony_Crolla', 0.4601202607154846), (u'rugby', 0.4596092700958252), (u'Robbie_Peden', 0.4539097845554352), (u'Anthony_Mundine', 0.45366987586021423), (u'Audley_Harrison', 0.45190009474754333)]

 ---(Reptile + Dog - Doberman).. top answer Alligator...
[(u'alligator', 0.6314287185668945), (u'gator', 0.6146361827850342), (u'reptiles', 0.579647958278656), (u'animal', 0.5719218850135803), (u'critter', 0.5674688816070557), (u'alligators', 0.5446991324424744), (u'snake', 0.5382720828056335), (u'pet', 0.529512882232666), (u'gators', 0.5268255472183228), (u'Burmese_python', 0.5229257345199585)]

 ---- players closest to zidane  ----
[(u'raul', 0.7298481464385986), (u'ronaldinho', 0.7168955206871033), (u'iniesta', 0.7099334597587585), (u'robben'

In [39]:
# Break sentence into words
def tokenize(sentence):
    sent_arr = []
    sent = sentence.translate(None,string.punctuation).strip()
    #sent = sentence.translate(None,string.punctuation).lower().strip()
    for word in sent.split(' '):
        sent_arr.append(word)
    print 'sent_arr :' , sent_arr
    return sent_arr

In [40]:
# Find the vector representation for every word in the passed 
# numpy array of sentences, using word2vec model from google
def data_vector(arr_sent):
    main = []
    ii = 0
    for ind , sent in enumerate(arr_sent):
        #print 'array of sentences: ' , arr_sent
        #print 'index: ' , ind
                
        tokens = np.array_str(sent)
        #print 'tokens: ' , tokens
        new = tokens.translate(None,string.punctuation).strip()
        
        #filteredtokens = np.zeros((0),dtype=int)
        if new == '':
            filteredtokens = np.repeat('0',timesteps)
        else:
            filteredtokens = np.array(filter(lambda x : x in w2vmodel.vocab,new.split(' ')))
        diff = timesteps - len(filteredtokens)
        if diff <= 0:
            diff = 0
        filteredtokens = np.append(filteredtokens,
                                   ['0' for jj in range(diff)])
                          
        #print len(filteredtokens) , filteredtokens.shape
        #print '0. ' , filteredtokens    

        subset = filteredtokens[:timesteps]

        temp = []
        for seq , word in enumerate(subset):
            #print seq , word
            vect = w2vmodel[word]
            #print vect.shape
            temp.append(vect)
        if ii == 999999:
            print ii , 'new ' , new
            #print 'temp ' , temp
            #print 'tokens ' , filteredtokens
        ii += 1
        main.append(temp)
        #print '1. main shape ' , np.array(main).shape
    #print '2 ' , type(temp) , type(main)
    fil = np.array(main)

    
    return fil


In [41]:
# Few more experimentation with word2vec model

#print type(w2vmodel.syn0norm)
#print w2vmodel.syn0norm.shape
print w2vmodel.syn0[2999999,:10]
#print w2vmodel.syn0norm[0,:10]
print w2vmodel['snowcapped_Caucasus'][:10]
print w2vmodel.vocab['br']
print w2vmodel.index2word[2999999]
#w2vmodel.syn0_lockf
words = ['the','king']

#w2vmodel.infer_vector(words)
#print text

[ 0.04516602 -0.04516602 -0.00393677  0.04882812  0.08837891 -0.12353516
 -0.03369141 -0.14257812 -0.12109375  0.20117188]
[ 0.04516602 -0.04516602 -0.00393677  0.04882812  0.08837891 -0.12353516
 -0.03369141 -0.14257812 -0.12109375  0.20117188]
Vocab(count:2891204, index:108796)
snowcapped_Caucasus


In [42]:
# Set path for training & test data

data_path = "/home/test/Sentiment-Analysis/aclImdb"
os.chdir(data_path)

train_data_path = '/'.join((data_path,"train/"))
test_data_path = '/'.join((data_path,"test/"))
%pwd

u'/home/test/Sentiment-Analysis/aclImdb'

In [43]:
# Get sentences from files
class DataIterator:
    def __init__(self, data_path, batch_size = 1000):
        pos_files = os.listdir(''.join((data_path,'pos/')))
        #print pos_files
        neg_files = os.listdir(''.join((data_path,'neg/')))
        self.pos_iter = iter(pos_files)
        self.neg_iter = iter(neg_files)
        self.batchSize = batch_size
        self.data_path = data_path

    def get_next(self):
        vectors = []
        values = []
        while (len(vectors) < self.batchSize):

            file = next (self.pos_iter, None)
            
            if file == None:
                break
            #print 'file ' , file
            data_path = ''.join((self.data_path,'/pos/'))
            vec = np.loadtxt(data_path + file,dtype='str',delimiter=' ')
            vectors.append(vec)
            values.append([1,0])

            file = next(self.neg_iter, None)
            if file == None:
                break
            data_path = ''.join((self.data_path,'/neg/'))
            vec = np.loadtxt(data_path + file,dtype='str',delimiter=' ')
            vectors.append(vec)
            values.append([0,1])

        return np.array(vectors), np.array(values)

In [44]:
# Read sentences into train & test arrays

train_iterator = DataIterator(train_data_path, sys.maxint)
test_iterator = DataIterator(test_data_path, sys.maxint)
train_X, train_Y = train_iterator.get_next()
test_X, test_Y = test_iterator.get_next()
#print train_Y[0:15]
#print train_X[:1]
all_X = np.append(train_X , test_X)
all_Y = np.vstack((train_Y , test_Y))
#print all_Y[0:15]
#print all_X[4:5]
    

In [45]:
# Build the train & test numpy arrays
# X is vector representation of words... Y is class variable

max_limit = 50000
limit_train = 40000
        
#temp_train = all_X[:limit_train]
temp_train = all_X[5000:50000]
train_vect_x = data_vector(temp_train)
#print all_X[0:1]
    
#temp_test = all_X[limit_train:max_limit]
temp_test = all_X[:5000]
test_vect_x = data_vector(temp_test)
    
#train_Y = all_Y[:limit_train] 
train_Y = all_Y[5000:50000]
    
#test_Y = all_Y[limit_train:max_limit]
test_Y = all_Y[:5000]
    
print train_vect_x.shape , test_vect_x.shape
print train_Y.shape , test_Y.shape
   

MemoryError: 

In [None]:
print train_vect_x.shape , test_vect_x.shape
print train_Y.shape , test_Y.shape

In [17]:
# Build the RNN-LSTM model
# Each epoch takes approximately 10-11 mins on 16 GB machine

os.chdir("/home/test/Sentiment-Analysis")
dims = timesteps
model = Sequential()
model.add(LSTM(dims,  input_shape=(timesteps, dimensions),  return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(2, input_dim=dims, activation='softmax'))
model.compile(loss="categorical_crossentropy", optimizer='rmsprop')
fname = 'weights/keras-lstm.h5'
#model.load_weights(fname)
cbks = [callbacks.ModelCheckpoint(filepath=fname, monitor='val_loss', save_best_only=True),
        callbacks.EarlyStopping(monitor='val_loss', patience=3)]

model.fit(train_vect_x, train_Y,
            batch_size=batch_size, callbacks=cbks, nb_epoch=epochs_number,
            show_accuracy=True, validation_split=0.25, shuffle=True)

`model.compile(optimizer, loss, metrics=["accuracy"])`


Train on 33750 samples, validate on 11250 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7fd68be0f550>

In [24]:
# Evaluate test data to know the loss
acc = 0
loss = model.evaluate(test_vect_x, test_Y, batch_size, show_accuracy=True)
print('Test loss / test accuracy = {:.4f} / {:.4f}'.format(loss, acc))


`model.compile(optimizer, loss, metrics=["accuracy"])`


Test loss / test accuracy = 0.3098 / 0.0000


In [34]:
# Few test sentences of my own. Build an array

sentences = u'''
Is Zlatan going to show consistency through out the season?
I wonder whether Assasin creed was expected to be such a dismal movie. The game is really popular among kids. The movie was supposed to impress.
This match was very pathetic.
The book is aweful. I was mad at the author.
I hate that person and am very disappointed in him.
I'm from Belgium and therefore my English writing is rather poor, sorry for that...<br /><br />This is one of those little known movies that plays only once on TV and than seems to vanishes into thin air. I was browsing through my old VHS Video collection and came across this title, I looked it up and it had an IMDb score of more than 7/10, that's pretty decent.<br /><br />I must admit that it's a very well put together movie and that's why I'm puzzled. This is the only film made by this director...? How come he didn't make lots of films after this rather good one...? Someone with so much potential should be forced to make another movie, ha ha ;-) <br /><br />Anyway, I really would like to see that he pulls his act together and makes another good movie like this one, please.....?
The book is outstanding. what a novel! I was extremely impressed by the author.
wow! I loved the match.
Zlatan is in form this season.
Rooney is struggling for form.
There are certain things that fosters lethargy in the team. We should address them before we look at other things.
There are certain things that fosters lethargy in the team. We should address them before we look at other things. The hope is to see some progress.
Food was not very good.
'''.strip()


arr = []
for sent in sentences.split('\n'):
    c = StringIO(sent)
    #c = StringIO("a,b\nc,d")
#    print type(c) , c
    #print type(sentences)
    vec = np.loadtxt(c,dtype='str',delimiter=' ')
    arr.append(vec)
  
real_data = data_vector(np.asarray(arr))
print 'This will give a shape (x,y,z)...'
print 'x is number of rows '
print 'y is number of words in each sentence'
print 'z is number of dimensions in each word vector'
print real_data.shape


This will give a shape (x,y,z)...
x is number of rows 
y is number of words in each sentence
z is number of dimensions in each word vector
(13, 200, 300)


In [28]:
# Predict the class variable for test data or given array of sentences

test_limit = len(test_Y)

# Test data
pred = model.predict(test_vect_x[:test_limit],batch_size)

# Your own sentences
#pred = model.predict(real_data,batch_size)

#for ii , val in enumerate(pred):
#    print ii+2 , '.  ' , val


In [29]:
# This determines accuracy of model on test data

passed, failed = 0 , 0 
#test_limit = 10000
for ii in range(test_limit):
    val = pred[ii]
    act = test_Y[ii]
    #if val[0] >= val[1]:
    if val[0] >= 0.50:
        est = np.array([1, 0])
    else:
        est = np.array([0, 1])
#    print 'est, act ' , est.shape, act.item(1)
    if np.array_equal(est,act):
        passed += 1
    else:
        failed += 1
print "passed , failed " , passed , ' : ' ,failed
print 'accuracy = ' , float(passed) / (passed + failed)
print 'End'

passed , failed  4381  :  619
accuracy =  0.8762
End


In [3]:
# Save/reload earlier models to/from disk

os.chdir("/home/test/Sentiment-Analysis")
model = load_model('model_epoch45.h5')

#print model.get_config()
#print model.to_json() 
#print model.summary, 
#model_4_json = model_epoch_4.to_json()
#model2 = model_from_json(model_4_json)

#model.save('model_epoch46.h5')

#print model , model_epoch_5 ,  model_epoch_3, model_epoch_4
#model_temp = load_model('model_epoch45.h5')

#del model2


In [52]:
print " That's it ! "

 That's it ! 
