# this tutorial implment sentiment analysis using keras library 

##  we have implemented ANN, CNN and LSTM NN

###  prepare your dataset and divide into training data and testing data

we have two files in which one file contain the positive review and another file contain negetive review

In [1]:
import io
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import os.path
import numpy as np
import _pickle as pic
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'      # this is used when you want to run on cpu not gpu

lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

In [2]:
positive_file = 'data/full_positive.txt'
negative_file = 'data/full_negative.txt'

we convert each vocablory in these file to their word2vec

In [3]:
def get_vocab(file_name):
    lexicon = []
    with io.open(file_name, 'r', encoding='cp437') as f:
        contents = f.readlines()
        for l in contents:
            all_words = tokenizer.tokenize(l)
            #all_words = word_tokenize(l)
            lexicon += list(all_words)
    lexicon = [(lemmatizer.lemmatize(i)).encode('utf8') for i in lexicon]
    lexicon = sorted(set(lexicon))
    if not os.path.isfile(file_name+"_vocab"):
        with open(file_name+"_vocab", "w") as f:
            for word in lexicon:
                f.write(word)
                f.write("\n")

    return lexicon

def get_unknown_vocab(fname, words):
    f2 = io.open(fname, 'rb').read()
    vocab = f2.splitlines()
    known_vocab = []
    unknown_vocab = []
    for word in words:
        if word in vocab:
            known_vocab.append(word)
        else:
            unknown_vocab.append(word)
    return known_vocab, unknown_vocab

def get_wiki_glove_vector(fname, words):
    vectors = {}
    W = []
    vocab = {}
    ivocab = {}
    f1 = open(fname, 'rb').read()
    for line in f1.splitlines():
        temp = line.split()
        vectors[temp[0]] = map(float, temp[1:])

    vocab_size = len(words)

    for i in range(len(words)):
        W.append(vectors[words[i]])
        vocab[words[i]] = i
        ivocab[i] = words[i]
    W = np.array(W)
    # normalize each word vector to unit variance
    #print W[0:2], W[-1:-3]
    W_norm = np.zeros(W.shape)
    d = (np.sum(W ** 2, 1) ** (0.5))
    W_norm = (W.T / d).T

    return W_norm, vocab, ivocab


def get_unknown_vec(words, word2vec, vocab, ivocab):
    old_size = len(vocab)
    word2vec = list(word2vec)
    for i in range(len(words)):
        word2vec.append(np.random.uniform(-0.25, 0.25, len(word2vec[0])))
        vocab[words[i]] = i+old_size
        ivocab[i+old_size] = words[i]
    return word2vec, vocab, ivocab

glove_file = 'D:/nlp/dataset/glove.6B/glove.6B.50d.txt'
glove_vocab_file = 'D:/nlp/dataset/glove.6B/vocab.txt'
word2vec_file = "data/word2vec.p"

if not os.path.isfile(word2vec_file):
    # get vocablory of all file
    vocab = get_vocab(positive_file)
    vocab += get_vocab(negative_file)
    print("vocab length: ", len(vocab))
    # get the glove vector
    known_vocab, unknown_vocab = get_unknown_vocab(glove_vocab_file, vocab)
    print("known_vocab length: ", len(known_vocab))
    print("unknown_vocab length: ", len(unknown_vocab))
    Word2vec, vocab, ivocab = get_wiki_glove_vector(glove_file, known_vocab)
    Word2vec, vocab, ivocab = get_unknown_vec(unknown_vocab,Word2vec, vocab, ivocab)
    pic.dump([Word2vec, vocab, ivocab ], open(word2vec_file, 'wb'))
    print("word2vec created")

we generate sentence vector from sentence in files 
    we have two way of making sentence vector 
        first way to take average of all word2vec in sentence
        second way to make 2D setence vector

In [4]:
def get_avg_sen2vec(fname, word2vec_fname, output):
    sen2vec = []
    with open(word2vec_fname, 'rb') as f:
        word2vec, vocab, ivocab = pic.load(f)

    with io.open(fname, 'r', encoding='cp437') as f:
        contents = f.readlines()
        for l in contents:
            temp = np.zeros(len(word2vec[0]), dtype=float)
            all_words = tokenizer.tokenize(l)
            lexicon = list(all_words)
            lexicon = [(lemmatizer.lemmatize(i)).encode('utf8') for i in lexicon]
            for word in lexicon:
                temp += word2vec[vocab[word]]
            sen2vec.append([temp, output])
    return sen2vec

def get_2D_sen2vec(fname, word2vec_fname, output, m):
    sen2vec = []
    with open(word2vec_fname, 'rb') as f:
        word2vec, vocab, ivocab = pic.load(f)

    with io.open(fname, 'rb', encoding='cp437') as f:
        contents = f.readlines()
        for l in contents:
            temp = np.zeros([m, len(word2vec[0])], dtype=float)
            count = 0
            all_words = tokenizer.tokenize(l)
            lexicon = list(all_words)
            lexicon = [(lemmatizer.lemmatize(i)).encode('utf8') for i in lexicon]
            for word in lexicon:
                temp[count] = word2vec[vocab[word]]
                count += 1
            sen2vec.append([temp, output])
    return sen2vec

sen2vec_file = 'data/sen2vec.p'

if sen2vec_file == "data/sen2vec.p":
    if not os.path.isfile(sen2vec_file):
        # get sentence vector from file
        sen2vec =  get_avg_sen2vec(positive_file, word2vec_file, [1,0])
        sen2vec += get_avg_sen2vec(negative_file, word2vec_file, [0,1])
        random.shuffle(sen2vec)
        pic.dump(sen2vec, open(sen2vec_file, 'wb'))
        print("sen2vec created")

if sen2vec_file == "data/sen2vec2D.p":
    if not os.path.isfile(sen2vec_file):
        a = dp.get_max_senLen(positive_file)
        b = dp.get_max_senLen(negative_file)
        m = max(a,b)
        sen2vec = sv.get_2D_sen2vec(positive_file, word2vec_file, [1, 0], m)
        sen2vec += sv.get_2D_sen2vec(negative_file, word2vec_file, [0, 1], m)
        random.shuffle(sen2vec)
        pic.dump(sen2vec, open(sen2vec_file, 'wb'))
        print("sen2vec created")

# load sentence vector for training and testing
f = io.open(sen2vec_file, 'rb')
sen2vec = pic.load(f, encoding='latin1')
f.close()
print(len(sen2vec))

10662


Now we make traning and testing dataset from sen2vec

In [5]:
train_data = np.array(sen2vec[0:9000])
test_data = np.array(sen2vec[9000:])
train_x = np.array(list(train_data[:, 0]))
train_y = np.array(list(train_data[:, 1]))
test_x = np.array(list(test_data[:, 0]))
test_y = np.array(list(test_data[:, 1]))
print("train data length: ", len(train_data))
print("test data length: ", len(test_data))
print("structure of train_x", train_x[0])
print("structure of train_y", train_y[0])

train data length:  9000
test data length:  1662
structure of train_x [ -7.10645047e-02   3.56886871e-01   1.51247953e+00  -4.96324830e-01
   3.60876297e-01   3.45026011e-01  -4.64292122e-01   1.58518457e-01
  -5.40560352e-01   5.34866654e-01  -9.02793525e-02  -9.32967806e-02
   3.30208930e-01   1.92810272e-02   7.58572645e-02  -1.18964720e+00
   5.21441553e-01   5.29360106e-01  -5.79081010e-01   1.99314626e-01
   6.28924915e-01  -3.50990890e-01   2.27730109e-02   4.23964732e-03
   6.44080553e-01  -1.70729801e-01  -2.47870729e-01  -1.04310770e+00
   4.08848631e-01  -2.72390474e-01  -9.93259704e-01   1.68478856e+00
  -6.85944405e-02  -4.99940288e-02  -1.26533149e-01   4.08618825e-01
   5.77886497e-01   6.92536474e-01   2.12528441e-01  -1.19557787e-01
  -1.30026804e+00  -3.76894191e-01   6.87990264e-01  -7.97877541e-01
  -3.17431023e-01   3.17487425e-01   6.04146042e-01  -8.87979661e-01
  -5.44144623e-01  -1.87603824e+00  -4.82828782e-02  -3.02600951e-01
   7.08893668e-01   3.51561637e+0

Make feed forward neural network using keras library and train all our data

In [6]:
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(units = 100, kernel_initializer = 'uniform', activation = 'relu', input_dim = 100))

# Adding the second hidden layer
classifier.add(Dense(units = 100, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 2, kernel_initializer = 'uniform', activation = 'softmax'))

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(train_x, train_y, batch_size = 10, epochs = 10)

Using TensorFlow backend.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x91022ee828>

we trained our ANN.
Now time to predict the test batch to check our accuracy

In [7]:
y_pred = classifier.predict(test_x)

print("predcition: ", y_pred)
print("real value: ", test_y)

correct = np.equal(np.argmax(y_pred, 1), np.argmax(test_y, 1))
accuracy = np.mean(correct)
print("Accuracy", accuracy)

predcition:  [[ 0.0075157   0.99248427]
 [ 0.39475092  0.60524905]
 [ 0.63080257  0.36919749]
 ..., 
 [ 0.99581784  0.00418222]
 [ 0.38143739  0.6185627 ]
 [ 0.4506152   0.54938477]]
real value:  [[0 1]
 [1 0]
 [0 1]
 ..., 
 [1 0]
 [0 1]
 [1 0]]
Accuracy 0.729843561974


## Using RNN LSTM Model

we prepare data for rnn lstm model which use 2D sentence vector

In [11]:
f = io.open('data/sen2vec2D.p', 'rb')
sen2vec = pic.load(f, encoding='latin1')
f.close()
print(len(sen2vec))

train_data = np.array(sen2vec[0:9000])
test_data = np.array(sen2vec[9000:])
train_x = np.array(list(train_data[:, 0]))
train_y = np.array(list(train_data[:, 1]))
test_x = np.array(list(test_data[:, 0]))
test_y = np.array(list(test_data[:, 1]))
print("train data length: ", len(train_data))
print("test data length: ", len(test_data))
print("structure of train_x", train_x[0])
print("structure of train_y", train_y[0])

10662
train data length:  9000
test data length:  1662
structure of train_x [[ 0.02354535 -0.03094732  0.00125232 ..., -0.18856544 -0.04128594
  -0.04813067]
 [ 0.02429378 -0.09221743 -0.01113799 ..., -0.15952884 -0.13504654
  -0.06183854]
 [ 0.07611648 -0.04697192 -0.02948809 ..., -0.16526055 -0.07888167
   0.08149928]
 ..., 
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]
 [ 0.          0.          0.         ...,  0.          0.          0.        ]]
structure of train_y [1 0]


now make lstm model which process this data

In [12]:

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM

regressor = Sequential()

regressor.add(LSTM(units=200, activation='sigmoid', input_shape=(None, 100)))

regressor.add(Dense(units = 2, activation = 'softmax'))

regressor.compile(optimizer='adam', loss='mean_squared_error')

regressor.fit(train_x, train_y, batch_size=100, epochs=2)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x913f0f3898>

now predict the test data using trained lstm model

In [13]:
# Predicting the Test set results
y_pred = regressor.predict(test_x)

print("predcition: ", y_pred)
print("real value: ", test_y)

correct = np.equal(np.argmax(y_pred, 1), np.argmax(test_y, 1))
accuracy = np.mean(correct)
print("Accuracy", accuracy)


predcition:  [[ 0.54677361  0.45322648]
 [ 0.54677367  0.45322639]
 [ 0.54677361  0.45322648]
 ..., 
 [ 0.54677355  0.45322645]
 [ 0.54677361  0.45322648]
 [ 0.54677403  0.45322594]]
real value:  [[0 1]
 [1 0]
 [1 0]
 ..., 
 [0 1]
 [0 1]
 [1 0]]
Accuracy 0.51504211793


## Using CNN model

we prepare data for cnn model 
    we use 2D sentence vector for cnn model

In [14]:
train_x = np.reshape(train_x, [-1, 53, 100, 1])
test_x = np.reshape(test_x, [-1, 53, 100, 1])
print(train_x.shape, test_x.shape)

(9000, 53, 100, 1) (1662, 53, 100, 1)


now make a CNN model using keras

In [15]:

# Importing the Keras libraries and packages
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense

# Initialising the CNN
classifier = Sequential()

# Step 1 - Convolution
classifier.add(Conv2D(40, (5, 100), input_shape = (53, 100, 1), activation = 'relu'))

# Step 2 - Pooling
classifier.add(MaxPooling2D(pool_size = (2, 1)))

# Step 3 - Flattening
classifier.add(Flatten())

# Step 4 - Full connection
classifier.add(Dense(units = 128, activation = 'relu'))
classifier.add(Dense(units = 2, activation = 'softmax'))

# Compiling the CNN
classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(train_x, train_y, batch_size = 100, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x913f2f74a8>

now predict the test batch and check the accuracy of model

In [16]:
# Predicting the Test set results
y_pred = classifier.predict(test_x)

print("predcition: ", y_pred)
print("real value: ", test_y)

correct = np.equal(np.argmax(y_pred, 1), np.argmax(test_y, 1))
accuracy = np.mean(correct)
print("Accuracy", accuracy)


predcition:  [[  5.76599044e-13   1.00000000e+00]
 [  9.75252151e-01   2.47477926e-02]
 [  2.22199097e-01   7.77800918e-01]
 ..., 
 [  8.40556920e-02   9.15944278e-01]
 [  3.29902917e-02   9.67009723e-01]
 [  9.99893546e-01   1.06414343e-04]]
real value:  [[0 1]
 [1 0]
 [1 0]
 ..., 
 [0 1]
 [0 1]
 [1 0]]
Accuracy 0.700962695548
