In [None]:
 # Install the `tensorflow-gpu` package with pip through a shell escape (no version is pinned).
 !pip install tensorflow-gpu



In [None]:
from __future__ import division, print_function
from gensim import models
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import collections
import re
import string

Using TensorFlow backend.


In [None]:
# Read the tab-separated review file; header=None means the first line is treated as data.
data = pd.read_csv('abc/IMDB Dataset 2.tsv', sep='\t', header=None)

In [None]:
# Name the two loaded columns: the text column and the label column.
data.columns = ['Text', 'Label']

In [None]:
# Preview the first rows to sanity-check the load and the column names.
data.head()

Unnamed: 0,Text,Label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# One-hot encode the label: `pos` is 1 where Label == 1, `neg` is 1 where Label == 0.
# A label that is neither 0 nor 1 contributes no entry to either list.
binary_labels = [label for label in data.Label if label == 0 or label == 1]
pos = [1 if label == 1 else 0 for label in binary_labels]
neg = [1 - flag for flag in pos]

In [None]:
# Attach the one-hot label lists as two new columns.
data['Pos'], data['Neg'] = pos, neg

In [None]:
import nltk
# Fetch the NLTK resources this notebook relies on: the 'punkt' data
# (presumably needed by word_tokenize) and the 'stopwords' corpus.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Clean Data
# Character class matching every ASCII punctuation mark.  re.escape() is
# required: string.punctuation contains regex metacharacters, and without
# escaping the sequence `\]` is read as an escaped `]`, so the backslash
# itself would never be removed.
_PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']')


def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Args:
        text: the string to clean.

    Returns:
        A new string without ASCII punctuation; all other characters
        (letters, digits, whitespace) are left untouched.
    """
    return _PUNCT_RE.sub('', text)

# Strip punctuation from every review, then split each cleaned review into word tokens.
data['Text_Clean'] = data['Text'].apply(remove_punct)
from nltk import word_tokenize, WordNetLemmatizer
tokens = list(map(word_tokenize, data['Text_Clean']))
def lower_token(tokens):
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered
    
# Lower-case every review's token list, then load NLTK's English stop-word list.
lower_tokens = list(map(lower_token, tokens))
from nltk.corpus import stopwords
stoplist = stopwords.words('english')

def remove_stop_words(tokens):
    """Return the tokens that do not appear in the module-level ``stoplist``.

    Args:
        tokens: iterable of word strings (already lower-cased by the caller).

    Returns:
        A new list with the stop words dropped; order is preserved.
    """
    # Set membership is O(1).  Scanning the stop-word list once per token is
    # needlessly slow across millions of tokens; building the set once per
    # call is negligible by comparison and yields the same result.
    stop_words = set(stoplist)
    return [word for word in tokens if word not in stop_words]
# Drop stop words from every review, then keep both the re-joined text and the token lists.
filtered_words = list(map(remove_stop_words, lower_tokens))
result = list(map(' '.join, filtered_words))
data['Text_Final'] = result
data['tokens'] = filtered_words
# Narrow the frame to the cleaned text, its tokens and the label columns.
kept_columns = ['Text_Final', 'tokens', 'Label', 'Pos', 'Neg']
data = data[kept_columns]
data.head(4)

Unnamed: 0,Text_Final,tokens,Label,Pos,Neg
0,one reviewers mentioned watching 1 oz episode ...,"[one, reviewers, mentioned, watching, 1, oz, e...",1,1,0
1,wonderful little production br br filming tech...,"[wonderful, little, production, br, br, filmin...",1,1,0
2,thought wonderful way spend time hot summer we...,"[thought, wonderful, way, spend, time, hot, su...",1,1,0
3,basically theres family little boy jake thinks...,"[basically, theres, family, little, boy, jake,...",0,0,1


In [None]:
# Split Data into test and train
# Hold out 10% of the rows for testing; random_state is fixed so the split is repeatable.
data_train, data_test = train_test_split(data, test_size=0.10, random_state=42)

# Flatten the per-review token lists into one list and record each review's length.
all_training_words = []
training_sentence_lengths = []
for review_tokens in data_train["tokens"]:
    all_training_words.extend(review_tokens)
    training_sentence_lengths.append(len(review_tokens))

TRAINING_VOCAB = sorted(set(all_training_words))
train_word_count, train_vocab_size = len(all_training_words), len(TRAINING_VOCAB)
print("%s words total, with a vocabulary size of %s" % (train_word_count, train_vocab_size))
print("Max sentence length is %s" % max(training_sentence_lengths))

5521847 words total, with a vocabulary size of 170579
Max sentence length is 1449


In [None]:
# Same corpus statistics as for the training split, computed on the held-out rows.
all_test_words = []
test_sentence_lengths = []
for review_tokens in data_test["tokens"]:
    all_test_words.extend(review_tokens)
    test_sentence_lengths.append(len(review_tokens))

TEST_VOCAB = sorted(set(all_test_words))
test_word_count, test_vocab_size = len(all_test_words), len(TEST_VOCAB)
print("%s words total, with a vocabulary size of %s" % (test_word_count, test_vocab_size))
print("Max sentence length is %s" % max(test_sentence_lengths))

619451 words total, with a vocabulary size of 51872
Max sentence length is 594


In [None]:
# Load pre-trained word vectors through gensim's word2vec-format reader.
# Alternative vector file (commented out):
#word2vec_path = '/Users/susman/Desktop/glove_Reddit_200d.txt' #train Glove
word2vec_path = 'glove_wiki_300d.txt'  # file name suggests 300-dimensional GloVe vectors
# NOTE(review): the path ends in .txt but is read with binary=True — confirm the
# file really is stored in the binary word2vec format.
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True, encoding="ISO-8859-1")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Average the word vectors of one tokenised text.

    Args:
        tokens_list: sequence of word tokens.
        vector: mapping-like object supporting ``word in vector`` and ``vector[word]``.
        generate_missing: if true, a token absent from ``vector`` gets a fresh
            ``np.random.rand(k)`` vector; otherwise it gets a zero vector.
        k: length of the substitute vectors (and of the result for empty input).

    Returns:
        The element-wise mean of the per-token vectors, or ``np.zeros(k)`` when
        ``tokens_list`` is empty.
    """
    if not len(tokens_list):
        return np.zeros(k)

    # Factory for the stand-in vector of an unknown token.
    make_fallback = np.random.rand if generate_missing else np.zeros

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        else:
            word_vectors.append(make_fallback(k))

    return np.sum(word_vectors, axis=0) / len(word_vectors)

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Compute one averaged word vector per row of ``clean_comments``.

    Args:
        vectors: word-vector lookup handed to ``get_average_word2vec``.
        clean_comments: frame with a 'tokens' column of token lists.
        generate_missing: forwarded to ``get_average_word2vec``.

    Returns:
        A list with one averaged vector per row, in row order.
    """
    def embed(token_list):
        return get_average_word2vec(token_list, vectors, generate_missing=generate_missing)

    per_row_vectors = clean_comments['tokens'].apply(embed)
    return list(per_row_vectors)

In [None]:
# Average the word vectors of each training review into a single vector per review
# (tokens missing from the vector model count as zero vectors).
# NOTE(review): training_embeddings is not referenced by any other cell visible here.
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=False)

In [None]:
MAX_SEQUENCE_LENGTH = 500  # length passed as maxlen to pad_sequences; tune as needed
EMBEDDING_DIM = 300  # width of each row of the embedding matrix

In [None]:
# Tokenize and Pad sequences
# Fit the Keras tokenizer on the training text only, then encode that same text
# as sequences of integer word indices.
train_texts = data_train["Text_Final"].tolist()
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 170564 unique tokens.


In [None]:
# Bring every training sequence to exactly MAX_SEQUENCE_LENGTH entries.
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Build a (vocabulary + 1) x EMBEDDING_DIM matrix whose row `index` holds the vector
# of the word the tokenizer mapped to `index`; rows never assigned stay zero.
vocab_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((vocab_rows, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in word2vec:
        row = word2vec[word]
    else:
        # Words without a pre-trained vector get a random one.
        row = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = row
print(train_embedding_weights.shape)

(170565, 300)


In [None]:
# Encode the held-out reviews with the tokenizer fitted on the training text, then
# pad them to the same MAX_SEQUENCE_LENGTH as the training sequences.
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
# Define RNN-LSTM
from keras.layers import Bidirectional
from keras import regularizers

label_names = ['Pos', 'Neg']  # target columns, in the order used for the training targets
y_train = data_train[label_names].values  # target array with one column per name in label_names
x_train = train_cnn_data  # padded integer sequences built from the training text
y_tr = y_train  # alias that is later passed to model.fit
def recurrent_nn(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build, compile and summarise a bidirectional-LSTM text classifier.

    Args:
        embeddings: pre-trained embedding matrix. Accepted for interface
            compatibility but currently unused: the embedding layer is created
            without initial weights and is trained with the rest of the model.
        max_sequence_length: number of token ids in each input sequence.
        num_words: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        labels_index: number of output units (one per label).

    Returns:
        The compiled Keras ``Model``; its summary is printed as a side effect.
    """
    token_ids = Input(shape=(max_sequence_length,), dtype='int32')

    # Trainable embedding table; no pre-trained weights are loaded into it.
    embed = Embedding(input_dim=num_words,
                      output_dim=embedding_dim,
                      input_length=max_sequence_length,
                      trainable=True)
    embedded = embed(token_ids)

    # l2(0.) is a zero-strength penalty, so regularisation is effectively off.
    recurrent = Bidirectional(
        LSTM(200,
             kernel_initializer='glorot_uniform',
             kernel_regularizer=regularizers.l2(0.)))
    encoded = recurrent(embedded)

    hidden = Dense(128,
                   activation='relu',
                   kernel_initializer='glorot_uniform',
                   kernel_regularizer=regularizers.l2(0.))(encoded)
    hidden = Dropout(0.1)(hidden)
    outputs = Dense(labels_index, activation='sigmoid')(hidden)

    classifier = Model(inputs=token_ids, outputs=outputs)
    classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['acc'])
    classifier.summary()
    return classifier


# Instantiate the network: the embedding table gets one row per tokenizer index
# plus one extra row, and one output unit per entry of label_names.
model = recurrent_nn(embeddings=train_embedding_weights,
                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                     num_words=len(train_word_index) + 1,
                     embedding_dim=EMBEDDING_DIM,
                     labels_index=len(label_names))

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 500)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 500, 300)          51169500  
_________________________________________________________________
bidirectional_2 (Bidirection (None, 400)               801600    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               51328     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
Total params: 52,022,686
Trainable params: 52,022,686
Non-trainable params: 0
_______________________________________________

In [None]:
# Sanity-check the array shapes before training: training inputs and targets
# should have the same number of rows, and train/test inputs the same width.
print(x_train.shape, y_tr.shape)
print(test_cnn_data.shape)

(45000, 500) (45000, 2)
(5000, 500)


In [None]:
# Train RNN-LSTM
import keras
import os
# NOTE(review): GPU selection through these variables normally has to happen
# before TensorFlow initialises CUDA; the model is already built at this point,
# so confirm they still take effect here.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
num_epochs = 80
batch_size = 128
# Stop once the monitored validation loss has not improved for 15 epochs and
# roll the model back to its best epoch; otherwise the model keeps the weights
# from 15 epochs past the optimum and is evaluated with those.
earlystopper = keras.callbacks.EarlyStopping(patience=15, verbose=1, restore_best_weights=True)
# Use the public keras.callbacks path; the nested keras.callbacks.callbacks
# module is not available in every Keras release.
history = keras.callbacks.History()

# 30% of the training rows are held out for validation.
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.3, shuffle=True, batch_size=batch_size,
                 callbacks=[earlystopper, history])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 31499 samples, validate on 13501 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 00016: early stopping


In [None]:
# Test RNN-LSTM
# Score the padded test sequences; each row of `predictions` holds one value per output unit.
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)



In [None]:
from sklearn import metrics

# Turn the two-column model output into 0/1 class labels.  The output columns
# follow label_names (['Pos', 'Neg']), so a winning column 0 means Label 1 and
# a winning column 1 means Label 0.
prediction_labels = 1 - np.argmax(predictions, axis=1)

y_test = data_test.Label
y_pred_class = prediction_labels
# save confusion matrix and slice into four pieces
# labels=[0, 1] pins the 2x2 layout even if one class is absent from either vector.
confusion = metrics.confusion_matrix(y_test, y_pred_class, labels=[0, 1])
print(confusion)
#[row, column] -> [true label, predicted label]
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

In [None]:
# Sensitivity (recall for label 1): TP / (TP + FN), computed by hand and then
# printed next to sklearn's recall_score for comparison.
sensitivity = TP / float(FN + TP)

print(sensitivity)
print(metrics.recall_score(y_test, y_pred_class))

In [None]:
# Specificity: TN / (TN + FP), the share of label-0 reviews that were predicted as label 0.
specificity = TN / (TN + FP)

print(specificity)