In [2]:
from textloader import Loader
from preprocessing.utils import remove_empty_docs
import numpy as np

## Datasets : Amazon Review and IMDB

Load the Amazon product review dataset

In [3]:
# Load the 'train' split of the Amazon reviews through the project's Loader
# helper and print its shape.
train_df = Loader.load_amazon_reviews('train')
print(train_df.shape)

(3600000, 2)


The dataset is large and very diverse, so we work with a random subset of it.

In [4]:
# Number of reviews to draw from the full Amazon training frame.
Sample_size = 200000
# A fixed random_state makes the sample identical on every run.
dataset = train_df.sample(n=Sample_size, random_state=42)
# Show how many sampled reviews fall into each sentiment class.
dataset.sentiment.value_counts()

1    100020
0     99980
Name: sentiment, dtype: int64

In [5]:
# Pull the review texts and their sentiment labels out of the sample as arrays.
corpus_amazon = dataset['review'].values
target_amazon = dataset['sentiment'].values
# The shapes are printed before filtering, so they describe the unfiltered sample.
print(corpus_amazon.shape, target_amazon.shape)
# remove_empty_docs is a project helper; presumably it drops empty reviews
# together with their labels -- TODO confirm against preprocessing.utils.
corpus_amazon, target_amazon = remove_empty_docs(corpus_amazon, target_amazon)

(200000,) (200000,)


Loading the IMDB dataset

In [6]:
# Load both IMDB splits; the test split is only consumed later, by the
# IMDB evaluation cells.
train_df_imdb = Loader.load_imdb_data('train')
test_df_imdb = Loader.load_imdb_data('test')
# Review texts and sentiment labels of the IMDB training split.
corpus_imdb = train_df_imdb['review'].values
target_imdb = train_df_imdb['sentiment'].values

Combine IMDB and Amazon corpus 

In [7]:
# Stack the IMDB reviews (first) and the Amazon reviews (second) into one
# corpus, with the labels concatenated in the same order.
corpus = np.concatenate((corpus_imdb , corpus_amazon))
target = np.concatenate((target_imdb , target_amazon))
# Report the number of documents in the combined corpus.
print("LEN COUPUS=",len(corpus))

LEN COUPUS= 225000


### Prepare data for Skip-Gram Model Training

In [8]:
from nltk import WordPunctTokenizer
from gensim.models import Word2Vec
import pandas as pd
from gensim.models import KeyedVectors

In [9]:
# Tokenizer shared by every tokenisation step in this notebook.
wpt = WordPunctTokenizer()
# Lower-case each document, then split it into tokens.
tokenized_corpus = [wpt.tokenize(doc.lower()) for doc in corpus]

In [10]:
#May take 2-3 minutes to run
w2v_model = Word2Vec(tokenized_corpus,
                     sg=1, #FOR SKIP-GRAM
                     vector_size = 50,
                     window = 5,
                     min_count=5)

In [11]:
# Write the trained vectors to disk in word2vec text format, then read them
# back as a KeyedVectors object.
w2v_model.wv.save_word2vec_format(fname = 'word2vec.txt')
word_vectors = KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)
# index_to_key lists the vocabulary, so its length is the vocabulary size.
vocab_size = len(word_vectors.index_to_key)
print("Vocabulory Size:", vocab_size)

Vocabulory Size: 54667


In [12]:
# Sanity-check the embeddings: for each probe word keep only the words (not the
# similarity scores) of the five entries returned by most_similar, then show
# one probe word per row.
similar_words = {
    term: [neighbour for neighbour, _score in word_vectors.most_similar([term], topn=5)]
    for term in ['broken','damaged','awesome','useful','good','easy','violent', 'romantic', 'nasty', 'unfortunate',
                 'predictable', 'hilarious', 'fascinating', 'boring','confused', 'sensitive',
                 'imaginative','senseless', 'bland','disappointing']
}
pd.DataFrame(similar_words).T

Unnamed: 0,0,1,2,3,4
broken,cracked,crumbled,dented,damaged,chipped
damaged,dented,defective,broken,scratched,openned
awesome,awsome,amazing,fantastic,excelent,phenominal
useful,usefull,helpful,doable,valuable,practical
good,great,decent,ok,nice,bad
easy,difficult,simple,quick,dificult,cinch
violent,brutal,sadistic,cruel,disturbing,misogynistic
romantic,screwball,romance,comedy,fairytale,quirky
nasty,gross,filthy,icky,freaky,slashing
unfortunate,regrettable,obvious,unforgivable,loathed,weakness


## Training Sentiment Analyser on the Amazon Review Dataset

In [22]:
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
from cnn_docmodel import DocClassifier

In [23]:
# Number of Amazon reviews used for training; everything after this offset is
# held out for testing.
AMAZON_TRAIN_SIZE = 185000

# remove_empty_docs may have shrunk the corpus, so make sure the held-out
# slice cannot silently end up empty.
assert len(corpus_amazon) > AMAZON_TRAIN_SIZE, "Amazon corpus is too small for the requested train split"

train_corpus_amazon = corpus_amazon[:AMAZON_TRAIN_SIZE]
train_target_amazon = target_amazon[:AMAZON_TRAIN_SIZE]

test_corpus_amazon = corpus_amazon[AMAZON_TRAIN_SIZE:]
test_target_amazon = target_amazon[AMAZON_TRAIN_SIZE:]

print("Train Size = {} , test Size = {}".format(len(train_corpus_amazon)
                                                , len(test_corpus_amazon)))

Train Size = 185000 , test Size = 15000


In [24]:
# Tokenise both Amazon splits the same way the Word2Vec corpus was tokenised:
# lower-case first, then split with the shared WordPunctTokenizer.
tokenized_train_corpus = [wpt.tokenize(doc.lower()) for doc in train_corpus_amazon]
tokenized_test_corpus = [wpt.tokenize(doc.lower()) for doc in test_corpus_amazon] 

We zero-pad the documents so that they all have the same length.
Out-of-vocabulary (OOV) words are mapped to the index vocab_size+1.

In [25]:
def getWordIndex(word):
    """Return the embedding-matrix row used for ``word``.

    Index 0 is reserved for padding, so a known word's position in
    ``word_vectors`` is shifted up by one. Every word that is missing from
    the vocabulary shares the single out-of-vocabulary index
    ``vocab_size + 1``.
    """
    try:
        return word_vectors.get_index(word) + 1
    except KeyError:
        # Only a missing key means "out of vocabulary". A bare except would
        # also hide unrelated failures (e.g. an undefined word_vectors or a
        # keyboard interrupt) behind the OOV index.
        return vocab_size + 1

In [26]:
# Replace every training token with its embedding index.
corpus_to_seq = [ [getWordIndex(token) for token in tokens] for tokens in tokenized_train_corpus]
# Use the mean training-document length (in tokens, truncated to an int) as
# the fixed sequence length for all later padding calls.
MAX_TR_SEQ_LEN = int(np.mean([len(seq) for seq in corpus_to_seq]))
# Shorter documents get trailing zeros; longer ones are cut off at the end.
corpus_to_seq = pad_sequences(corpus_to_seq,MAX_TR_SEQ_LEN, padding='post',truncating='post')

In [27]:
# Inspect one encoded training document; trailing zeros are padding.
corpus_to_seq[1]

array([5002,   94,  645,    1,   79,   24,    2, 4195,  961,    3,    2,
       5002,   11,   94,  645,   15,   28, 4239,    1,  101,    3,    2,
         24,  394, 2315, 3751,   35,   18,   27,    7, 7510,    9,    7,
        842, 2597, 7393,    1,   20,    2,  182,    6,  516,   44, 4186,
          1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

In [28]:
# Encode the held-out Amazon reviews with the same index mapping, then pad or
# truncate them to the sequence length derived from the training split.
test_corpus_amazon_seq = [ [getWordIndex(token) for token in tokens] for tokens in tokenized_test_corpus]
test_corpus_amazon_seq = pad_sequences(test_corpus_amazon_seq,MAX_TR_SEQ_LEN, padding='post',truncating='post')

Create CNN Document Model

In [44]:
# DocClassifier is a project class (cnn_docmodel). The first argument is the
# embedding vocabulary size: the Word2Vec vocabulary plus two extra rows.
classifier = DocClassifier(vocab_size+2, # index 0 = padding token, index vocab_size+1 = OOV
                           embedding_dim = 50,
                           dropout_rate = 0.3,
                           training = True)        
# Call the model once on a dummy batch of 50 token ids; presumably this builds
# the layers so that summary() and set_weights() work -- TODO confirm.
classifier(np.array([np.arange(50)]))
classifier.summary()

Model: "sentiment_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cnn_document_model (Documen  multiple                 2764231   
 tEncoder)                                                       
                                                                 
 dropout_7 (Dropout)         multiple                  0         
                                                                 
 dense_7 (Dense)             multiple                  257       
                                                                 
Total params: 2,764,488
Trainable params: 2,764,488
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Layer-by-layer summary of the nested document encoder.
classifier.doc_model.summary()

Model: "cnn_document_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     multiple                  2733450   
                                                                 
 conv1d_9 (Conv1D)           multiple                  2525      
                                                                 
 conv1d_10 (Conv1D)          multiple                  3775      
                                                                 
 conv1d_11 (Conv1D)          multiple                  5025      
                                                                 
 dropout_6 (Dropout)         multiple                  0 (unused)
                                                                 
 dense_6 (Dense)             multiple                  19456     
                                                                 
 global_max_pooling1d_3 (Glo  multiple          

Initialize Embeddings

In [46]:
# Embedding matrix laid out to match the indices produced by getWordIndex:
#   first row  - all zeros, for the padding index 0
#   middle     - the trained Word2Vec vectors, in vocabulary order
#   last row   - the mean of all word vectors, used for the OOV index
embeddings = np.vstack([
    np.zeros((1, 50)),
    word_vectors.vectors,
    word_vectors.vectors.mean(axis=0, keepdims=True),
])

In [47]:
# Load the Word2Vec-based matrix into the classifier's embedding layer.
classifier.doc_model.embedding.set_weights([embeddings])

### Train and Validate SA Model

In [48]:
# Reduction.NONE leaves the loss unreduced; the training loops average it
# themselves with tf.reduce_mean.
cross_entropy = tf.keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
#mse = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.NONE)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
# Running averages of the training loss and accuracy.
loss_metric = tf.keras.metrics.Mean()
acc = tf.keras.metrics.BinaryAccuracy()

# Running averages of the held-out loss and accuracy.
test_loss = tf.keras.metrics.Mean()
test_acc = tf.keras.metrics.BinaryAccuracy()

In [49]:
num_epochs = 2

# Both datasets are loop-invariant, so they are built once rather than once
# per epoch. reshuffle_each_iteration=True still gives every epoch a new
# training order.
train_dataset = tf.data.Dataset.from_tensor_slices((corpus_to_seq, train_target_amazon))
train_dataset = train_dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=True).batch(64)

# The held-out split is only scored, so it is not shuffled. The reported loss
# is an average of per-batch means and the last batch can be smaller, so a
# fixed batch composition avoids run-to-run drift from the batching itself.
test_dataset_iterator = tf.data.Dataset.from_tensor_slices((test_corpus_amazon_seq, test_target_amazon))
test_dataset_iterator = test_dataset_iterator.batch(64)

for epoch in range(num_epochs):
    # Start every epoch with fresh running averages.
    # reset_state() replaces the deprecated reset_states().
    loss_metric.reset_state()
    acc.reset_state()
    test_loss.reset_state()
    test_acc.reset_state()

    # ---- one optimisation pass over the Amazon training split ----
    for step, (documents_batch, target_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            pred = classifier(documents_batch)
            # cross_entropy is unreduced (Reduction.NONE); average it here.
            loss = tf.reduce_mean(cross_entropy(target_batch, pred))
        gradients = tape.gradient(loss, classifier.trainable_weights)
        optimizer.apply_gradients(zip(gradients, classifier.trainable_weights))
        loss_metric(loss)
        acc.update_state(target_batch, pred)
        if step % 100==0:
            print("epoch %d step %d Loss=%.4f Acc=%.4f" % (epoch,
                                                  step, 
                                                  loss_metric.result(),
                                                  acc.result()))

    # ---- score the held-out Amazon split after each epoch ----
    # NOTE(review): the model was constructed with training=True; if that keeps
    # dropout active in this forward pass the validation numbers are noisy --
    # confirm against DocClassifier.
    for test_step, (test_docs, test_targets) in enumerate(test_dataset_iterator):
        test_pred = classifier(test_docs)
        testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
        test_loss(testloss)
        test_acc.update_state(test_targets, test_pred)
    print("Validation Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))


epoch 0 step 0 Loss=0.7005 Acc=0.5000
epoch 0 step 100 Loss=0.5983 Acc=0.6767
epoch 0 step 200 Loss=0.4844 Acc=0.7576
epoch 0 step 300 Loss=0.4321 Acc=0.7904
epoch 0 step 400 Loss=0.3976 Acc=0.8118
epoch 0 step 500 Loss=0.3758 Acc=0.8253
epoch 0 step 600 Loss=0.3600 Acc=0.8352
epoch 0 step 700 Loss=0.3485 Acc=0.8421
epoch 0 step 800 Loss=0.3369 Acc=0.8484
epoch 0 step 900 Loss=0.3294 Acc=0.8530
epoch 0 step 1000 Loss=0.3231 Acc=0.8567
epoch 0 step 1100 Loss=0.3171 Acc=0.8598
epoch 0 step 1200 Loss=0.3120 Acc=0.8627
epoch 0 step 1300 Loss=0.3072 Acc=0.8656
epoch 0 step 1400 Loss=0.3017 Acc=0.8684
epoch 0 step 1500 Loss=0.2978 Acc=0.8706
epoch 0 step 1600 Loss=0.2947 Acc=0.8721
epoch 0 step 1700 Loss=0.2917 Acc=0.8738
epoch 0 step 1800 Loss=0.2887 Acc=0.8754
epoch 0 step 1900 Loss=0.2860 Acc=0.8767
epoch 0 step 2000 Loss=0.2839 Acc=0.8778
epoch 0 step 2100 Loss=0.2812 Acc=0.8792
epoch 0 step 2200 Loss=0.2793 Acc=0.8802
epoch 0 step 2300 Loss=0.2770 Acc=0.8814
epoch 0 step 2400 Loss=0.274

In [51]:
# Persist the Amazon-trained weights. The IMDB cells restore the model from
# exactly this path, so without this step the notebook cannot be re-run from a
# fresh kernel unless the checkpoint already exists on disk.
tf.io.gfile.makedirs('./model/sa')  # no-op when the directory already exists
classifier.save_weights('./model/sa/sentiment_analyser')

#### Performance on IMDB test w/o Fine-Tuning

In [117]:
# Review texts and sentiment labels of the IMDB test split, as plain lists.
test_corpus_imdb = test_df_imdb['review'].tolist()
test_target_imdb = test_df_imdb['sentiment'].tolist()

In [118]:
# Same preprocessing as the Amazon data: lower-case and tokenise ...
tokenized_test_corpus_imdb = [wpt.tokenize(doc.lower()) for doc in test_corpus_imdb]
# ... map tokens to embedding indices ...
test_corpus_imdb_seq = [ [getWordIndex(token) for token in tokens] for tokens in tokenized_test_corpus_imdb]
# ... and pad/truncate to the length derived from the Amazon training split.
test_corpus_imdb_seq = pad_sequences(test_corpus_imdb_seq,MAX_TR_SEQ_LEN, padding='post',truncating='post')

In [119]:
# Batched view of the IMDB test split. It is only used for scoring, so it is
# deliberately not shuffled: the reported loss is an average of per-batch means
# and the last batch can be smaller, so shuffling would make the number depend
# on which samples land in which batch.
test_dataset_imdb_iterator = tf.data.Dataset.from_tensor_slices((test_corpus_imdb_seq,test_target_imdb))
test_dataset_imdb_iterator = test_dataset_imdb_iterator.batch(64)

In [127]:
# Build a second classifier with the same configuration as the Amazon model.
# NOTE(review): training = True is also passed here although this model is
# first used for evaluation; confirm whether that keeps dropout active.
classifier_imdb = DocClassifier(vocab_size+2,
                           embedding_dim = 50,
                           dropout_rate = 0.3,
                           training = True)        
# Call the model once on a dummy batch; presumably this creates the variables
# that load_weights restores into -- TODO confirm.
classifier_imdb(np.array([np.arange(50)]))
# Initialize from the checkpoint of the Amazon-trained sentiment model.
classifier_imdb.load_weights('./model/sa/sentiment_analyser')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x289aeaa7e48>

In [128]:
# Fresh running averages for this evaluation.
test_loss = tf.keras.metrics.Mean()
test_acc = tf.keras.metrics.BinaryAccuracy()

# Score the restored model on the IMDB test split, batch by batch.
for test_step, (test_docs, test_targets) in enumerate(test_dataset_imdb_iterator) :
    test_pred =  classifier_imdb(test_docs)
    # cross_entropy is unreduced, so average it per batch before accumulating.
    testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
    test_loss(testloss)
    test_acc.update_state(test_targets, test_pred)
print("Test Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))

Test Loss = 0.4726 Acc=0.8079


### More Fine Tuning

In [129]:
# Reset the fc1 layer before fine-tuning: replace each of its two weight arrays
# (presumably kernel and bias) with zeros of the same shape and dtype.
zero_init, zero_init_bias = (np.zeros_like(weight) for weight in classifier_imdb.fc1.get_weights())
classifier_imdb.fc1.set_weights([zero_init, zero_init_bias])

In [130]:
# Fine-tune on a fixed-seed 30% sample of the IMDB training split.
train_df_imdb_small = train_df_imdb.sample(frac=0.30, random_state = 42)
corpus_imdb_small = train_df_imdb_small['review'].tolist()
target_imdb_small = train_df_imdb_small['sentiment'].tolist()

In [131]:
# Lower-case and tokenise the sampled IMDB reviews.
tokenized_imdb_small_train_corpus = [wpt.tokenize(doc.lower()) for doc in corpus_imdb_small]
# Map tokens to embedding indices.
corpus_imdb_small_seq = [ [getWordIndex(token) for token in tokens] for tokens in tokenized_imdb_small_train_corpus]
# Pad/truncate to the sequence length derived from the Amazon training split.
corpus_imdb_small_seq = pad_sequences(corpus_imdb_small_seq,MAX_TR_SEQ_LEN, padding='post',truncating='post')

In [132]:
# New optimizer for fine-tuning, with a learning rate of 1e-4 (the Amazon
# training cell used 1e-3).
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# Running averages of the training loss and accuracy.
loss_metric = tf.keras.metrics.Mean()
acc = tf.keras.metrics.BinaryAccuracy()

# Running averages of the test loss and accuracy.
test_loss = tf.keras.metrics.Mean()
test_acc = tf.keras.metrics.BinaryAccuracy()

In [133]:
num_epochs = 6

# Both datasets are loop-invariant, so they are built once rather than once
# per epoch. reshuffle_each_iteration=True still gives every epoch a new
# training order.
train_dataset = tf.data.Dataset.from_tensor_slices((corpus_imdb_small_seq, target_imdb_small))
train_dataset = train_dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=True).batch(32)

# Evaluation data for this cell. It is scored below, so the cell no longer
# relies on an iterator created in a different cell. It is left unshuffled
# because the reported loss is an average of per-batch means.
test_dataset_iterator = tf.data.Dataset.from_tensor_slices((test_corpus_imdb_seq, test_target_imdb))
test_dataset_iterator = test_dataset_iterator.batch(32)

# NOTE(review): every trainable weight of the model is updated here, not only
# the final layer -- confirm this is the intended fine-tuning scope.
finetune_weights = classifier_imdb.trainable_weights

for epoch in range(num_epochs):
    # Start every epoch with fresh running averages.
    # reset_state() replaces the deprecated reset_states().
    loss_metric.reset_state()
    acc.reset_state()
    test_loss.reset_state()
    test_acc.reset_state()

    # ---- one optimisation pass over the IMDB sample ----
    for step, (documents_batch, target_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            pred = classifier_imdb(documents_batch)
            # cross_entropy is unreduced (Reduction.NONE); average it here.
            loss = tf.reduce_mean(cross_entropy(target_batch, pred))
        gradients = tape.gradient(loss, finetune_weights)
        optimizer.apply_gradients(zip(gradients, finetune_weights))
        loss_metric(loss)
        acc.update_state(target_batch, pred)
        if step % 100==0:
            print("epoch %d step %d Loss=%.4f Acc=%.4f" % (epoch,
                                                  step, 
                                                  loss_metric.result(),
                                                  acc.result()))

    # ---- score the IMDB test split after each epoch ----
    for test_step, (test_docs, test_targets) in enumerate(test_dataset_iterator):
        test_pred = classifier_imdb(test_docs)
        testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
        test_loss(testloss)
        test_acc.update_state(test_targets, test_pred)
    print("Test Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))

epoch 0 step 0 Loss=0.6931 Acc=0.5625
epoch 0 step 100 Loss=0.6327 Acc=0.8054
epoch 0 step 200 Loss=0.5683 Acc=0.8113
Test Loss = 0.4417 Acc=0.8121
epoch 1 step 0 Loss=0.5509 Acc=0.7188
epoch 1 step 100 Loss=0.4193 Acc=0.8236
epoch 1 step 200 Loss=0.4095 Acc=0.8266
Test Loss = 0.3986 Acc=0.8175
epoch 2 step 0 Loss=0.5169 Acc=0.7500
epoch 2 step 100 Loss=0.3683 Acc=0.8394
epoch 2 step 200 Loss=0.3739 Acc=0.8344
Test Loss = 0.3920 Acc=0.8190
epoch 3 step 0 Loss=0.3548 Acc=0.8125
epoch 3 step 100 Loss=0.3551 Acc=0.8419
epoch 3 step 200 Loss=0.3519 Acc=0.8456
Test Loss = 0.3889 Acc=0.8210
epoch 4 step 0 Loss=0.2871 Acc=0.8750
epoch 4 step 100 Loss=0.3323 Acc=0.8546
epoch 4 step 200 Loss=0.3322 Acc=0.8576
Test Loss = 0.3876 Acc=0.8221
epoch 5 step 0 Loss=0.2787 Acc=0.8438
epoch 5 step 100 Loss=0.3157 Acc=0.8660
epoch 5 step 200 Loss=0.3138 Acc=0.8666
Test Loss = 0.3928 Acc=0.8222


## Training on IMDB From Scratch

In [134]:
# Replace classifier_imdb with a brand-new model of the same configuration;
# no checkpoint is loaded this time.
classifier_imdb = DocClassifier(vocab_size+2,
                           embedding_dim = 50,
                           dropout_rate = 0.3,
                           training = True)        
# Call the model once on a dummy batch; presumably this creates the layer
# variables so that set_weights works -- TODO confirm.
classifier_imdb(np.array([np.arange(50)]))
# Initialize the embedding layer with the Word2Vec-based matrix.
classifier_imdb.doc_model.embedding.set_weights([embeddings])

In [135]:
# New optimizer and metric objects for this training run.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
# Running averages of the training loss and accuracy.
loss_metric = tf.keras.metrics.Mean()
acc = tf.keras.metrics.BinaryAccuracy()

# Running averages of the test loss and accuracy.
test_loss = tf.keras.metrics.Mean()
test_acc = tf.keras.metrics.BinaryAccuracy()

In [136]:
num_epochs = 5

# Both datasets are loop-invariant, so they are built once rather than once
# per epoch. reshuffle_each_iteration=True still gives every epoch a new
# training order.
train_dataset = tf.data.Dataset.from_tensor_slices((corpus_imdb_small_seq, target_imdb_small))
train_dataset = train_dataset.shuffle(buffer_size=1024, reshuffle_each_iteration=True).batch(32)

# Evaluation data for this cell. It is scored below, so the cell no longer
# relies on an iterator created in a different cell. It is left unshuffled
# because the reported loss is an average of per-batch means.
test_dataset_iterator = tf.data.Dataset.from_tensor_slices((test_corpus_imdb_seq, test_target_imdb))
test_dataset_iterator = test_dataset_iterator.batch(32)

# Only the fc1 layer and the document-embedding layer are updated.
# NOTE(review): with the remaining layers left at their initial values this is
# not a full "from scratch" training run -- confirm the intended weight set.
final_layer_weights = (classifier_imdb.fc1.trainable_weights
                       + classifier_imdb.doc_model.doc_embedding.trainable_weights)

for epoch in range(num_epochs):
    # Start every epoch with fresh running averages.
    # reset_state() replaces the deprecated reset_states().
    loss_metric.reset_state()
    acc.reset_state()
    test_loss.reset_state()
    test_acc.reset_state()

    # ---- one optimisation pass over the IMDB sample ----
    for step, (documents_batch, target_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            pred = classifier_imdb(documents_batch)
            # cross_entropy is unreduced (Reduction.NONE); average it here.
            loss = tf.reduce_mean(cross_entropy(target_batch, pred))
        gradients = tape.gradient(loss, final_layer_weights)
        optimizer.apply_gradients(zip(gradients, final_layer_weights))
        loss_metric(loss)
        acc.update_state(target_batch, pred)
        if step % 100==0:
            print("epoch %d step %d Loss=%.4f Acc=%.4f" % (epoch,
                                                  step, 
                                                  loss_metric.result(),
                                                  acc.result()))

    # ---- score the IMDB test split after each epoch ----
    for test_step, (test_docs, test_targets) in enumerate(test_dataset_iterator):
        test_pred = classifier_imdb(test_docs)
        testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
        test_loss(testloss)
        test_acc.update_state(test_targets, test_pred)
    print("Test Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))

epoch 0 step 0 Loss=0.7441 Acc=0.4375
epoch 0 step 100 Loss=0.6928 Acc=0.5213
epoch 0 step 200 Loss=0.6902 Acc=0.5350
Test Loss = 0.6836 Acc=0.5836
epoch 1 step 0 Loss=0.6856 Acc=0.5000
epoch 1 step 100 Loss=0.6819 Acc=0.5727
epoch 1 step 200 Loss=0.6814 Acc=0.5766
Test Loss = 0.6785 Acc=0.6022
epoch 2 step 0 Loss=0.6759 Acc=0.6562
epoch 2 step 100 Loss=0.6755 Acc=0.6061
epoch 2 step 200 Loss=0.6754 Acc=0.6007
Test Loss = 0.6739 Acc=0.6135
epoch 3 step 0 Loss=0.6661 Acc=0.6562
epoch 3 step 100 Loss=0.6715 Acc=0.6194
epoch 3 step 200 Loss=0.6708 Acc=0.6182
Test Loss = 0.6700 Acc=0.6207
epoch 4 step 0 Loss=0.6999 Acc=0.4375
epoch 4 step 100 Loss=0.6663 Acc=0.6071
epoch 4 step 200 Loss=0.6670 Acc=0.6096
Test Loss = 0.6689 Acc=0.5858
