In [2]:
from dataloader.loader import Loader
from preprocessing.utils import remove_empty_docs
import numpy as np

## Datasets : Amazon Review and IMDB

Load Amazon Product review Dataset

In [3]:
# Read the training split of the Amazon product-review data via the project loader.
train_df = Loader.load_amazon_reviews("train")
# Quick sanity check on the loaded frame: (rows, columns).
print(train_df.shape)

(3600000, 2)


This is a very diverse dataset, so we work with a random subset of it.

In [52]:
# Number of reviews drawn from the full training frame.
Sample_size = 200_000
# A fixed random_state keeps the drawn subset identical across runs.
dataset = train_df.sample(random_state=42, n=Sample_size)
# Display the class balance of the sampled subset.
dataset["sentiment"].value_counts()

1    100020
0     99980
Name: sentiment, dtype: int64

In [53]:
# Pull the review texts and their sentiment labels out of the sampled frame.
corpus_amazon, target_amazon = (
    dataset[column].values for column in ("review", "sentiment")
)
print(corpus_amazon.shape, target_amazon.shape)
# Pass texts and labels through the project's remove_empty_docs helper
# (presumably drops blank reviews together with their labels -- confirm in preprocessing.utils).
corpus_amazon, target_amazon = remove_empty_docs(corpus_amazon, target_amazon)

(200000,) (200000,)


Load the IMDB dataset

In [7]:
# Load both IMDB splits; the test split is kept for the evaluation cells further down.
train_df_imdb, test_df_imdb = (
    Loader.load_imdb_data(split) for split in ("train", "test")
)
# Training texts and labels as arrays, ready to be merged with the Amazon data.
corpus_imdb, target_imdb = (
    train_df_imdb[column].values for column in ("review", "sentiment")
)

In [8]:
# Stack IMDB first, then Amazon, keeping texts and labels aligned.
corpus = np.concatenate([corpus_imdb, corpus_amazon])
target = np.concatenate([target_imdb, target_amazon])
print("LEN COUPUS=", len(corpus))

LEN COUPUS= 125000


### Prepare data for Skip-Gram Model Training

In [12]:
from nltk import WordPunctTokenizer
from gensim.models import Word2Vec
import pandas as pd
from gensim.models import KeyedVectors

In [10]:
# One shared tokenizer instance; it is reused by the later tokenization cells.
wpt = WordPunctTokenizer()
# Lower-case every document before splitting it into tokens.
tokenized_corpus = [
    wpt.tokenize(document.lower())
    for document in corpus
]

In [11]:
# May take 2-3 minutes to run.
w2v_model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=50,
    window=5,
    min_count=5,
    sg=1,  # 1 selects skip-gram
)

In [13]:
# Write the learned vectors to disk in word2vec text format, then read them
# back as a standalone KeyedVectors object.
w2v_model.wv.save_word2vec_format(fname="word2vec.txt")
word_vectors = KeyedVectors.load_word2vec_format("word2vec.txt", binary=False)

# Number of distinct tokens that have a vector.
vocab_size = len(word_vectors.key_to_index)
print("Vocabulory Size:", vocab_size)

Vocabulory Size: 44600


In [14]:
# For every probe word, collect its five nearest neighbours in the embedding space.
similar_words = {
    search_term: [
        neighbour
        for neighbour, _similarity in word_vectors.most_similar([search_term], topn=5)
    ]
    for search_term in [
        'broken', 'damaged', 'awesome', 'useful', 'good', 'easy', 'violent',
        'romantic', 'nasty', 'unfortunate', 'predictable', 'hilarious',
        'fascinating', 'boring', 'confused', 'sensitive', 'imaginative',
        'senseless', 'bland', 'disappointing',
    ]
}
# One row per probe word, one column per neighbour rank.
pd.DataFrame(similar_words).T

Unnamed: 0,0,1,2,3,4
broken,shattered,cracked,dented,chipped,damaged
damaged,defective,broken,dented,faulty,scratched
awesome,awsome,amazing,fantastic,excelent,great
useful,usefull,helpful,valuable,beneficial,specific
good,great,decent,bad,nice,ok
easy,difficult,simple,quick,convenient,hard
violent,brutal,misogynistic,sadistic,gory,vicious
romantic,romance,fairytale,comedy,screwball,erotic
nasty,freaky,boob,decapitation,sickening,gross
unfortunate,obvious,chiefly,dubious,redemptive,loathed


## Training Sentiment Analyser on the Amazon Review Dataset

In [33]:
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
from cnn_docmodel import DocClassifier

In [87]:
# The first 185,000 cleaned reviews train the classifier; the rest are held out.
train_corpus_amazon, test_corpus_amazon = corpus_amazon[:185000], corpus_amazon[185000:]
train_target_amazon, test_target_amazon = target_amazon[:185000], target_amazon[185000:]

print(
    "Train Size = {} , test Size = {}".format(
        len(train_corpus_amazon), len(test_corpus_amazon)
    )
)

Train Size = 185000 , test Size = 15000


In [88]:
# Tokenize both Amazon splits the same way as the word2vec corpus: lower-case, then split.
tokenized_train_corpus = [
    wpt.tokenize(review.lower()) for review in train_corpus_amazon
]
tokenized_test_corpus = [
    wpt.tokenize(review.lower()) for review in test_corpus_amazon
]

We will use zero padding to bring all documents to the same length.
For out-of-vocabulary (OOV) words we will use the index vocab_size+1.

In [89]:
def getWordIndex(word):
    """Map a token to its row in the classifier's embedding matrix.

    Index 0 is left for the padding value, so an in-vocabulary token is
    shifted by one relative to its position in ``word_vectors``.

    Parameters
    ----------
    word : str
        Token to look up.

    Returns
    -------
    int
        ``word_vectors.get_index(word) + 1`` when the token is known,
        otherwise ``vocab_size + 1`` (the out-of-vocabulary slot).
    """
    try:
        return word_vectors.get_index(word) + 1
    except KeyError:
        # Only a missing key means "out of vocabulary". Any other failure
        # (or a KeyboardInterrupt) must surface instead of being silently
        # encoded as the OOV index.
        return vocab_size + 1

In [90]:
# Encode every training document as a list of embedding indices.
corpus_to_seq = [list(map(getWordIndex, tokens)) for tokens in tokenized_train_corpus]
# Use the mean document length (truncated to an int) as the fixed sequence length.
MAX_TR_SEQ_LEN = int(np.mean(list(map(len, corpus_to_seq))))
# Pad short documents with zeros at the end and cut long ones at the end.
corpus_to_seq = pad_sequences(
    corpus_to_seq,
    maxlen=MAX_TR_SEQ_LEN,
    padding="post",
    truncating="post",
)

In [91]:
# Inspect one encoded review; trailing zeros come from the 'post' padding.
corpus_to_seq[1]

array([6119,   98,  593,    1,   75,   25,    2, 4220,  972,    3,    2,
       6119,   11,   98,  593,   15,   30, 4156,    1,   99,    3,    2,
         25,  442, 2398, 3918,   37,   20,   27,    6, 9647,   10,    6,
        821, 2781, 7652,    1,   23,    2,  206,    7,  560,   43, 5515,
          1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0])

In [92]:
# Encode the held-out Amazon reviews and pad/truncate them to the training length.
test_corpus_amazon_seq = [list(map(getWordIndex, tokens)) for tokens in tokenized_test_corpus]
test_corpus_amazon_seq = pad_sequences(
    test_corpus_amazon_seq,
    maxlen=MAX_TR_SEQ_LEN,
    padding="post",
    truncating="post",
)

Create CNN Document Model

In [93]:
# Embedding table size is vocab_size + 2: index 0 is the padding token and
# index vocab_size + 1 is the OOV token.
classifier = DocClassifier(
    vocab_size + 2,
    embedding_dim=50,
    dropout_rate=0.3,
    training=True,
)
# Run one dummy batch of shape (1, 50) through the model before asking for a
# summary (presumably this is what creates the model's variables -- confirm in cnn_docmodel).
classifier(np.arange(50).reshape(1, -1))
classifier.summary()

Model: "sentiment_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 cnn_document_model (Documen  multiple                 2260881   
 tEncoder)                                                       
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
 dense_19 (Dense)            multiple                  257       
                                                                 
Total params: 2,261,138
Trainable params: 2,261,138
Non-trainable params: 0
_________________________________________________________________


In [94]:
# Show the layers of the nested document-encoder sub-model.
classifier.doc_model.summary()

Model: "cnn_document_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     multiple                  2230100   
                                                                 
 conv1d_27 (Conv1D)          multiple                  2525      
                                                                 
 conv1d_28 (Conv1D)          multiple                  3775      
                                                                 
 conv1d_29 (Conv1D)          multiple                  5025      
                                                                 
 dropout_18 (Dropout)        multiple                  0 (unused)
                                                                 
 dense_18 (Dense)            multiple                  19456     
                                                                 
 global_max_pooling1d_9 (Glo  multiple          

Initialize Embeddings

In [95]:
# Row 0: all-zero vector for the padding index.
# Rows 1..vocab_size: the pretrained word2vec vectors.
# Last row: the mean of all vectors, used for out-of-vocabulary tokens.
embeddings = np.vstack([
    np.zeros((1, 50)),
    word_vectors.vectors,
    word_vectors.vectors.mean(axis=0, keepdims=True),
])

In [96]:
# Load the `embeddings` matrix into the encoder's embedding layer.
classifier.doc_model.embedding.set_weights([embeddings])

### Train and Validate SA Model

In [97]:
# Per-example binary cross-entropy; the training loop averages it explicitly.
cross_entropy = tf.keras.losses.BinaryCrossentropy(
    reduction=tf.keras.losses.Reduction.NONE
)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Running averages for the training pass ...
loss_metric, acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()
# ... and for the held-out pass.
test_loss, test_acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()

In [98]:
num_epochs = 2

# The input pipelines do not change between epochs, so build them once.
# `shuffle` reshuffles on every new iteration by default, so each epoch still
# sees the training data in a different order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((corpus_to_seq, train_target_amazon))
    .shuffle(buffer_size=1024)
    .batch(64)
)
# Shuffling adds nothing to evaluation; a fixed order keeps it deterministic.
test_dataset_iterator = tf.data.Dataset.from_tensor_slices(
    (test_corpus_amazon_seq, test_target_amazon)
).batch(64)

for epoch in range(num_epochs):
    # Start every epoch with empty running averages.
    for metric in (loss_metric, acc, test_loss, test_acc):
        metric.reset_state()

    # ---- training pass: update every trainable weight of the classifier ----
    for step, (documents_batch, target_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            pred = classifier(documents_batch)
            # cross_entropy returns per-example losses; reduce to a scalar here.
            loss = tf.reduce_mean(cross_entropy(target_batch, pred))
        gradients = tape.gradient(loss, classifier.trainable_weights)
        optimizer.apply_gradients(zip(gradients, classifier.trainable_weights))
        loss_metric(loss)
        acc.update_state(target_batch, pred)
        if step % 100 == 0:
            print("epoch %d step %d Loss=%.4f Acc=%.4f" % (epoch,
                                                           step,
                                                           loss_metric.result(),
                                                           acc.result()))

    # ---- held-out pass on the Amazon test split ----
    # NOTE(review): the classifier was constructed with training=True; whether
    # dropout is active during this pass depends on DocClassifier -- confirm.
    for test_docs, test_targets in test_dataset_iterator:
        test_pred = classifier(test_docs)
        testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
        test_loss(testloss)
        test_acc.update_state(test_targets, test_pred)
    print("Validation Loss = %.4f Acc=%.4f" % (test_loss.result(), test_acc.result()))


epoch 0 step 0 Loss=0.7437 Acc=0.4375
epoch 0 step 100 Loss=0.5841 Acc=0.6847
epoch 0 step 200 Loss=0.4791 Acc=0.7617
epoch 0 step 300 Loss=0.4258 Acc=0.7930
epoch 0 step 400 Loss=0.3960 Acc=0.8113
epoch 0 step 500 Loss=0.3748 Acc=0.8243
epoch 0 step 600 Loss=0.3595 Acc=0.8341
epoch 0 step 700 Loss=0.3480 Acc=0.8406
epoch 0 step 800 Loss=0.3384 Acc=0.8464
epoch 0 step 900 Loss=0.3331 Acc=0.8501
epoch 0 step 1000 Loss=0.3258 Acc=0.8546
epoch 0 step 1100 Loss=0.3197 Acc=0.8582
epoch 0 step 1200 Loss=0.3147 Acc=0.8608
epoch 0 step 1300 Loss=0.3096 Acc=0.8635
epoch 0 step 1400 Loss=0.3041 Acc=0.8663
epoch 0 step 1500 Loss=0.3000 Acc=0.8686
epoch 0 step 1600 Loss=0.2966 Acc=0.8706
epoch 0 step 1700 Loss=0.2935 Acc=0.8725
epoch 0 step 1800 Loss=0.2910 Acc=0.8739
epoch 0 step 1900 Loss=0.2881 Acc=0.8755
epoch 0 step 2000 Loss=0.2862 Acc=0.8766
epoch 0 step 2100 Loss=0.2837 Acc=0.8780
epoch 0 step 2200 Loss=0.2812 Acc=0.8793
epoch 0 step 2300 Loss=0.2789 Acc=0.8806
epoch 0 step 2400 Loss=0.276

In [99]:
# Persist the Amazon-trained weights so a fresh model can reload them later.
classifier.save_weights("./model/sa/sentiment_analyser")

#### Performance on IMDB test w/o Fine-Tuning

In [100]:
# IMDB test reviews and labels as plain Python lists.
test_corpus_imdb, test_target_imdb = (
    test_df_imdb[column].tolist() for column in ("review", "sentiment")
)

In [101]:
# Same preprocessing as for the Amazon data: lower-case, tokenize, map to
# embedding indices, then pad/truncate to the Amazon training length.
tokenized_test_corpus_imdb = [
    wpt.tokenize(review.lower()) for review in test_corpus_imdb
]
test_corpus_imdb_seq = [
    list(map(getWordIndex, tokens)) for tokens in tokenized_test_corpus_imdb
]
test_corpus_imdb_seq = pad_sequences(
    test_corpus_imdb_seq,
    maxlen=MAX_TR_SEQ_LEN,
    padding="post",
    truncating="post",
)

In [102]:
# Batched tf.data pipeline over the encoded IMDB test set (shuffle buffer 1024, batches of 64).
test_dataset_imdb_iterator = (
    tf.data.Dataset.from_tensor_slices((test_corpus_imdb_seq, test_target_imdb))
    .shuffle(buffer_size=1024)
    .batch(64)
)

In [162]:
# Fresh model with the same architecture as the Amazon classifier.
classifier_imdb = DocClassifier(
    vocab_size + 2,
    embedding_dim=50,
    dropout_rate=0.3,
    training=True,
)
# One dummy forward pass of shape (1, 50) before the checkpoint is restored.
classifier_imdb(np.arange(50).reshape(1, -1))
# Initialize from the weights saved after Amazon training.
classifier_imdb.load_weights("./model/sa/sentiment_analyser")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x142621a93c8>

In [163]:
# Fresh accumulators for evaluating the Amazon-trained model on IMDB as-is.
test_loss, test_acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()

for test_docs, test_targets in test_dataset_imdb_iterator:
    test_pred = classifier_imdb(test_docs)
    # Average the per-example losses of the batch, then feed the running mean.
    test_loss.update_state(tf.reduce_mean(cross_entropy(test_targets, test_pred)))
    test_acc.update_state(test_targets, test_pred)

print("Test Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))

Test Loss = 0.4682 Acc=0.8088


### More Fine Tuning

In [164]:
# Take a reproducible 30% sample of the IMDB training frame for fine-tuning.
train_df_imdb_small = train_df_imdb.sample(random_state=42, frac=0.30)
corpus_imdb_small, target_imdb_small = (
    train_df_imdb_small[column].tolist() for column in ("review", "sentiment")
)

In [165]:
# Lower-case, tokenize and index-encode the small IMDB training set, then
# pad/truncate it to the same length used for the Amazon sequences.
tokenized_imdb_small_train_corpus = [
    wpt.tokenize(review.lower()) for review in corpus_imdb_small
]
corpus_imdb_small_seq = [
    list(map(getWordIndex, tokens)) for tokens in tokenized_imdb_small_train_corpus
]
corpus_imdb_small_seq = pad_sequences(
    corpus_imdb_small_seq,
    maxlen=MAX_TR_SEQ_LEN,
    padding="post",
    truncating="post",
)

In [166]:
# New optimizer with a smaller step size (1e-4) for the fine-tuning run.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Fresh running averages for the training pass and for the test pass.
loss_metric, acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()
test_loss, test_acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()

In [167]:
num_epochs = 5

# The training pipeline does not change between epochs, so build it once.
# `shuffle` reshuffles on every new iteration by default, so each epoch still
# sees the data in a different order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((corpus_imdb_small_seq, target_imdb_small))
    .shuffle(buffer_size=1024)
    .batch(32)
)

# Fine-tuning updates only these variables: the classifier's `fc1` layer and
# the encoder's `doc_embedding` layer. All other weights keep their loaded
# values. The list is loop-invariant, so it is computed once.
final_layer_weights = (
    classifier_imdb.fc1.trainable_weights
    + classifier_imdb.doc_model.doc_embedding.trainable_weights
)

for epoch in range(num_epochs):
    # Start every epoch with empty running averages.
    for metric in (loss_metric, acc, test_loss, test_acc):
        metric.reset_state()

    # ---- training pass on the 30% IMDB sample ----
    for step, (documents_batch, target_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            pred = classifier_imdb(documents_batch)
            # cross_entropy returns per-example losses; reduce to a scalar here.
            loss = tf.reduce_mean(cross_entropy(target_batch, pred))
        gradients = tape.gradient(loss, final_layer_weights)
        optimizer.apply_gradients(zip(gradients, final_layer_weights))
        loss_metric(loss)
        acc.update_state(target_batch, pred)
        if step % 100 == 0:
            print("epoch %d step %d Loss=%.4f Acc=%.4f" % (epoch,
                                                           step,
                                                           loss_metric.result(),
                                                           acc.result()))

    # ---- evaluation on the IMDB test pipeline (test_dataset_imdb_iterator) ----
    for test_docs, test_targets in test_dataset_imdb_iterator:
        test_pred = classifier_imdb(test_docs)
        testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
        test_loss(testloss)
        test_acc.update_state(test_targets, test_pred)
    print("Test Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))

epoch 0 step 0 Loss=0.3464 Acc=0.7812
epoch 0 step 100 Loss=0.4623 Acc=0.8091
epoch 0 step 200 Loss=0.4392 Acc=0.8111
Test Loss = 0.4212 Acc=0.8092
epoch 1 step 0 Loss=0.2490 Acc=0.8438
epoch 1 step 100 Loss=0.4244 Acc=0.8082
epoch 1 step 200 Loss=0.4109 Acc=0.8123
Test Loss = 0.4132 Acc=0.8089
epoch 2 step 0 Loss=0.3421 Acc=0.9062
epoch 2 step 100 Loss=0.4115 Acc=0.8122
epoch 2 step 200 Loss=0.4072 Acc=0.8133
Test Loss = 0.4113 Acc=0.8104
epoch 3 step 0 Loss=0.4801 Acc=0.7812
epoch 3 step 100 Loss=0.4120 Acc=0.8134
epoch 3 step 200 Loss=0.4024 Acc=0.8178
Test Loss = 0.4103 Acc=0.8105
epoch 4 step 0 Loss=0.4198 Acc=0.7812
epoch 4 step 100 Loss=0.4148 Acc=0.8131
epoch 4 step 200 Loss=0.4021 Acc=0.8165
Test Loss = 0.4094 Acc=0.8108


## Training on IMDB From Scratch

In [168]:
# Brand-new model: nothing is restored from the Amazon checkpoint here.
classifier_imdb = DocClassifier(
    vocab_size + 2,
    embedding_dim=50,
    dropout_rate=0.3,
    training=True,
)
# One dummy forward pass of shape (1, 50) before the embedding weights are assigned.
classifier_imdb(np.arange(50).reshape(1, -1))
# Initialize only the embedding layer, using the word2vec-based `embeddings` matrix.
classifier_imdb.doc_model.embedding.set_weights([embeddings])

In [169]:
# Same optimizer settings as the fine-tuning run (Adam, 1e-4), with new state.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# Fresh running averages for the training pass and for the test pass.
loss_metric, acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()
test_loss, test_acc = tf.keras.metrics.Mean(), tf.keras.metrics.BinaryAccuracy()

In [170]:
num_epochs = 5

# The training pipeline does not change between epochs, so build it once.
# `shuffle` reshuffles on every new iteration by default, so each epoch still
# sees the data in a different order.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((corpus_imdb_small_seq, target_imdb_small))
    .shuffle(buffer_size=1024)
    .batch(32)
)

# Only the classifier's `fc1` layer and the encoder's `doc_embedding` layer
# are updated. The list is loop-invariant, so it is computed once.
# NOTE(review): this section is titled "from scratch", yet the encoder's other
# layers (e.g. its Conv1D layers) are never updated in this loop and stay at
# their initial values -- confirm this restriction is intended for the
# comparison.
final_layer_weights = (
    classifier_imdb.fc1.trainable_weights
    + classifier_imdb.doc_model.doc_embedding.trainable_weights
)

for epoch in range(num_epochs):
    # Start every epoch with empty running averages.
    for metric in (loss_metric, acc, test_loss, test_acc):
        metric.reset_state()

    # ---- training pass on the 30% IMDB sample ----
    for step, (documents_batch, target_batch) in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            pred = classifier_imdb(documents_batch)
            # cross_entropy returns per-example losses; reduce to a scalar here.
            loss = tf.reduce_mean(cross_entropy(target_batch, pred))
        gradients = tape.gradient(loss, final_layer_weights)
        optimizer.apply_gradients(zip(gradients, final_layer_weights))
        loss_metric(loss)
        acc.update_state(target_batch, pred)
        if step % 100 == 0:
            print("epoch %d step %d Loss=%.4f Acc=%.4f" % (epoch,
                                                           step,
                                                           loss_metric.result(),
                                                           acc.result()))

    # ---- evaluation on the IMDB test pipeline (test_dataset_imdb_iterator) ----
    for test_docs, test_targets in test_dataset_imdb_iterator:
        test_pred = classifier_imdb(test_docs)
        testloss = tf.reduce_mean(cross_entropy(test_targets, test_pred))
        test_loss(testloss)
        test_acc.update_state(test_targets, test_pred)
    print("Test Loss = %.4f Acc=%.4f"% (test_loss.result(), test_acc.result()))

epoch 0 step 0 Loss=0.7338 Acc=0.4688
epoch 0 step 100 Loss=0.6922 Acc=0.5186
epoch 0 step 200 Loss=0.6893 Acc=0.5339
Test Loss = 0.6826 Acc=0.5760
epoch 1 step 0 Loss=0.6673 Acc=0.6562
epoch 1 step 100 Loss=0.6786 Acc=0.5959
epoch 1 step 200 Loss=0.6773 Acc=0.5920
Test Loss = 0.6750 Acc=0.6088
epoch 2 step 0 Loss=0.6724 Acc=0.6562
epoch 2 step 100 Loss=0.6712 Acc=0.6126
epoch 2 step 200 Loss=0.6698 Acc=0.6132
Test Loss = 0.6754 Acc=0.5689
epoch 3 step 0 Loss=0.6547 Acc=0.5625
epoch 3 step 100 Loss=0.6657 Acc=0.6132
epoch 3 step 200 Loss=0.6634 Acc=0.6155
Test Loss = 0.6633 Acc=0.6337
epoch 4 step 0 Loss=0.6280 Acc=0.7500
epoch 4 step 100 Loss=0.6611 Acc=0.6284
epoch 4 step 200 Loss=0.6576 Acc=0.6373
Test Loss = 0.6630 Acc=0.6130
