**Text and Sequence**

Phani Varshitha, Durga Chowdary 

We will be using the IMDB data for this text and sequence problem. First, we create a
validation set by setting apart 10,000 training reviews (5,000 per class); the remaining reviews stay available for training.

Reading Data

In [22]:
import os, shutil , pathlib , random
# Carve a validation split out of the IMDB training directory by moving
# files on disk: 5000 reviews per class go from train/<class> to
# validation/<class>.
base_dir = pathlib.Path("C:/Users/varshitha/Downloads/aclImdb_v1/aclImdb")
val_dir = base_dir / "validation"
train_dir = base_dir / "train"
num_val_samples = 5000  # files moved per class
for category in ("neg", "pos"):
    # Deliberately no exist_ok: re-running this cell fails here instead of
    # silently moving another 5000 files out of the training set.
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    # Fixed seed so the same listing always produces the same selection.
    random.Random(1337).shuffle(files)
    val_file = files[:num_val_samples]
    for fname in val_file:
        # The destination must be the full target path. Passing fname as a
        # third positional argument would be taken as shutil.move's
        # copy_function and fail on a cross-device move.
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

Making a small training sample as well :

In [23]:
# Build a very small training set by moving 50 reviews per class from
# train/<class> into train1/<class>.
train_dir_1 = base_dir / "train1"
num_train_samples = 50  # files moved per class
for category in ("neg", "pos"):
    os.makedirs(train_dir_1 / category)
    files = os.listdir(train_dir / category)
    # Fixed seed so the same listing always produces the same selection.
    random.Random(1337).shuffle(files)
    train_file = files[:num_train_samples]
    for fname in train_file:
        # The destination must be the full target path. Passing fname as a
        # third positional argument would be taken as shutil.move's
        # copy_function and fail on a cross-device move.
        shutil.move(train_dir / category / fname,
                    train_dir_1 / category / fname)

Reading our datasets :

In [24]:
from tensorflow import keras
# Load the three splits as batched text datasets; class labels come from
# the sub-directory names under each root.
batch_size = 32
train = keras.utils.text_dataset_from_directory(
    directory=train_dir_1,
    batch_size=batch_size,
)
validation = keras.utils.text_dataset_from_directory(
    directory=val_dir,
    batch_size=batch_size,
)
test = keras.utils.text_dataset_from_directory(
    directory=base_dir / "test",
    batch_size=batch_size,
)

Found 100 files belonging to 2 classes.
Found 10000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


Trying sequencing model:

Preparing dataset for this model:

In [25]:
from tensorflow.keras import layers
max_length = 150    # every review becomes a sequence of exactly 150 token ids
max_tokens = 10000  # vocabulary is capped at 10,000 entries

# Maps raw review text to a fixed-length sequence of integer token ids.
text_vectorization = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=max_length,
)

# The vocabulary is learned from the training texts only (labels dropped).
text_only_train_ds = train.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)


def _vectorize(text, label):
    """Replace the raw text of a batch by its token ids; keep the label."""
    return text_vectorization(text), label


int_train_ds = train.map(_vectorize, num_parallel_calls=4)
int_val_ds = validation.map(_vectorize, num_parallel_calls=4)
int_test_ds = test.map(_vectorize, num_parallel_calls=4)


Model Construction - Embedding Layer:

In [26]:
import tensorflow as tf
# Binary sentiment classifier: trainable embedding -> bidirectional LSTM
# -> dropout -> single sigmoid unit.
inputs = keras.Input(shape=(None,), dtype="int64")
# mask_zero=True marks token id 0 (the padding value) as masked so the
# recurrent layer can skip padded positions.
embedded = layers.Embedding(
    input_dim=max_tokens,
    output_dim=256,
    mask_zero=True,
)(inputs)
recurrent = layers.Bidirectional(layers.LSTM(32))(embedded)
regularized = layers.Dropout(0.5)(recurrent)
outputs = layers.Dense(1, activation="sigmoid")(regularized)

model = keras.Model(inputs, outputs)
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

model.summary()

Fitting the model on our training data (validating on the validation set):



In [27]:
# Train for 10 epochs on the vectorized training set, reporting
# validation metrics after each epoch.
model.fit(
    x=int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 4s/step - accuracy: 0.5226 - loss: 0.6918 - val_accuracy: 0.4982 - val_loss: 0.6931
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 4s/step - accuracy: 0.6032 - loss: 0.6872 - val_accuracy: 0.5197 - val_loss: 0.6925
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4s/step - accuracy: 0.7837 - loss: 0.6763 - val_accuracy: 0.5188 - val_loss: 0.6921
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4s/step - accuracy: 0.8556 - loss: 0.6620 - val_accuracy: 0.5247 - val_loss: 0.6916
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4s/step - accuracy: 0.9340 - loss: 0.6511 - val_accuracy: 0.5335 - val_loss: 0.6910
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4s/step - accuracy: 0.9048 - loss: 0.6312 - val_accuracy: 0.5441 - val_loss: 0.6898
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x2bebf1874d0>

Testing this model:

In [28]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the single "accuracy" metric, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's accuracy:", round(test_metrics[1] * 100, 2), "%")

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 38ms/step - accuracy: 0.5012 - loss: 0.8357

 Model's accuracy: 50.19 %


Hence, our first model's accuracy with LSTM and a trained embedding is just 50.19%, which is quite low. We will now try a pre-trained word embedding.

Model Construction - Pretrained word embedding

Parsing the downloaded GloVe pre-trained word embeddings

In [33]:
import numpy as nppath_to_glove_file = "C:/Users/varshitha/Downloads/glove.6B/glove.6B.100d.txt"

embeddings_index={} 
with open(path_to_glove_file, encoding = "utf-8") as f:
    for line in f:
        words,coefs=line.split(maxsplit=1)
        coefs = np.fromstring(coefs,"f", sep=" ")
        embeddings_index[words]=coefs 

Preparing a matrix of GloVe :

In [34]:
# Build the (max_tokens, embedding_dim) matrix whose row i holds the GloVe
# vector of vocabulary entry i. Entries with no GloVe vector keep an
# all-zero row.
embedding_dim = 100
vocabulary = text_vectorization.get_vocabulary()
word_index = dict(zip(vocabulary, range(len(vocabulary))))
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_index.items():
    if i < max_tokens:
        embedding_vector = embeddings_index.get(word)
        # This check must be nested under the range check. Otherwise an
        # out-of-range index would reuse the previous word's vector and
        # write past the end of the matrix.
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Making an embedding layer with this embedding matrix:

In [35]:
# Embedding layer initialised from the GloVe matrix and frozen
# (trainable=False) so training does not update the pre-trained vectors.
# Token id 0 is masked as padding.
embedding_layer = layers.Embedding(
    max_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,
    mask_zero=True,
)

Making a final model with the pre-trained word embedding:

In [36]:
# Binary sentiment classifier fed by embedding_layer:
# embedding -> bidirectional LSTM -> dropout -> single sigmoid unit.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = embedding_layer(inputs)
recurrent = layers.Bidirectional(layers.LSTM(32))(embedded)
regularized = layers.Dropout(0.5)(recurrent)
outputs = layers.Dense(1, activation="sigmoid")(regularized)

model = keras.Model(inputs, outputs)
model.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)
model.summary()

# Checkpoint to disk; with save_best_only=True only the best epoch so far
# is kept.
checkpoint = keras.callbacks.ModelCheckpoint(
    "C:/Users/varshitha/Downloads/aclImdb_v1/aclImdb/glove_embeddings_sequence_model.keras",
    save_best_only=True,
)
callbacks = [checkpoint]

Training this model on our dataset :



In [37]:
# Train for 10 epochs with validation after each epoch; the callbacks list
# is passed through to fit().
model.fit(
    x=int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
    callbacks=callbacks,
)

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 10s/step - accuracy: 0.4906 - loss: 0.7368 - val_accuracy: 0.4999 - val_loss: 0.7029
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 9s/step - accuracy: 0.5486 - loss: 0.6926 - val_accuracy: 0.5317 - val_loss: 0.6893
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 8s/step - accuracy: 0.6558 - loss: 0.6394 - val_accuracy: 0.5038 - val_loss: 0.7104
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 9s/step - accuracy: 0.5860 - loss: 0.6914 - val_accuracy: 0.5431 - val_loss: 0.6874
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 9s/step - accuracy: 0.6252 - loss: 0.6620 - val_accuracy: 0.5547 - val_loss: 0.6845
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 8s/step - accuracy: 0.6975 - loss: 0.6273 - val_accuracy: 0.5353 - val_loss: 0.6896
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0

<keras.src.callbacks.history.History at 0x2bed611db50>

Testing this Model on our dataset:

In [38]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the single "accuracy" metric, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's Accuracy :", round(test_metrics[1] * 100, 2))

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 253ms/step - accuracy: 0.5301 - loss: 0.6969

 Model's Accuracy : 53.04


The pre-trained embedding is only marginally helpful in this case: 53.04% test accuracy versus 50.19% for the embedding trained from scratch, so with just 100 training samples both models are close to chance. Now, we will try to increase the training sample size and then train our model again:

Increase training size by 7000 samples

In [39]:
# Grow the working training set: move 3500 more reviews per class from
# train/<class> into the existing train1/<class> directories.
num_train_samples = 3500  # files moved per class
for category in ("neg", "pos"):
    files = os.listdir(train_dir / category)
    # Fixed seed so the same listing always produces the same selection.
    random.Random(1337).shuffle(files)
    train_file = files[:num_train_samples]
    for fname in train_file:
        # The destination must be the full target path. Passing fname as a
        # third positional argument would be taken as shutil.move's
        # copy_function and fail on a cross-device move.
        shutil.move(train_dir / category / fname,
                    train_dir_1 / category / fname)

Making a training dataset again

In [41]:
# Re-read train1/ now that it holds more files, then vectorize it.
# text_vectorization is reused as-is; this cell does not call adapt() again.
train = keras.utils.text_dataset_from_directory(
    directory=train_dir_1,
    batch_size=batch_size,
)


def _vectorize(text, label):
    """Replace the raw text of a batch by its token ids; keep the label."""
    return text_vectorization(text), label


int_train_ds = train.map(_vectorize, num_parallel_calls=4)

Found 7100 files belonging to 2 classes.


Training the last pretrained embedding model with new training dataset :

In [44]:
# Continue training the existing model for 10 more epochs on the enlarged
# training set.
model.fit(
    x=int_train_ds,
    epochs=10,
    validation_data=int_val_ds,
    callbacks=callbacks,
)

Epoch 1/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 621ms/step - accuracy: 0.5844 - loss: 0.6709 - val_accuracy: 0.6898 - val_loss: 0.5861
Epoch 2/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 385ms/step - accuracy: 0.7198 - loss: 0.5662 - val_accuracy: 0.7718 - val_loss: 0.4806
Epoch 3/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 373ms/step - accuracy: 0.7685 - loss: 0.4908 - val_accuracy: 0.7652 - val_loss: 0.5023
Epoch 4/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 470ms/step - accuracy: 0.8033 - loss: 0.4349 - val_accuracy: 0.8094 - val_loss: 0.4181
Epoch 5/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 478ms/step - accuracy: 0.8297 - loss: 0.3894 - val_accuracy: 0.8149 - val_loss: 0.4129
Epoch 6/10
[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 378ms/step - accuracy: 0.8478 - loss: 0.3579 - val_accuracy: 0.8029 - val_loss: 0.4538
Epoch 

<keras.src.callbacks.history.History at 0x2bed0cd0d10>

Testing the model now :

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the single "accuracy" metric, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's Acurracy:", round(test_metrics[1] * 100, 2))

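Increasing the training sample improved validation accuracy substantially (from roughly 0.53 to roughly 0.80 in the training logs above); the test-set accuracy for this run was not recorded.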
    
Increasing training sample again by 7000

In [45]:
# Grow the working training set again: move 3500 more reviews per class
# from train/<class> into the existing train1/<class> directories.
num_train_samples = 3500  # files moved per class
for category in ("neg", "pos"):
    files = os.listdir(train_dir / category)
    # Fixed seed so the same listing always produces the same selection.
    random.Random(1337).shuffle(files)
    train_file = files[:num_train_samples]
    for fname in train_file:
        # The destination must be the full target path. Passing fname as a
        # third positional argument would be taken as shutil.move's
        # copy_function and fail on a cross-device move.
        shutil.move(train_dir / category / fname,
                    train_dir_1 / category / fname)

Reading a new training set:

In [47]:
# Re-read train1/ now that it holds more files, then vectorize it.
# text_vectorization is reused as-is; this cell does not call adapt() again.
train = keras.utils.text_dataset_from_directory(
    directory=train_dir_1,
    batch_size=batch_size,
)


def _vectorize(text, label):
    """Replace the raw text of a batch by its token ids; keep the label."""
    return text_vectorization(text), label


int_train_ds = train.map(_vectorize, num_parallel_calls=4)

Found 14100 files belonging to 2 classes.


Training this model again 

In [48]:
 model.fit(int_train_ds, validation_data=int_val_ds, epochs=10)

Epoch 1/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 445ms/step - accuracy: 0.8671 - loss: 0.3212 - val_accuracy: 0.8352 - val_loss: 0.3775
Epoch 2/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m162s[0m 366ms/step - accuracy: 0.8837 - loss: 0.2852 - val_accuracy: 0.8148 - val_loss: 0.4872
Epoch 3/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 341ms/step - accuracy: 0.8965 - loss: 0.2641 - val_accuracy: 0.8386 - val_loss: 0.3732
Epoch 4/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 329ms/step - accuracy: 0.9093 - loss: 0.2357 - val_accuracy: 0.8429 - val_loss: 0.4187
Epoch 5/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 334ms/step - accuracy: 0.9153 - loss: 0.2159 - val_accuracy: 0.8393 - val_loss: 0.4293
Epoch 6/10
[1m441/441[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 321ms/step - accuracy: 0.9274 - loss: 0.1928 - val_accuracy: 0.8322 - val_loss: 0.4755
Epoc

<keras.src.callbacks.history.History at 0x2beebdb2d10>

Testing this model:

In [None]:
# evaluate() returns the loss followed by the compiled metrics; index 1 is
# the single "accuracy" metric, reported here as a percentage.
test_metrics = model.evaluate(int_test_ds)
print("\n Model's Accuracy:", round(test_metrics[1] * 100, 2))

[1m514/782[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m21s[0m 79ms/step - accuracy: 0.8294 - loss: 0.6172