In [3]:
import os
import sys
from pathlib import Path

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import re
from typing import *
from helper import to_sequences, to_padding

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

### 1.Reading the data

In [4]:
data_path = Path("data/got/game_of_thrones.txt")

In [5]:
df = pd.read_csv(data_path, sep='\t', header=None, names=["got_text"])

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7083 entries, 0 to 7082
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   got_text  7083 non-null   object
dtypes: object(1)
memory usage: 55.5+ KB


In [7]:
df.head()

Unnamed: 0,got_text
0,A Song of Ice and Fire
1,A Game of Thrones
2,PROLOGUE
3,"We should start back, Gared urged as the woods..."
4,Do the dead frighten you? Ser Waymar Royce ask...


In [8]:
# Converting each line to a list
sentences = df['got_text'].apply(lambda x: x.lower()).to_list()

In [9]:
sentences[:3]

['a song of ice and fire', 'a game of thrones', 'prologue']

### 2.Cleaning the Data

* All words are converted to Lower case

In [10]:
MAX_LENGTH = 0
lens = []
for each_sentence in sentences:
    temp_len = len(each_sentence)
    lens.append(temp_len)
    if MAX_LENGTH<temp_len:
        MAX_LENGTH = temp_len

print(MAX_LENGTH, np.mean(lens))

1603 224.711986446421


### 4.Preprocessing --> Tokenization, Sequences and Padding

In [11]:
import warnings
warnings.filterwarnings('ignore')

In [12]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
# Vocabulary size of the tokenizer
VOCAB_SIZE = 10000

# Maximum length of clean sentence is  217
# Giving 50 as the max length including the padded sequences
MAX_LENGTH = 225

# Output dimensions of the Embedding layer
EMBEDDING_DIM = 16

# Parameters for padding and OOV tokens
TRUNC_TYPE = 'pre'
PADDING_TYPE = 'pre'
OOV_TOKEN = "<OOV>"

In [21]:
def get_tokenizer(VOCAB_SIZE, OOV_TOKEN, X_data=sentences):
    # Initialize the Tokenizer class
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token=OOV_TOKEN)

    # Generate the word index dictionary
    tokenizer.fit_on_texts(X_data)
    
    return tokenizer

def to_sequences_n_padding(tokenizer,MAX_LENGTH,PADDING_TYPE,TRUNC_TYPE,X_data):
    # Generate and pad the training sequences
    _sequences = tokenizer.texts_to_sequences(X_data)
    _padded = pad_sequences(_sequences, 
                            maxlen=MAX_LENGTH, 
                            padding=PADDING_TYPE, 
                            truncating=TRUNC_TYPE)
    
    return _padded   


def seq_gen(sent_seq):
    input_seq = []
    for line in sent_seq:
        for i in range(1, len(line)):
            input_seq.append(line[:i+1])

    return input_seq

In [15]:
tokenizer = get_tokenizer(VOCAB_SIZE=VOCAB_SIZE,OOV_TOKEN=OOV_TOKEN)

# Define the total words. You add 1 for the index `0` which is just the padding token.
total_words = len(tokenizer.word_index) + 1

print(f'word index dictionary: {tokenizer.word_index}')
print(f'total words: {total_words}')

total words: 12251


In [16]:
sent_seq = to_sequences(tokenizer=tokenizer,X_data=sentences)

In [17]:
sent_seq[:3]

[[5, 1029, 6, 552, 3, 244], [5, 1248, 6, 2171], [7746]]

In [22]:
input_seq = seq_gen(sent_seq)

In [24]:
input_seq[:10]

[[5, 1029],
 [5, 1029, 6],
 [5, 1029, 6, 552],
 [5, 1029, 6, 552, 3],
 [5, 1029, 6, 552, 3, 244],
 [5, 1248],
 [5, 1248, 6],
 [5, 1248, 6, 2171],
 [67, 181],
 [67, 181, 1347]]

In [25]:
padded_seq = to_padding(input_seq, 
                        MAX_LENGTH=MAX_LENGTH,
                        PADDING_TYPE=PADDING_TYPE,
                        TRUNC_TYPE=TRUNC_TYPE
                        )

In [29]:
padded_seq.shape

(289204, 225)

### 5. Dataset Preparation

In [31]:
# Create inputs and label by splitting the last token in the subphrases
xs, labels = padded_seq[:, :-1], padded_seq[:, -1]

In [32]:
# Convert the label into one-hot arrays
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

MemoryError: Unable to allocate 13.2 GiB for an array with shape (289204, 12251) and data type float32

### 6. LSTM +BIDIRECTIONAL

##### Callbacks

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=5,
    verbose=0,
    mode='auto',
    # baseline=None,
    restore_best_weights=True,
    # start_from_epoch=0
)

# Saving the best model and its weights to given path
checkpoint_filepath = 'models/model_checkpoint_LSTM.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    verbose=1,
    # save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    mode='auto'
)

##### Model Training

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build the model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH, name="embedding"),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(30, name="rnn1_30", return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, name="rnn2_50", return_sequences=False)),

    # tf.keras.layers.Flatten(),

    tf.keras.layers.Dense(120, activation='relu', name="dense32"),
    tf.keras.layers.BatchNormalization(name="batch1"),
    tf.keras.layers.Dropout(0.5, name="dropout1"),

    tf.keras.layers.Dense(80, activation='relu', name="dense16"),
    tf.keras.layers.BatchNormalization(name="batch2"),
    tf.keras.layers.Dropout(0.4, name="dropout2"),


    tf.keras.layers.Dense(1, activation='sigmoid', name="last_dense1")
])

# Print the model summary
model.summary()

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
              loss='binary_crossentropy',
              metrics=['binary_accuracy'])

In [None]:
history = model.fit(train_dataset,
          validation_data=test_dataset,
          epochs=30,
          verbose=2,
          callbacks=[early_stopping, 
                     model_checkpoint_callback])

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].plot(history.history['loss'], label="train_loss")
ax[0].plot(history.history['val_loss'], label="val_loss")
ax[0].set_title("SCE loss function")
ax[0].legend()

ax[1].plot(history.history['binary_accuracy'], label="train_acc")
ax[1].plot(history.history['val_binary_accuracy'], label="val_acc")
ax[1].set_title("Accuracy Metric function")
ax[1].legend()
plt.tight_layout()
plt.show()