In [1]:
# import necessary libraries
import numpy as np
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, TimeDistributed
import gensim
from gensim.models import Word2Vec, KeyedVectors
import json

In [2]:
!pip install datasets



In [3]:
# Load the hr500k dataset from the classla repository on the Hugging Face Hub.
# The import lives in this cell because the package is installed by the preceding cell.
from datasets import load_dataset

# hr500k is generated by a builder script downloaded from the Hub. Passing
# trust_remote_code=True states explicitly that running this script is intended; the
# library warns that the flag becomes mandatory in its next major release.
dataset = load_dataset("classla/hr500k", trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.11M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
# Pull the three predefined splits out of the dataset.
# In every split the `tokens` column is the model input and the `xpos_tags` column is the target.
train_data, validation_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

X_train, Y_train = train_data['tokens'], train_data['xpos_tags']
X_valid, Y_valid = validation_data['tokens'], validation_data['xpos_tags']
X_test, Y_test = test_data['tokens'], test_data['xpos_tags']

In [5]:
# Build the word and tag vocabularies, then encode and pad the training split.

# Every sentence is padded or truncated to exactly this many positions
MAX_SEQ_LENGTH = 100

# A Keras Tokenizer learns an item -> integer index mapping from a corpus. One tokenizer is used
# for the words and a second one for the part-of-speech tags. Both are fitted on the union of
# the train, validation and test splits, so every word and tag seen anywhere gets an index.
# NOTE(review): Tokenizer() is created with its defaults here - confirm that default
# normalisation (e.g. lower-casing) is acceptable for the case-bearing tag strings.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train + X_valid + X_test)
xpos_tokenizer = Tokenizer()
xpos_tokenizer.fit_on_texts(Y_train + Y_valid + Y_test)

# Map every training sentence (and its tag sequence) to a list of integer indices
X_train_encoded = word_tokenizer.texts_to_sequences(X_train)
Y_train_xpos_encoded = xpos_tokenizer.texts_to_sequences(Y_train)

# Bring all sequences to MAX_SEQ_LENGTH: shorter ones are padded at the end,
# longer ones lose their tail
X_train_padded = pad_sequences(
    X_train_encoded,
    maxlen=MAX_SEQ_LENGTH,
    padding="post",
    truncating="post",
)
Y_train_xpos_padded = pad_sequences(
    Y_train_xpos_encoded,
    maxlen=MAX_SEQ_LENGTH,
    padding="post",
    truncating="post",
)

In [6]:
# Encode and pad the validation split with the tokenizers fitted earlier,
# then one-hot encode the training and validation targets.

# Integer-encode the validation sentences and their tag sequences
X_valid_encoded = word_tokenizer.texts_to_sequences(X_valid)
Y_valid_xpos_encoded = xpos_tokenizer.texts_to_sequences(Y_valid)

# Pad / truncate at the end so every sequence has MAX_SEQ_LENGTH positions
X_valid_padded = pad_sequences(
    X_valid_encoded,
    maxlen=MAX_SEQ_LENGTH,
    padding="post",
    truncating="post",
)
Y_valid_xpos_padded = pad_sequences(
    Y_valid_xpos_encoded,
    maxlen=MAX_SEQ_LENGTH,
    padding="post",
    truncating="post",
)

# Number of output classes: one per known tag, plus one extra slot for index 0,
# which is the value pad_sequences fills in
NUM_CLASSES_XPOS = len(xpos_tokenizer.word_index) + 1

# Turn the integer tag ids into one-hot vectors for model training
Y_train_xpos_cat = to_categorical(Y_train_xpos_padded, num_classes=NUM_CLASSES_XPOS)
Y_valid_xpos_cat = to_categorical(Y_valid_xpos_padded, num_classes=NUM_CLASSES_XPOS)

In [7]:
# Train Word2Vec embeddings on the training sentences and use them to build the
# initial weight matrix of the Embedding layer.

# Dimensionality of the word vectors; shared by Word2Vec and the embedding matrix
EMBEDDING_SIZE = 300
# One row per known word, plus row 0 which is reserved for padding
VOCABULARY_SIZE = len(word_tokenizer.word_index) + 1

# word_tokenizer was created with the Tokenizer() defaults, which lower-case every token, so the
# keys of word_index are lower-cased. Word2Vec is therefore trained on lower-cased tokens too;
# trained on the raw tokens, a vector learned under a capitalised form would never be
# found by the lookup below and the word would silently keep an all-zero row.
model = Word2Vec(
    sentences=[[token.lower() for token in sentence] for sentence in X_train],
    vector_size=EMBEDDING_SIZE,
    epochs=100,
)

# Save the trained word vectors to a file
model.wv.save('word2vec_model.kv')

# Load the saved word vectors
word_vectors = KeyedVectors.load('word2vec_model.kv')

# Start from an all-zero matrix; rows without a Word2Vec vector stay zero
embedding_weights = np.zeros((VOCABULARY_SIZE, EMBEDDING_SIZE))

# Word -> integer index mapping learned by the tokenizer
word2id = word_tokenizer.word_index

# Copy the Word2Vec vector of every word Word2Vec knows into that word's row.
# Words missing from the Word2Vec vocabulary (e.g. seen only in the validation/test
# splits) are skipped on purpose.
for word, index in word2id.items():
    if word in word_vectors:
        embedding_weights[index, :] = word_vectors[word]

In [8]:
# Define a bidirectional LSTM model for sequence tagging
bidirect_model = Sequential()

# The Embedding layer converts integer word indices into dense vectors of size EMBEDDING_SIZE.
# mask_zero=True marks index 0 as padding: pad_sequences fills with 0 and
# the vocabulary/class sizes reserve that index (len(word_index) + 1). With the mask in place
# the padded timesteps are skipped by the LSTM and left out of the loss and the accuracy;
# without it, the padding positions of each 100-step sequence are trained on and
# scored as if they were real tokens, which inflates the reported accuracy.
bidirect_model.add(Embedding(
    input_dim=VOCABULARY_SIZE,        # the total number of unique words (+1 for padding)
    output_dim=EMBEDDING_SIZE,        # dimensionality of the embedding vectors
    input_length=MAX_SEQ_LENGTH,      # the length of the (padded) input sequences
    weights=[embedding_weights],      # pre-trained Word2Vec embeddings as initial weights
    mask_zero=True,                   # treat index 0 (padding) as a masked timestep
    trainable=True,                   # embeddings are fine-tuned during training
))

# Bidirectional wrapper around an LSTM with 64 units; return_sequences=True makes the
# LSTM return an output for every timestep instead of only the last one
bidirect_model.add(Bidirectional(LSTM(64, return_sequences=True)))

# TimeDistributed applies the same Dense layer to each timestep independently; the Dense layer
# has NUM_CLASSES_XPOS units with softmax activation, i.e. one tag distribution per word
bidirect_model.add(TimeDistributed(Dense(NUM_CLASSES_XPOS, activation='softmax')))

In [9]:
# Configure how the model learns:
#   - optimizer: Adam, an adaptive learning-rate optimisation algorithm
#   - loss: categorical cross-entropy, matching the one-hot encoded multi-class targets
#   - metrics: accuracy ('acc'), computed and displayed during training
bidirect_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [10]:
# Print a summary of the model architecture: one row per layer with its output shape and
# parameter count, followed by the total / trainable / non-trainable parameter counts
bidirect_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          20039400  
                                                                 
 bidirectional (Bidirection  (None, 100, 128)          186880    
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 100, 769)          99201     
 ributed)                                                        
                                                                 
Total params: 20325481 (77.54 MB)
Trainable params: 20325481 (77.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Train the bidirectional model on (X_train_padded, Y_train_xpos_cat) for 10 epochs with a
# batch size of 64. The validation pair (X_valid_padded, Y_valid_xpos_cat) is evaluated after
# each epoch. Training minimises the categorical cross-entropy loss chosen at compile time and
# reports the accuracy metric alongside it. The returned object is kept so the per-epoch
# history can be inspected or saved afterwards.
bidirect_training = bidirect_model.fit(
    x=X_train_padded,
    y=Y_train_xpos_cat,
    validation_data=(X_valid_padded, Y_valid_xpos_cat),
    epochs=10,
    batch_size=64,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [12]:
# Save the trained model to 'bidir_lstm.h5' in the current working directory.
# NOTE(review): the '.h5' suffix presumably selects the legacy HDF5 format, which is likely what
# the saving_api warning in this cell's output refers to - confirm before changing the format,
# since anything that loads this file depends on the current name.
bidirect_model.save('bidir_lstm.h5')

  saving_api.save_model(


In [13]:
# Save the training history (per-epoch loss and metric values) as a JSON file
history_dict = bidirect_training.history

# Write through a context manager so the file is flushed and closed as soon as the dump
# finishes, instead of leaving an open handle behind for the garbage collector
with open('bidir_train_history.json', 'w') as history_file:
    json.dump(history_dict, history_file)