# AUEB M.Sc. in Data Science (part-time)

### 2024.04 - 2024.06

Exercise 2

**Course**: Text Analytics   
**Authors**:
Anagnos Theodoros (p3352323) -
Michalopoulos Ioannis (p3352314) -
Kafantaris Panagiotis (p3352328) -  
Vigkos Ioannis (p3352326)

**Date**: 2024-04-29

installing libraries

In [None]:
!pip install  tensorflow==2.15.0
!pip install -U fasttext



In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
!pip install conllu



download dataset

In [None]:
!wget https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu -O en_ewt_dev.conllu

--2024-05-29 10:41:25--  https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu [following]
--2024-05-29 10:41:25--  https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1764449 (1.7M) [text/plain]
Saving to: ‘en_ewt_dev.conllu’


2024-05-29 10:41:26 (8.22 MB/s) - ‘en_ewt_dev.conllu’ saved [1764449/1764449]



We tokenize the sentences and build the lists accordingly. We separate words from tags into different variables for every subset.

In [None]:
from conllu import parse
from sklearn.model_selection import train_test_split

def parse_conllu(file_path):
    """Read a CoNLL-U file and hand its full text to ``parse``.

    Args:
        file_path: Path of the UTF-8 encoded CoNLL-U file to read.

    Returns:
        Whatever ``parse`` returns for the text of the file.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return parse(raw_text)


def extract_sentences_and_tags(sentences):
    """Collect the text and the UPOS tag sequence of every sentence.

    A token is kept only when it has a non-empty form and a real tag. Tokens
    whose tag is the CoNLL-U "unspecified" placeholder ``'_'`` are skipped:
    multiword-token range lines (e.g. ``1-2  don't``) carry that placeholder,
    and keeping them would both duplicate the surface text of the words they
    span and introduce ``'_'`` as an extra tag class.

    Args:
        sentences: Iterable of sentences, each an iterable of mapping-like
            tokens that can be indexed with ``'form'`` and ``'upostag'``.

    Returns:
        A pair ``(all_sentences, all_tags)``: ``all_sentences[k]`` is the kept
        forms of sentence ``k`` joined by single spaces, and ``all_tags[k]`` is
        the list of tags of those same tokens, in order.
    """
    all_sentences = []
    all_tags = []
    for sentence in sentences:
        words = []
        tags = []
        for token in sentence:
            form, tag = token['form'], token['upostag']
            # '_' means "no value" in CoNLL-U; it is not a POS tag.
            if form and tag and tag != '_':
                words.append(form)
                tags.append(tag)
        all_sentences.append(' '.join(words))
        all_tags.append(tags)
    return all_sentences, all_tags

# Parse the downloaded dev file, then split it into sentence texts and
# per-sentence tag lists.
sentences = parse_conllu("en_ewt_dev.conllu")
words, tags = extract_sentences_and_tags(sentences)

In [None]:
def is_number(s):
    """Tell whether ``s`` can be converted with ``float``.

    Args:
        s: The candidate value (typically a token string).

    Returns:
        True when ``float(s)`` succeeds, False when it raises ``ValueError``.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

# Imports needed by this cell and by `preprocess` below (`string`, `tqdm`).
import string

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from tqdm.auto import tqdm

# Load spaCy model with the listed components disabled, then add the
# rule-based sentencizer so that `doc.sents` is available.
nlp = spacy.load('en_core_web_sm', disable=["tagger", "parser", "ner", "lemmatizer"])
nlp.add_pipe('sentencizer')

def preprocess(corpus):
    """Tokenize every document with spaCy and keep only the informative tokens.

    A token is dropped when it contains a newline, a tab, ``--`` or ``*``, when
    its lowercase form is a stop word, when it consists only of punctuation, or
    when ``is_number`` accepts it. Kept tokens have double quotes replaced by
    single quotes and are stripped and lowercased.

    NOTE(review): because tokens are removed, the returned token lists are
    generally shorter than the input and are not aligned one-to-one with
    per-token tag lists.

    Args:
        corpus: Iterable of documents. Each document is either a string or a
            sequence of word strings (which is joined with single spaces).

    Returns:
        A list with one list of cleaned token strings per document.
    """
    corpus_tokenized = []
    for doc in tqdm(corpus):
        # Joining a plain string would put a space between every character,
        # so only word sequences are joined.
        text = doc if isinstance(doc, str) else ' '.join(doc)
        tokens = []
        for sent in nlp(text).sents:
            for tok in sent:
                surface = tok.text
                if '\n' in surface or "\t" in surface or "--" in surface or "*" in surface or \
                   surface.lower() in STOP_WORDS or surface in string.punctuation or \
                   all(x in string.punctuation for x in surface) or is_number(surface):
                    continue
                if surface.strip():
                    tokens.append(surface.replace('"', "'").strip().lower())
        corpus_tokenized.append(tokens)
    return corpus_tokenized

## Sliding Window implementation

In [None]:
import numpy as np
from nltk.util import ngrams

def convert_to_window_dataset(data, window_size, pad_symbol='</s>'):
    """Turn tagged sentences into one fixed-size context window per token.

    Every sentence is padded with ``window_size // 2`` copies of ``pad_symbol``
    on each side; the window of a token is the token itself plus that many
    neighbours to its left and right. Each window therefore holds
    ``2 * (window_size // 2) + 1`` items, which equals ``window_size`` only
    for odd sizes.

    Args:
        data: Pair ``(sentences, pos_tags)``; ``sentences`` is a list of token
            lists and ``pos_tags`` the matching sequences of tags.
        window_size: Requested window width.
        pad_symbol: Filler used beyond the sentence boundaries. A token whose
            tag equals this symbol is skipped.

    Returns:
        A pair ``(windowed_sentences, windowed_tags)`` where
        ``windowed_tags[k]`` is the tag of the centre token of
        ``windowed_sentences[k]``.
    """
    sentences, pos_tags = data
    windowed_sentences = []
    windowed_tags = []

    half_window = window_size // 2
    span = 2 * half_window + 1
    padding = [pad_symbol] * half_window  # loop-invariant, built once

    for sentence, tags in zip(sentences, pos_tags):
        padded_sentence = padding + sentence + padding

        # Token `position` of the original sentence sits at index
        # `position + half_window` of the padded one, so its window starts at
        # `position`. The tags are never padded: they are indexed directly.
        for position in range(len(sentence)):
            windowed_tag = tags[position]

            if windowed_tag != pad_symbol:
                windowed_sentences.append(padded_sentence[position: position + span])
                windowed_tags.append(windowed_tag)

    return windowed_sentences, windowed_tags

# Width of the context window handed to convert_to_window_dataset: with 3,
# each token is seen together with one neighbour on each side.
window_size = 3

# Example usage on a toy dataset (kept commented out):
# data = (
#     [['the', 'cat', 'sat', 'on', 'the', 'mat'], ['another', 'sentence']],
#     [['DET', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN'], ['DET', 'NOUN']]
# )

# windowed_sentences, windowed_tags = convert_to_window_dataset(data, window_size)
# print("Windowed Sentences:", windowed_sentences)
# print("Windowed Tags:", windowed_tags)

In [None]:
# File written by the wget cell above.
file_path = "en_ewt_dev.conllu"

parsed_sentences = parse_conllu(file_path)
raw_sentences, tags = extract_sentences_and_tags(parsed_sentences)

# Split the data into train, dev, and test sets (60% / 20% / 20%)
train_sentences, temp_sentences, train_tags, temp_tags = train_test_split(raw_sentences, tags, test_size=0.4, random_state=42)
dev_sentences, test_sentences, dev_tags, test_tags = train_test_split(temp_sentences, temp_tags, test_size=0.5, random_state=42)

# Combine all sentences for creating vocabulary
all_sentences = train_sentences + dev_sentences + test_sentences

# Preprocess the combined sentences to create tokens. Each sentence is a
# space-joined string, so it is split back into its words before being handed
# to preprocess, which expects one sequence of words per document.
all_sentences_tokenized = preprocess([sentence.split(' ') for sentence in all_sentences])

# Flatten the tokenized sentences
flat_all_sentences = [word for sublist in all_sentences_tokenized for word in sublist]

# Create vocabulary for words from the complete dataset to ensure comprehensive coverage.
# Sorting makes the word -> index mapping identical on every run; the iteration
# order of a set of strings is not stable across kernels.
vocab = {word: idx for idx, word in enumerate(sorted(set(flat_all_sentences)))}

# Create vocabulary for tags (sorted for the same reason)
flat_all_tags = [tag for sublist in tags for tag in sublist]
vocab_tags = {tag: idx for idx, tag in enumerate(sorted(set(flat_all_tags)))}


In [None]:
# Preprocess individual sets. The splits are the space-joined sentence strings
# produced by train_test_split above, so each one is split back into words
# before preprocessing.
# NOTE(review): later cells use `train_words` / `dev_words` / `test_words` as
# windowed inputs and `train_tags` as flat per-window labels; the cell that
# builds them (presumably with convert_to_window_dataset) is not present in
# the notebook as shown — TODO restore it.
train_words_tokenized = preprocess([sentence.split(' ') for sentence in train_sentences])
dev_words_tokenized = preprocess([sentence.split(' ') for sentence in dev_sentences])
test_words_tokenized = preprocess([sentence.split(' ') for sentence in test_sentences])

In [None]:
### Build the vocabulary
# flat_train_words = [word for sublist in train_words for word in sublist]
# vocab = {word: idx for idx, word in enumerate(set(flat_train_words))}
# vocab_tags = {tag: idx for idx, tag in enumerate(set(train_tags))}

# All three splits, each sentence split back into its words.
all_words = [sentence.split(' ') for sentence in train_sentences + dev_sentences + test_sentences]
all_words_tokenized = preprocess(all_words)
flat_all_words = [word for sublist in all_words_tokenized for word in sublist]
# Create vocabulary from the complete dataset; sorted so that the indices are
# the same on every run.
vocab = {word: idx for idx, word in enumerate(sorted(set(flat_all_words)))}

# Create vocabulary for tags from the per-sentence tag lists of the whole dataset
flat_all_tags = [tag for sublist in tags for tag in sublist]
vocab_tags = {tag: idx for idx, tag in enumerate(sorted(set(flat_all_tags)))}

NameError: name 'preprocess' is not defined

In [None]:
len(vocab), len(vocab_tags)

(3627, 18)

Get the pre-trained word embeddings (fastText)

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

--2024-05-29 11:12:09--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 54.192.18.54, 54.192.18.51, 54.192.18.50, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|54.192.18.54|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2024-05-29 11:13:01 (84.7 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [None]:
# import gensim.downloader as api

# wv = api.load('word2vec-google-news-300')



Map each word of the vocabulary to its embedding vector to build the embedding matrix. We do the same for every subset. Each word that isn't included in the pre-trained vocabulary (wv) is initialized with zeros.

In [None]:
import numpy as np

# Function to map words to embeddings
def map_vocab_to_embeddings(vocab, wv, embedding_dim=300):
    """Build the embedding matrix for a vocabulary.

    Args:
        vocab: Mapping from word to its row index in the matrix.
        wv: Word-vector lookup supporting ``word in wv`` and ``wv[word]``.
        embedding_dim: Length of every vector.

    Returns:
        Array of shape ``(len(vocab), embedding_dim)``; the row of a word that
        is missing from ``wv`` is all zeros.
    """
    n_rows = len(vocab)
    embedding_matrix = np.zeros((n_rows, embedding_dim))
    for word in vocab:
        row = vocab[word]
        # Words without a pre-trained vector get an all-zero row.
        embedding_matrix[row] = wv[word] if word in wv else np.zeros(embedding_dim)
    return embedding_matrix

# Map vocab to embeddings
import fasttext

# Load the fastText vectors that were downloaded and unpacked above; the model
# object supports both `word in wv` and `wv[word]`.
wv = fasttext.load_model("cc.en.300.bin")
embeddings = map_vocab_to_embeddings(vocab, wv)


In [None]:
embeddings.shape

(3627, 300)

We build the one-hot encoding of the POS tags for each subset in order to use it as the target of our model later.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

# Encode POS tags: tag string -> integer id -> one-hot row
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse_output=False)

# Both encoders are fitted on the training tags only ...
train_tags_encoded = onehot_encoder.fit_transform(
    label_encoder.fit_transform(train_tags).reshape(-1, 1)
)
# ... and merely applied to the development and test tags.
dev_tags_encoded, test_tags_encoded = (
    onehot_encoder.transform(label_encoder.transform(split_tags).reshape(-1, 1))
    for split_tags in (dev_tags, test_tags)
)


In [None]:
# Convert words to indices
def words_to_indices(words, vocab, unk_index=0):
    """Replace every word of every window by its vocabulary index.

    Args:
        words: Iterable of windows, each an iterable of word strings.
        vocab: Mapping from word to integer index.
        unk_index: Index used for words missing from ``vocab``. The default of
            0 is kept for backward compatibility; note that it coincides with
            the index of a real word whenever ``vocab`` assigns index 0.

    Returns:
        ``np.ndarray`` with one row of indices per window.
    """
    return np.array([[vocab.get(word, unk_index) for word in window] for window in words])

# Index matrices of the three splits, all built with the same vocabulary.
train_words_idx, dev_words_idx, test_words_idx = (
    words_to_indices(split_windows, vocab)
    for split_windows in (train_words, dev_words, test_words)
)

We define our bidirectional LSTM model. We start from an arbitrary architecture and later we do hyper-parameter tuning. We set EarlyStopping to patience=10, which means that training stops once the validation loss has not improved for 10 consecutive epochs, and the best weights are restored (restore_best_weights=True).

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input, Embedding, Flatten, Bidirectional, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy
from sklearn.metrics import classification_report, precision_recall_curve, auc, f1_score, recall_score, precision_score

# Bidirectional LSTM tagger over one window of word indices. The window length
# comes from `window_size` and the vector size from the embedding matrix, so
# neither is repeated here as a literal.
# NOTE: convert_to_window_dataset produces windows of 2 * (window_size // 2) + 1
# items, which equals `window_size` only for odd values.
model = Sequential([
        Input(shape=(window_size,), dtype='int32', name='Input_Layer'),
        Embedding(len(vocab), embeddings.shape[1], weights=[embeddings], input_length=window_size, trainable=True),
        Bidirectional(LSTM(256, return_sequences=False, dropout=0.5, recurrent_dropout=0.5)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(len(vocab_tags), activation='softmax')  # Output layer multi-class classification
    ])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=[CategoricalAccuracy()])


# Metrics callback
class Metrics(tf.keras.callbacks.Callback):
    """Keras callback reporting weighted F1, precision and recall after each epoch.

    The scores are computed on the data given at construction time, printed,
    and stored in the epoch ``logs`` under ``val_f1``, ``val_recall`` and
    ``val_precision``.
    """

    def __init__(self, valid_data):
        """Keep the evaluation data.

        Args:
            valid_data: Sequence whose first item is the model input and whose
                second item is the one-hot encoded targets.
        """
        super().__init__()
        self.validation_data = valid_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the model on the stored data and record the results.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metric dictionary of the epoch; updated in place when given.
        """
        # verbose=0 keeps the per-epoch prediction pass from printing its own
        # progress bar into the training log.
        val_predict = np.argmax(self.model.predict(self.validation_data[0], verbose=0), -1)
        val_targ = np.argmax(self.validation_data[1], -1)

        # zero_division=0 scores a class that is never predicted as 0 without
        # emitting a warning on every epoch.
        _val_f1 = f1_score(val_targ, val_predict, average="weighted", zero_division=0)
        _val_recall = recall_score(val_targ, val_predict, average="weighted", zero_division=0)
        _val_precision = precision_score(val_targ, val_predict, average="weighted", zero_division=0)

        # `logs` defaults to None, in which case there is nothing to update.
        if logs is not None:
            logs['val_f1'] = _val_f1
            logs['val_recall'] = _val_recall
            logs['val_precision'] = _val_precision
        print(f" — val_f1: {_val_f1} — val_precision: {_val_precision} — val_recall: {_val_recall}")

# Stop once the validation loss has not improved for 10 epochs and roll the
# model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
    verbose=2,
)

# Train on the windowed training set, validating on the development set.
history = model.fit(
    train_words_idx,
    train_tags_encoded,
    batch_size=256,
    epochs=100,
    shuffle=True,
    validation_data=(dev_words_idx, dev_tags_encoded),
    callbacks=[
        Metrics(valid_data=(dev_words_idx, dev_tags_encoded)),
        early_stopping,
    ],
)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 300)            1088100   
                                                                 
 bidirectional (Bidirection  (None, 512)               1140736   
 al)                                                             
                                                                 
 dense (Dense)               (None, 128)               65664     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 18)                2322      
                                                                 
Total params: 2296822 (8.76 MB)
Trainable params: 2296822 (8.76 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.520409439681463 — val_precision: 0.5533561630377771 — val_recall: 0.5560564484515876
Epoch 3/100
 7/60 [==>...........................] - ETA: 1s - loss: 1.3731 - categorical_accuracy: 0.5787

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.7606067796227609 — val_precision: 0.8108270535292319 — val_recall: 0.7628381027048217
Epoch 4/100
 6/60 [==>...........................] - ETA: 1s - loss: 0.6864 - categorical_accuracy: 0.7917

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.7934596038291534 — val_precision: 0.8406393716412542 — val_recall: 0.7885143081144649
Epoch 5/100
 2/60 [>.............................] - ETA: 4s - loss: 0.5095 - categorical_accuracy: 0.8711

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.8037838408101826 — val_precision: 0.8506480645686444 — val_recall: 0.7951783614268915
Epoch 6/100
 1/60 [..............................] - ETA: 10s - loss: 0.4331 - categorical_accuracy: 0.8789

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.8072601379767056 — val_precision: 0.846796040243656 — val_recall: 0.800078400627205
Epoch 7/100
 5/60 [=>............................] - ETA: 1s - loss: 0.2780 - categorical_accuracy: 0.9125

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.809885162617824 — val_precision: 0.8542421486761392 — val_recall: 0.8012544100352803
Epoch 8/100
 6/60 [==>...........................] - ETA: 1s - loss: 0.2733 - categorical_accuracy: 0.9180

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.8080520559097703 — val_precision: 0.8537178535149493 — val_recall: 0.8008624068992551
Epoch 9/100
 3/60 [>.............................] - ETA: 3s - loss: 0.2755 - categorical_accuracy: 0.9310

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.807708359020003 — val_precision: 0.8508569879913206 — val_recall: 0.7989023912191298
Epoch 10/100
 7/60 [==>...........................] - ETA: 1s - loss: 0.2114 - categorical_accuracy: 0.9364

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.8111193943078374 — val_precision: 0.8557713882549575 — val_recall: 0.8030184241473932
Epoch 11/100
 6/60 [==>...........................] - ETA: 1s - loss: 0.1825 - categorical_accuracy: 0.9382

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.8102085481522047 — val_precision: 0.8546330959222207 — val_recall: 0.8018424147393179
Epoch 12/100
 6/60 [==>...........................] - ETA: 1s - loss: 0.1763 - categorical_accuracy: 0.9447

  _warn_prf(average, modifier, msg_start, len(result))


 — val_f1: 0.8110326620615863 — val_precision: 0.8551489636426974 — val_recall: 0.8020384163073304
Epoch 13/100
 — val_f1: 0.8116437455080099 — val_precision: 0.8551809844949367 — val_recall: 0.8016464131713054
Epoch 14/100
 — val_f1: 0.8129033779942265 — val_precision: 0.8598117096371681 — val_recall: 0.8034104272834183
Restoring model weights from the end of the best epoch: 4.
Epoch 14: early stopping


## Hyper-parameter tuning

In [None]:
%%capture
!pip install -U keras-tuner

In [None]:
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import GRU
from keras_tuner import RandomSearch, Objective

def build_tunable_model(hp):
    """Build and compile one candidate tagger for the hyper-parameter search.

    Searched values: the number of stacked bidirectional recurrent layers
    (1-5), the cell type shared by all layers (LSTM or GRU), per-layer units,
    dropout and recurrent dropout, and the Adam learning rate.

    Args:
        hp: The tuner's hyper-parameter container.

    Returns:
        The compiled model.
    """
    num_layers = hp.Int('num_layers', 1, 5)
    rnn_cls = LSTM if hp.Choice('rnn_type', ['LSTM', 'GRU']) == 'LSTM' else GRU

    model = Sequential()
    model.add(Embedding(len(vocab), 300, weights=[embeddings], input_length=3, trainable=True))
    for i in range(num_layers):
        recurrent_layer = rnn_cls(
            units=hp.Int(f'units_{i}', min_value=128, max_value=512, step=64),
            # Only the last recurrent layer collapses the sequence.
            return_sequences=i < num_layers - 1,
            dropout=hp.Choice(f'dropout_{i}', values=[0.3, 0.5, 0.7]),
            recurrent_dropout=hp.Choice(f'recurrent_dropout_{i}', values=[0.3, 0.5, 0.7]),
            kernel_regularizer=l2(0.01),
        )
        model.add(Bidirectional(recurrent_layer))
    model.add(Dense(len(vocab_tags), activation='softmax'))

    optimizer = Adam(hp.Choice('learning_rate', values=[1e-3, 1e-4]))
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy', metrics=[CategoricalAccuracy()])
    return model

# Instantiate the tuner: random search over build_tunable_model, maximizing
# validation categorical accuracy; 10 trials with one execution each, with
# results stored under KT_dir/KT_tuning.
tuner = RandomSearch(build_tunable_model, objective=Objective("val_categorical_accuracy", direction="max"),
                     max_trials=10, executions_per_trial=1, directory='KT_dir', project_name='KT_tuning')

tuner.search_space_summary()

# NOTE: this rebinds `early_stopping` from the training cell above, now with
# patience=5 instead of 10.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=2)

# Search for the best hyperparameters
# NOTE(review): no batch_size is passed here, whereas the baseline fit above
# uses batch_size=256 — confirm this is intended.
tuner.search(train_words_idx, train_tags_encoded, epochs=80, validation_data=(dev_words_idx, dev_tags_encoded),
             callbacks=[early_stopping])

# Get the best model and report its loss and accuracy on the test set
best_model = tuner.get_best_models(num_models=1)[0]
loss, accuracy = best_model.evaluate(test_words_idx, test_tags_encoded)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Persist the best model to disk
best_model.save("biRNN_model.keras")

Trial 8 Complete [00h 05m 48s]
val_categorical_accuracy: 0.7842022776603699

Best val_categorical_accuracy So Far: 0.803214430809021
Total elapsed time: 01h 03m 40s

Search: Running Trial #9

Value             |Best Value So Far |Hyperparameter
4                 |1                 |num_layers
LSTM              |GRU               |rnn_type
192               |448               |units_0
0.3               |0.5               |dropout_0
0.3               |0.7               |recurrent_dropout_0
0.001             |0.001             |learning_rate
512               |128               |units_1
0.7               |0.5               |dropout_1
0.5               |0.7               |recurrent_dropout_1
384               |512               |units_2
0.5               |0.5               |dropout_2
0.5               |0.3               |recurrent_dropout_2
448               |None              |units_3
0.7               |None              |dropout_3
0.5               |None              |recurrent_dropout_3



Epoch 1/80
Epoch 2/80
 26/479 [>.............................] - ETA: 41s - loss: 2.5384 - categorical_accuracy: 0.1839

KeyboardInterrupt: 

In [None]:
# Plot the loss and accuracy curves of the baseline training run
import matplotlib.pyplot as plt

# (figure title, quantity name, training history key, validation history key)
curve_specs = (
    ('Loss Curves', 'Loss', 'loss', 'val_loss'),
    ('Accuracy Curves', 'Accuracy', 'categorical_accuracy', 'val_categorical_accuracy'),
)

for title, quantity, train_key, val_key in curve_specs:
    plt.figure()
    plt.plot(history.history[train_key], label=f'Training {quantity}')
    plt.plot(history.history[val_key], label=f'Validation {quantity}')
    plt.legend()
    plt.title(title)
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.show()

In [None]:
# Retrieve the best hyperparameters found by the tuner (the top-ranked set)
# and print their values.
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"Best hyperparameters: {best_hyperparameters.values}")

In [None]:
# Baseline: Most frequent tag
from collections import Counter

# One counting pass over the training tags; also works when the tags are held
# in a sequence type without a .count method.
most_frequent_tag = Counter(train_tags).most_common(1)[0][0]
# The baseline predicts that single tag for every test item.
baseline_predictions = [most_frequent_tag] * len(test_tags)
baseline_predictions_encoded = label_encoder.transform(baseline_predictions)

# Calculate classification report
def evaluate_model(predictions, true_tags, labels):
    """Print a per-class precision / recall / F1 report.

    The full list of class ids is passed explicitly, so the report has one row
    per entry of ``labels`` even when a class occurs in neither
    ``predictions`` nor ``true_tags``. Without it, the number of
    ``target_names`` would not match the classes found in the data and the
    report would raise. Classes with no predicted or true samples score 0
    without a warning.

    Args:
        predictions: Predicted class ids.
        true_tags: Gold class ids.
        labels: Class names; entry ``k`` names class id ``k``. Assumes the ids
            are the integers ``0 .. len(labels) - 1``, as they are at the call
            sites in this cell (argmax of one-hot rows and
            ``label_encoder.transform`` output).
    """
    class_ids = list(range(len(labels)))
    print(classification_report(true_tags, predictions, labels=class_ids,
                                target_names=labels, zero_division=0))

# Evaluate on test set
# NOTE(review): all three reports below score `model` (the hand-built
# network), not `best_model` from the tuner — confirm this is intended.
# Predicted and gold class ids are both obtained with argmax over the last axis.
test_predictions = np.argmax(model.predict(test_words_idx), axis=-1)
test_true = np.argmax(test_tags_encoded, axis=-1)

print("Test Set Classification Report:")
evaluate_model(test_predictions, test_true, label_encoder.classes_)

# Evaluate on development set
dev_predictions = np.argmax(model.predict(dev_words_idx), axis=-1)
dev_true = np.argmax(dev_tags_encoded, axis=-1)

print("Development Set Classification Report:")
evaluate_model(dev_predictions, dev_true, label_encoder.classes_)

# Evaluate on training set
train_predictions = np.argmax(model.predict(train_words_idx), axis=-1)
train_true = np.argmax(train_tags_encoded, axis=-1)

print("Training Set Classification Report:")
evaluate_model(train_predictions, train_true, label_encoder.classes_)

# Evaluate baseline: the constant most-frequent-tag predictions are scored
# against the gold test ids.
baseline_true = test_true
print("Baseline Classification Report:")
evaluate_model(baseline_predictions_encoded, baseline_true, label_encoder.classes_)

# Describe the methods and datasets
# Sentence counts come from the sentence-level splits. The windowed data holds
# one window per token, so dividing its length by the window width does not
# give a number of sentences.
num_train_sentences = len(train_sentences)
num_dev_sentences = len(dev_sentences)
num_test_sentences = len(test_sentences)
# One window per token, so the number of windows is the number of words.
num_train_words = len(train_words)
num_dev_words = len(dev_words)
num_test_words = len(test_words)
vocab_size = len(vocab)

print(f"Number of training sentences: {num_train_sentences}")
print(f"Number of development sentences: {num_dev_sentences}")
print(f"Number of test sentences: {num_test_sentences}")
print(f"Number of training words: {num_train_words}")
print(f"Number of development words: {num_dev_words}")
print(f"Number of test words: {num_test_words}")
print(f"Vocabulary size: {vocab_size}")

example