# Ancient-Greek-Bert Tokenization 
Tokenizing the Ancient Greek works of Homer, Hesiod, and the Homeric Hymns. All texts are written in dactylic hexameter. 

In [51]:
## Imports and Installs 
from transformers import AutoTokenizer, AutoModel
import torch
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split


In [13]:
# Load data 
with open('scraping/parsed_dactylic_hexameter_corpus.json', 'r') as openfile:
 
    # Reading from json file
    parsed_data = json.load(openfile)
 
print(parsed_data)
print(type(parsed_data))

{'Homer': {'Iliad': ['μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος', 'οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,', 'πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν', 'ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν', 'οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,', 'ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε', 'Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.', 'τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;', 'Λητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς', 'νοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,', 'οὕνεκα τὸν Χρύσην ἠτίμασεν ἀρητῆρα', 'Ἀτρεΐδης· ὃ γὰρ ἦλθε θοὰς ἐπὶ νῆας Ἀχαιῶν', 'λυσόμενός τε θύγατρα φέρων τʼ ἀπερείσιʼ ἄποινα,', 'στέμματʼ ἔχων ἐν χερσὶν ἑκηβόλου Ἀπόλλωνος', 'χρυσέῳ ἀνὰ σκήπτρῳ, καὶ λίσσετο πάντας Ἀχαιούς,', 'Ἀτρεΐδα δὲ μάλιστα δύω, κοσμήτορε λαῶν·', 'Ἀτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί,', 'ὑμῖν μὲν θεοὶ δοῖεν Ὀλύμπια δώματʼ ἔχοντες', 'ἐκπέρσαι Πριάμοιο πόλιν, εὖ δʼ οἴκαδʼ ἱκέσθαι·', 'παῖδα δʼ ἐμοὶ λύσαιτε φίλην, τὰ δʼ ἄποινα δέχεσθαι,', 'ἁζόμενοι Διὸς υἱὸν ἑκηβόλον Ἀπόλλωνα.', 'ἔνθʼ ἄλλοι μὲν πάντες ἐπε

In [50]:
## Turn JSON into a dataframe
import pandas as pd
data = []

# Iterate over the nested JSON object
for author, titles in parsed_data.items():
    for title, lines in titles.items():
        for idx, line in enumerate(lines):
            data.append([author, title, idx+1, line])

# Create DataFrame
ancient_greek = pd.DataFrame(data, columns=['author', 'title', 'line_idx', 'line_txt'])

# Display DataFrame
ancient_greek

Unnamed: 0,author,title,line_idx,line_txt
0,Homer,Iliad,1,μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος
1,Homer,Iliad,2,"οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,"
2,Homer,Iliad,3,πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν
3,Homer,Iliad,4,"ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν"
4,Homer,Iliad,5,"οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,"
...,...,...,...,...
32592,Hesiod,Works and Days,827,"ἄλλος δʼ ἀλλοίην αἰνεῖ, παῦροι δὲ ἴσασιν."
32593,Hesiod,Works and Days,828,"ἄλλοτε μητρυιὴ πέλει ἡμέρη, ἄλλοτε μήτηρ."
32594,Hesiod,Works and Days,829,"τάων εὐδαίμων τε καὶ ὄλβιος, ὃς τάδε πάντα"
32595,Hesiod,Works and Days,830,"εἰδὼς ἐργάζηται ἀναίτιος ἀθανάτοισιν,"


In [54]:
## Creating a train test split 
train_df, test_df = train_test_split(ancient_greek, test_size=0.2, random_state=42)

train_text = train_df['line_txt']
train_author = train_df['author']

test_text = test_df['line_txt']
test_author = test_df['author']

print("Train text shape:", train_text.shape)
print("Train label shape: ", train_author.shape)
print()
print("Test text shape:", test_text.shape)
print("Test label shape: ", test_author.shape)

Train text shape: (26077,)
Train label shape:  (26077,)

Test text shape: (6520,)
Test label shape:  (6520,)


In [6]:
tokeniser = AutoTokenizer.from_pretrained("pranaydeeps/Ancient-Greek-BERT")
model = AutoModel.from_pretrained("pranaydeeps/Ancient-Greek-BERT")

In [10]:
input_ids = tokeniser.encode('τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχωροῦν [MASK]')
tokens = tokeniser.convert_ids_to_tokens(input_ids)
idx = tokens.index("[MASK]")
print(idx, tokens)
outputs = model(torch.tensor([input_ids]))[0]
outputs.shape

13 ['[CLS]', 'του', 'βιου', 'του', 'καθ', '΄', 'εαυτους', 'πολλα', 'γινε', '##σθαι', 'συγχ', '##ωρου', '##ν', '[MASK]', '[SEP]']


torch.Size([1, 15, 768])

In [22]:
hymn_1 = parsed_data['Homeric Hymns']['Hymn 1 to Dionysus']
hymn_1_tokenized = tokeniser(hymn_1)

In [41]:
## Testing Tokenization on one document
MAX_SEQUENCE_LENGTH = 512
hymn_1_tokenized = tokeniser(hymn_1, return_tensors='tf', padding=True, truncation=True, max_length=MAX_SEQUENCE_LENGTH)

# Pass the tokenized input to the BERT model
outputs = model(**hymn_1_tokenized)

# Get the output embeddings from BERT
cls_tokens = outputs.last_hidden_state[:, 0, :]
cls_tokens

<tf.Tensor: shape=(21, 768), dtype=float32, numpy=
array([[ 0.66409504, -0.28627765, -0.00620437, ...,  0.32648504,
        -0.23340756, -0.182919  ],
       [ 0.32689637, -0.31059164,  1.1813892 , ...,  0.9536348 ,
        -0.6949148 ,  0.45000824],
       [-0.13619798,  0.02270813, -0.3478313 , ...,  0.67646575,
        -0.3727346 , -0.02015273],
       ...,
       [-0.7647954 , -0.46125937,  0.3838446 , ...,  0.73274493,
        -1.2633125 , -0.42863792],
       [-0.34413442, -0.9495552 ,  0.27038175, ...,  0.18278803,
         0.5519371 , -0.0193481 ],
       [-0.12150065, -1.3934509 ,  1.3613051 , ..., -0.2853232 ,
        -0.24562144,  0.8898896 ]], dtype=float32)>

In [42]:
parsed_data

{'Homer': {'Iliad': ['μῆνιν ἄειδε θεὰ Πηληϊάδεω Ἀχιλῆος',
   'οὐλομένην, ἣ μυρίʼ Ἀχαιοῖς ἄλγεʼ ἔθηκε,',
   'πολλὰς δʼ ἰφθίμους ψυχὰς Ἄϊδι προΐαψεν',
   'ἡρώων, αὐτοὺς δὲ ἑλώρια τεῦχε κύνεσσιν',
   'οἰωνοῖσί τε πᾶσι, Διὸς δʼ ἐτελείετο βουλή,',
   'ἐξ οὗ δὴ τὰ πρῶτα διαστήτην ἐρίσαντε',
   'Ἀτρεΐδης τε ἄναξ ἀνδρῶν καὶ δῖος Ἀχιλλεύς.',
   'τίς τʼ ἄρ σφωε θεῶν ἔριδι ξυνέηκε μάχεσθαι;',
   'Λητοῦς καὶ Διὸς υἱός· ὃ γὰρ βασιλῆϊ χολωθεὶς',
   'νοῦσον ἀνὰ στρατὸν ὄρσε κακήν, ὀλέκοντο δὲ λαοί,',
   'οὕνεκα τὸν Χρύσην ἠτίμασεν ἀρητῆρα',
   'Ἀτρεΐδης· ὃ γὰρ ἦλθε θοὰς ἐπὶ νῆας Ἀχαιῶν',
   'λυσόμενός τε θύγατρα φέρων τʼ ἀπερείσιʼ ἄποινα,',
   'στέμματʼ ἔχων ἐν χερσὶν ἑκηβόλου Ἀπόλλωνος',
   'χρυσέῳ ἀνὰ σκήπτρῳ, καὶ λίσσετο πάντας Ἀχαιούς,',
   'Ἀτρεΐδα δὲ μάλιστα δύω, κοσμήτορε λαῶν·',
   'Ἀτρεΐδαι τε καὶ ἄλλοι ἐϋκνήμιδες Ἀχαιοί,',
   'ὑμῖν μὲν θεοὶ δοῖεν Ὀλύμπια δώματʼ ἔχοντες',
   'ἐκπέρσαι Πριάμοιο πόλιν, εὖ δʼ οἴκαδʼ ἱκέσθαι·',
   'παῖδα δʼ ἐμοὶ λύσαιτε φίλην, τὰ δʼ ἄποινα δέχεσθαι,',
   'ἁζόμεν

### Baseline Ancient-Greek-BERT Classifier 

In [33]:
MAX_SEQUENCE_LENGTH = 512
def cls_baseline_classifier(bert_model,
                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                          hidden_size = 100,
                          dropout=0.3,
                          learning_rate=0.00005):
    """
    Build a simple classification model with BERT. Use the CLS Token output for classification purposes.
    """

    bert_model.trainable = True

    input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)

    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics='accuracy')

    return classification_model

In [39]:
from transformers import AutoTokenizer, TFBertModel

# Load the BERT tokenizer and model
tokeniser = AutoTokenizer.from_pretrained("pranaydeeps/Ancient-Greek-BERT")
model = TFBertModel.from_pretrained("pranaydeeps/Ancient-Greek-BERT")

MAX_SEQUENCE_LENGTH = 512
hymn_1_tokenized = tokeniser(hymn_1, return_tensors='tf', padding=True, truncation=True, max_length=MAX_SEQUENCE_LENGTH)

# Pass the tokenized input to the BERT model
outputs = model(**hymn_1_tokenized)

# Get the output embeddings from BERT
cls_tokens = outputs.last_hidden_state[:, 0, :]


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
cls_model = create_bert_cls_model(bert_model)
cls_model.fit(bert_train_inputs,
              bert_train_labels,
    validation_data=(bert_test_inputs, bert_test_labels),
    batch_size=8,
    epochs=2
)