In [1]:
!pip install transformers
!pip install torch
!pip install datasets

[0m

In [2]:
import torch
import pandas as pd
import os
import datasets
import math

In [3]:
device = torch.device("cuda")

In [None]:
from transformers import BertTokenizer, BertForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForMaskedLM


model_name = 'pranaydeeps/Ancient-Greek-BERT'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)


In [53]:
# load the masked and original sentence files
masked_file_path = '/kaggle/working/masked.txt'
original_file_path = '/kaggle/working/orignal.txt'

# create datasets from the files
masked_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=masked_file_path,
    block_size=128
)

original_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=original_file_path,
    block_size=128
)

# create a data collator to batch the data
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)






In [61]:
# define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    logging_steps=20,
    save_steps=1000,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='steps',
    eval_steps=300
)

# define the trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=masked_dataset,
    eval_dataset=original_dataset
)

trainer.train()



Step,Training Loss,Validation Loss
300,1.9989,1.885498
600,1.8852,1.828121
900,1.7936,1.793792
1200,1.7348,1.774593
1500,1.7171,1.705026
1800,1.7203,1.71323
2100,1.6668,1.703594
2400,1.7149,1.685495
2700,1.7381,1.708302
3000,1.6911,1.637435


TrainOutput(global_step=3140, training_loss=1.7858183283714733, metrics={'train_runtime': 2896.3678, 'train_samples_per_second': 69.052, 'train_steps_per_second': 1.084, 'total_flos': 1.1880888333122304e+16, 'train_loss': 1.7858183283714733, 'epoch': 20.0})

In [62]:
model.save_pretrained('/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1')
tokenizer.save_pretrained('/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1')

('/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1/tokenizer_config.json',
 '/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1/special_tokens_map.json',
 '/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1/vocab.txt',
 '/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1/added_tokens.json',
 '/kaggle/working/fine-tuned-pranaydeeps-ancient-greek-v1/tokenizer.json')

In [63]:
eval_results = trainer.evaluate(original_dataset)

'''
A lower perplexity score means a better language model, 

'''

print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 5.12


In [64]:
import gc
torch.cuda.empty_cache()
gc.collect()

1445

In [67]:

input = [
    "ὅστις μέν σύ ὦ [MASK] Ἀθηναῖος πάσχω ὑπό ὁ ἐμός κατήγορος οὐ οἶδα",                                                                                                                 
    "ἐγώ δέ οὖν [MASK] αὐτός ὑπό αὐτός ὀλίγος ἐμαυτοῦ ἐπιλανθάνομαι οὕτως πιθανός λέγω",                                                                                                                 
    "καίτοι ἀληθής γε ὡς ἔπος εἶπον οὐδείς [MASK]",                                                                                                                      
    "μάλιστα δέ αὐτός εἷς [MASK] ὁ πολύς ὅς ψεύδω οὗτος ἐν ὅς λέγω ὡς χράω σύ εὐλαβέομαι μή ὑπό ἐγώ ἐξαπατάω ὡς δεινός εἰμί λέγω",                                                                                                     
    "ὁ γάρ μή αἰσχύνω ὅστις αὐτίκα ὑπό ἐγώ ἐξελέγχω ἔργον [MASK] μηδέ ὁπωσοῦν φαίνω δεινός λέγω οὗτος ἐγώ δοκέω αὐτός ἀναίσχυντος εἰμί εἰ μή ἄρα δεινός καλέω οὗτος λέγω ὁ ἀληθής λέγω",
    ]
y_hat = [
    "ὅστις μέν σύ ὦ ἀνήρ Ἀθηναῖος πάσχω ὑπό ὁ ἐμός κατήγορος οὐ οἶδα",                                                                                                                 
    "ἐγώ δέ οὖν καί αὐτός ὑπό αὐτός ὀλίγος ἐμαυτοῦ ἐπιλανθάνομαι οὕτως πιθανός λέγω",                                                                                                                 
    "καίτοι ἀληθής γε ὡς ἔπος εἶπον οὐδείς ἐρῶ",                                                                                                                      
    "μάλιστα δέ αὐτός εἷς θαυμάζω ὁ πολύς ὅς ψεύδω οὗτος ἐν ὅς λέγω ὡς χράω σύ εὐλαβέομαι μή ὑπό ἐγώ ἐξαπατάω ὡς δεινός εἰμί λέγω",                                                                                                     
    "ὁ γάρ μή αἰσχύνω ὅστις αὐτίκα ὑπό ἐγώ ἐξελέγχω ἔργον ἐπειδάν μηδέ ὁπωσοῦν φαίνω δεινός λέγω οὗτος ἐγώ δοκέω αὐτός ἀναίσχυντος εἰμί εἰ μή ἄρα δεινός καλέω οὗτος λέγω ὁ ἀληθής λέγω",
    ]

In [None]:
# model.to('cuda')

In [30]:
# extract predictions for each masked sentence
predictions = []

for sentence in input:
    # tokenize the sentence and find the index of the masked token
    tokenized_text = tokenizer.tokenize(sentence)
    masked_index = tokenized_text.index("[MASK]")

    # convert the tokenized sentence to input ids and create a tensor of input ids
    input_ids = tokenizer.convert_tokens_to_ids(tokenized_text)
    input_ids = torch.tensor([input_ids])

    # use the model to make predictions on the masked token
    with torch.no_grad():
        outputs = model(input_ids.to(device))
        predictions.append(outputs[0][0, masked_index].topk(k=1))
      

# convert the predicted token ids to tokens
predicted_tokens = [tokenizer.convert_ids_to_tokens(pred.indices.tolist()) for pred in predictions]

# create a dataframe with the context sentences, masked sentences, original sentences, and predicted tokens
output = pd.DataFrame({
    'context_sentence': [' '.join(sentence.split()[:-1]) for sentence in y_hat],
    'masked_sentence': input,
    'original_sentence': y_hat,
    'predicted_tokens': predicted_tokens,
})

# print the output dataframe
output.head(5)


Unnamed: 0,context_sentence,masked_sentence,original_sentence,predicted_tokens
0,ὅστις μέν σύ ὦ ἀνήρ Ἀθηναῖος πάσχω ὑπό ὁ ἐμός ...,ὅστις μέν σύ ὦ [MASK] Ἀθηναῖος πάσχω ὑπό ὁ ἐμό...,ὅστις μέν σύ ὦ ἀνήρ Ἀθηναῖος πάσχω ὑπό ὁ ἐμός ...,[σωκρατης]
1,ἐγώ δέ οὖν καί αὐτός ὑπό αὐτός ὀλίγος ἐμαυτοῦ ...,ἐγώ δέ οὖν [MASK] αὐτός ὑπό αὐτός ὀλίγος ἐμαυτ...,ἐγώ δέ οὖν καί αὐτός ὑπό αὐτός ὀλίγος ἐμαυτοῦ ...,[και]
2,καίτοι ἀληθής γε ὡς ἔπος εἶπον οὐδείς,καίτοι ἀληθής γε ὡς ἔπος εἶπον οὐδείς [MASK],καίτοι ἀληθής γε ὡς ἔπος εἶπον οὐδείς ἐρῶ,[ουδεις]
3,μάλιστα δέ αὐτός εἷς θαυμάζω ὁ πολύς ὅς ψεύδω ...,μάλιστα δέ αὐτός εἷς [MASK] ὁ πολύς ὅς ψεύδω ο...,μάλιστα δέ αὐτός εἷς θαυμάζω ὁ πολύς ὅς ψεύδω ...,[γε]
4,ὁ γάρ μή αἰσχύνω ὅστις αὐτίκα ὑπό ἐγώ ἐξελέγχω...,ὁ γάρ μή αἰσχύνω ὅστις αὐτίκα ὑπό ἐγώ ἐξελέγχω...,ὁ γάρ μή αἰσχύνω ὅστις αὐτίκα ὑπό ἐγώ ἐξελέγχω...,[λεγω]


In [31]:
torch.save(model.state_dict(), "bert.pt")