In [3]:
import pandas as pd
import torch
import transformers
import random
from transformers import AutoTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm  
import os
import torch.optim as optim
from transformers import AdamW

In [1]:
#checking if CUDA is available and printing GPU details
if torch.cuda.is_available():
    print("CUDA is available. GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available.")

CUDA is available. GPU: Tesla V100-SXM2-32GB


In [37]:
#reading the dataset
data = pd.read_csv('../mlm_transfer/ready_for_mlm_final.csv')

In [11]:
#importing tokenizer and model for BERT MLM transfer
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/bert-base-greek-uncased-v1")
model = BertForMaskedLM.from_pretrained("altsoph/bert-base-ancientgreek-uncased")

Some weights of BertForMaskedLM were not initialized from the model checkpoint at altsoph/bert-base-ancientgreek-uncased and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
print(data.head())

   Unnamed: 0                                               text
0           0  δεινοί τινεσ εἰσὶ νουθετεῖν ἑτέρουσ ἀφε...
1           1  οὐκ ἀμφοτέρων ἄρα τῶν καιρῶν διήμαρτον...
2           2  ̔ μὲν ἐπίδειξισ οὐδαμῶσ εὐτυχὴσ οὐδὲ ...
3           3  ̓́στι μὲν οὐ σμικρὸν, ὦ ἄνδρεσ σμυρναῖ...
4           4  νόμοσ ἐστὶ τοῖσ ̔́λλησι παλαιὸσ, οἶμαι ...


In [15]:
df = df[['text']]

In [16]:
df

Unnamed: 0,text
0,δεινοί τινεσ εἰσὶ νουθετεῖν ἑτέρουσ ἀφε...
1,οὐκ ἀμφοτέρων ἄρα τῶν καιρῶν διήμαρτον...
2,̔ μὲν ἐπίδειξισ οὐδαμῶσ εὐτυχὴσ οὐδὲ ...
3,"̓́στι μὲν οὐ σμικρὸν, ὦ ἄνδρεσ σμυρναῖ..."
4,"νόμοσ ἐστὶ τοῖσ ̔́λλησι παλαιὸσ, οἶμαι ..."
...,...
154,̓σαῖοσ δὲ ὁ δημοσθένουσ καθηγησάμενοσ κ...
155,̔ητορική ἐστι δύναμισ τεχνικὴ πιθανοῦ...
156,περὶ δεινάρχου τοῦ ῥήτοροσ οὐδὲν εἰρ...
157,*************** δικανικοῖσ μὲν οὖν ο...


In [17]:
data = df

In [18]:
df

Unnamed: 0,text
0,δεινοί τινεσ εἰσὶ νουθετεῖν ἑτέρουσ ἀφε...
1,οὐκ ἀμφοτέρων ἄρα τῶν καιρῶν διήμαρτον...
2,̔ μὲν ἐπίδειξισ οὐδαμῶσ εὐτυχὴσ οὐδὲ ...
3,"̓́στι μὲν οὐ σμικρὸν, ὦ ἄνδρεσ σμυρναῖ..."
4,"νόμοσ ἐστὶ τοῖσ ̔́λλησι παλαιὸσ, οἶμαι ..."
...,...
154,̓σαῖοσ δὲ ὁ δημοσθένουσ καθηγησάμενοσ κ...
155,̔ητορική ἐστι δύναμισ τεχνικὴ πιθανοῦ...
156,περὶ δεινάρχου τοῦ ῥήτοροσ οὐδὲν εἰρ...
157,*************** δικανικοῖσ μὲν οὖν ο...


In [20]:
#convert the dataframe to a list of strings
df = data['text'].tolist()

In [21]:
#print the number of rows in the dataframe
print(len(df))

159


In [22]:
#tokenize the data and prepare it for model input
inputs = tokenizer(df, return_tensors = 'pt', max_length = 512, truncation = True, padding = 'max_length')

In [23]:
#create the labels by cloning the input IDs
inputs['labels'] = inputs.input_ids.detach().clone()

In [24]:
#create a random array to decide which tokens to mask
rand = torch.rand(inputs.input_ids.shape)

In [25]:
mask_arr = rand < 0.15

In [26]:
inputs.input_ids != 101

tensor([[False,  True,  True,  ...,  True,  True,  True],
        [False,  True,  True,  ...,  True,  True,  True],
        [False,  True,  True,  ...,  True,  True,  True],
        ...,
        [False,  True,  True,  ...,  True,  True,  True],
        [False,  True,  True,  ...,  True,  True,  True],
        [False,  True,  True,  ...,  True,  True,  True]])

In [27]:
#15% of tokens will be masked
mask_arr = (rand < 0.15) * (inputs.input_ids != 101) * (inputs.input_ids != 102)

In [28]:
mask_arr[0].shape

torch.Size([512])

In [29]:
#create a selection array to apply the mask to the input IDs
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(
        torch.flatten(mask_arr[i].nonzero()).tolist()
    )

In [30]:
selection[:5]

[[3,
  5,
  7,
  13,
  20,
  42,
  52,
  53,
  54,
  57,
  70,
  80,
  82,
  87,
  92,
  107,
  120,
  124,
  126,
  134,
  140,
  142,
  145,
  154,
  160,
  164,
  170,
  174,
  179,
  195,
  199,
  217,
  223,
  227,
  246,
  249,
  254,
  257,
  266,
  269,
  277,
  278,
  281,
  291,
  293,
  299,
  304,
  314,
  319,
  322,
  323,
  329,
  341,
  342,
  348,
  357,
  359,
  370,
  380,
  387,
  390,
  391,
  398,
  405,
  409,
  413,
  418,
  424,
  425,
  440,
  445,
  450,
  457,
  460,
  465,
  468,
  474,
  497,
  506,
  508,
  509],
 [4,
  12,
  16,
  28,
  36,
  41,
  50,
  56,
  61,
  62,
  63,
  76,
  84,
  90,
  94,
  109,
  110,
  116,
  118,
  119,
  120,
  130,
  142,
  158,
  165,
  168,
  175,
  180,
  206,
  207,
  219,
  226,
  229,
  236,
  255,
  259,
  264,
  268,
  273,
  288,
  291,
  295,
  302,
  306,
  307,
  308,
  314,
  316,
  323,
  325,
  326,
  329,
  357,
  362,
  366,
  370,
  376,
  384,
  385,
  394,
  410,
  414,
  424,
  428,
  431,
  437,
  45

In [31]:
#apply the mask (replace tokens with the mask token ID 103)
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [32]:
inputs.input_ids

tensor([[  101, 24688,   269,  ...,   103,  4858,   102],
        [  101,  4858,  2361,  ...,   103,   287,   102],
        [  101,   351,   103,  ...,  3258,   279,   102],
        ...,
        [  101,   518, 17327,  ..., 12856,   281,   102],
        [  101,   117,   117,  ...,   575,   273,   102],
        [  101,   294,  2207,  ...,   877,  3436,   102]])

In [33]:
#a custom dataset class to handle the input encodings
class GreekDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [34]:
#create a dataset and dataloader for the inputs
dataset = GreekDataset(inputs)

In [35]:
loader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

In [36]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
epochs = 1
model_path = '../mlm_transfer/model'  #model save path
tokenizer_path = '../mlm_transfer/tokenizer'   #tokenizer save path

In [40]:
optimizer = AdamW(model.parameters(), lr=1e-4) 

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optimizer.zero_grad() 
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()  
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    model.save_pretrained(os.path.join(model_path, f'epoch_{epoch}'))
    tokenizer.save_pretrained(os.path.join(tokenizer_path, f'epoch_{epoch}'))

    print(f'Model and tokenizer saved for epoch {epoch}')


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 10/10 [00:11<00:00,  1.15s/it, loss=4.85]


Model and tokenizer saved for epoch 0


In [41]:
#here goes roberta mlm transfer

In [48]:
from transformers import RobertaTokenizer, RobertaForMaskedLM

tokenizer = RobertaTokenizer.from_pretrained("bowphs/GreBerta")
model = RobertaForMaskedLM.from_pretrained("bowphs/GreBerta")

In [49]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [None]:
epochs = 1
model_path = '../mlm_transfer/model_roberta'  
tokenizer_path = '../mlm_transfer/tokenizer_roberta'  

In [45]:
optimizer = AdamW(model.parameters(), lr=1e-4)  

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optimizer.zero_grad()  
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()  
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    model.save_pretrained(os.path.join(model_path, f'epoch_{epoch}'))
    tokenizer.save_pretrained(os.path.join(tokenizer_path, f'epoch_{epoch}'))

    print(f'Model and tokenizer saved for epoch {epoch}')


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
Epoch 0: 100%|██████████| 10/10 [00:06<00:00,  1.55it/s, loss=1.85]


Model and tokenizer saved for epoch 0
