In [1]:
# Shell escape: installs the `transformers` package (imported in the next cell);
# `--quiet` reduces pip's output.
! pip install transformers --quiet

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel, pipeline


In [23]:
# Read the original text and its masked counterpart as lists of lines.
# The encoding is given explicitly so that decoding the text does not depend on
# the platform's default encoding.
with open('/content/Plato (0059) - Timaeus (031)_.txt', 'r', encoding='utf-8') as f:
    original = f.readlines()
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because later cells iterate over it.
with open('/content/Plato (0059) - Timaeus (031)_mask.txt', 'r', encoding='utf-8') as f:
    input = f.readlines()

In [None]:
col = ['mask_value',
       'predicted_mask_1', 'score_prediction_1', 'cosine_similarity_1',
       'predicted_mask_2', 'score_prediction_2', 'cosine_similarity_2',
       'predicted_mask_3', 'score_prediction_3', 'cosine_similarity_3',
       ]

# Characters that end the masked word in the original text. Whitespace such as
# the newline kept by readlines() is included so that a word at the end of a
# line is not stored with a trailing line break.
separators = ' .,;:\n\r\t'

# Collect the rows first and build the frame once: DataFrame.append no longer
# exists in pandas 2.x, and growing a frame row by row is quadratic anyway.
row_ids = []
mask_values = []

for i, sentence in enumerate(input):
    # Position of the [MASK] token in the masked line.
    mask_pos = sentence.find('[MASK]')
    if mask_pos == -1 or i >= len(original):
        # No mask on this line (or no matching original line): report and skip.
        print(sentence)
        continue

    # The true word starts at the same offset in the original line and runs up
    # to the first separator (or to the end of the line if there is none).
    tail = original[i][mask_pos:]
    end = next((k for k, char in enumerate(tail) if char in separators), len(tail))

    row_ids.append(i)
    mask_values.append(tail[:end])

# Rows are indexed by their line number so that a later `df1.at[i, ...]`, with
# `i` taken from enumerate(input), addresses the row built from that same line
# even when some lines were skipped. Object dtype lets the prediction columns
# hold both strings and floats.
df1 = pd.DataFrame(index=pd.Index(row_ids, dtype='int64'), columns=col, dtype=object)
df1['mask_value'] = mask_values
    
  

In [None]:
# Remove pandas' limits on displayed rows and columns, then show the frame.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

df1

In [6]:
# Candidate checkpoint identifiers; `model_name` selects the one that the
# following cells pass to `from_pretrained` and to the fill-mask pipeline.
ag_model, pr_model, my_model, mg_model = (
    'pranaydeeps/Ancient-Greek-BERT',
    'Sonnenblume/bert-base-uncased-ancient-greek-v4',
    'Sonnenblume/bert-base-uncased-ancient-greek-v3',
    'nlpaueb/bert-base-greek-uncased-v1',
)

# Checkpoint used for this run.
model_name = my_model

In [None]:
# NOTE: calculate_cosine_similarity (used below) is defined in a later cell of
# this notebook; that cell has to be executed before this one, otherwise every
# iteration fails with a NameError that the `except` below only prints.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encoder passed to calculate_cosine_similarity to embed the true and the
# predicted words.
model = AutoModel.from_pretrained(model_name)
model.to(device)

# Fill-mask pipeline that produces the predictions. It receives the same device
# as the encoder so that prediction also runs on the GPU when one is available.
p = pipeline(
    "fill-mask",
    model=model_name,
    tokenizer=tokenizer,
    device=device,
)

# Ranks 1..3 map to the column suffixes _1.._3 of df1.
ranks = (1, 2, 3)

for i, sentence in enumerate(input):

    # Progress report every 50 lines.
    if i % 50 == 0:
        print(f'{i}/{len(input)}')

    try:
        pred = p(sentence)

        # Store tokens and scores first so they are kept even if the cosine
        # similarity computation below fails for this line.
        for rank in ranks:
            df1.at[i, f'predicted_mask_{rank}'] = pred[rank - 1]["token_str"]
        for rank in ranks:
            df1.at[i, f'score_prediction_{rank}'] = pred[rank - 1]["score"]

        # Similarity between the true masked word and each predicted token.
        for rank in ranks:
            df1.at[i, f'cosine_similarity_{rank}'] = calculate_cosine_similarity(
                model, tokenizer, df1.at[i, 'mask_value'], pred[rank - 1]["token_str"], device=device)

    except Exception as e:
        # Best effort: report the failing line and carry on with the next one.
        print(f'line {i}: {e}')

In [30]:
# Order the rows by the score of the top prediction, highest first.
df1_sorted = df1.sort_values(by=['score_prediction_1'], ascending=[False])

# `model_name` may contain a "/" (e.g. "owner/model"), which makes the first
# part of the file name a directory. Create it so that to_csv does not fail on
# a missing directory; the output path itself is unchanged.
output_path = Path(f'{model_name}_Plato (0059) - Timaeus (031).csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df1_sorted.to_csv(output_path)

# Last expression of the cell, so the sorted frame is displayed.
df1_sorted

In [8]:
def remove_subword_prefix(subword):
    """Format a token so it can be concatenated onto preceding text.

    A token starting with "##" is returned without those two characters; any
    other token is returned with a single leading space.
    """
    is_continuation = subword.startswith("##")
    return subword[2:] if is_continuation else " " + subword

In [9]:
def get_predictions(tokenizer, model, sentence):
    """Return the three most likely fillers for the masked token in ``sentence``.

    Args:
        tokenizer: Object providing ``convert_tokens_to_ids`` and
            ``convert_ids_to_tokens``; it is also passed to ``get_tokens``.
            Its ``mask_token`` attribute is used when set, otherwise "[MASK]".
        model: Callable model whose first output, indexed ``[0, masked_index]``,
            is treated as one score per vocabulary entry.
            NOTE(review): this presumably requires a model with a masked-LM
            head rather than a bare encoder - confirm against the caller.
        sentence: Text containing the mask token.

    Returns:
        A tuple ``(predicted_tokens, predicted_token_probs)`` of two lists with
        three entries each, ordered from most to least likely. Tokens are
        formatted by ``remove_subword_prefix``; probabilities come from a
        softmax over all scores at the masked position.

    Raises:
        ValueError: If the tokenized sentence contains no mask token.
    """
    # Tokenize the sentence and locate the masked token.
    # NOTE(review): no special tokens are added here; if the model expects
    # them, the sentence presumably has to contain them already - confirm.
    tokenized_text = get_tokens(tokenizer, sentence)
    mask_token = getattr(tokenizer, 'mask_token', None) or '[MASK]'
    if mask_token not in tokenized_text:
        raise ValueError(f'no {mask_token} token found in sentence: {sentence!r}')
    masked_index = tokenized_text.index(mask_token)

    # Run on the GPU when available; the model is moved to the same device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokenized_text)]).to(device)

    with torch.no_grad():
        outputs = model(input_ids)
        mask_scores = outputs[0][0, masked_index]
        # Normalise over ALL scores before picking the top three. Applying the
        # softmax to the three largest logits only would renormalise them among
        # themselves and overstate every probability.
        probabilities = torch.softmax(mask_scores, dim=-1)
        top = probabilities.topk(k=3, sorted=True)

    predicted_token_indices = top.indices.tolist()
    predicted_token_probs = top.values.tolist()

    # Convert ids back to tokens and format them for concatenation.
    predicted_tokens = [remove_subword_prefix(tokenizer.convert_ids_to_tokens(pred)) for pred in predicted_token_indices]

    return predicted_tokens, predicted_token_probs


In [10]:
def get_tokens(tokenizer, sentence):
    """Return whatever ``tokenizer.tokenize`` produces for ``sentence``."""
    tokens = tokenizer.tokenize(sentence)
    return tokens

In [11]:
def calculate_embeddings(tokens, tokenizer, model, device=torch.device('cpu')):
    """Embed a token sequence with ``model`` and return the first position's vector.

    Args:
        tokens: Non-empty list of token strings.
        tokenizer: Object providing ``convert_tokens_to_ids``.
        model: Callable taking ``input_ids`` and ``attention_mask`` and returning
            an object with a ``last_hidden_state`` attribute.
        device: Device the input tensors are created on; the model is expected
            to be on the same device already (it is not moved here).

    Returns:
        A one-element list holding ``last_hidden_state[:, 0, :]``, i.e. the
        hidden state at sequence position 0 for the single input sequence.

    Raises:
        ValueError: If ``tokens`` is empty, since there is no position 0 to read.
    """
    if not tokens:
        raise ValueError('cannot compute an embedding for an empty token list')

    # Attend to every token except literal "[PAD]" entries.
    attention_mask = torch.tensor([[0 if token == '[PAD]' else 1 for token in tokens]]).to(device)
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)], dtype=torch.long).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Hidden state of the first token in the sequence. No special tokens are
    # added in this function, so this is the first entry of `tokens` itself.
    hidden_state = outputs.last_hidden_state[:, 0, :]

    return [hidden_state]

In [12]:
def calculate_cosine_similarity(model, tokenizer, original, predicted, device=torch.device('cpu')):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are tokenized with ``get_tokens`` and embedded with
    ``calculate_embeddings``. The resulting tensors are concatenated, moved to
    the CPU and converted to NumPy arrays before being compared with
    scikit-learn's ``cosine_similarity``.

    Args:
        model: Model forwarded to ``calculate_embeddings``.
        tokenizer: Tokenizer forwarded to ``get_tokens`` and ``calculate_embeddings``.
        original: First text (the reference).
        predicted: Second text (the candidate).
        device: Device forwarded to ``calculate_embeddings``.

    Returns:
        Element ``[0, 0]`` of the pairwise cosine-similarity matrix.
    """
    texts = (original, predicted)

    # Tokenize both texts first, then embed them in the same order.
    token_lists = [get_tokens(tokenizer, text) for text in texts]
    embedding_lists = [
        calculate_embeddings(token_list, tokenizer, model, device=device)
        for token_list in token_lists
    ]

    # scikit-learn works on NumPy arrays, so bring the tensors back to the CPU.
    reference_array, candidate_array = (
        torch.cat(embedding_list).to('cpu').numpy()
        for embedding_list in embedding_lists
    )

    similarity_matrix = cosine_similarity(reference_array, candidate_array)
    return similarity_matrix[0, 0]