## Model analysis


https://huggingface.co/PlanTL-GOB-ES/roberta-base-ca


In [2]:
import numpy as np
import pandas as pd

import json
from tqdm.notebook import tqdm
import random
import pickle
from typing import List

from transformers import AutoTokenizer, pipeline


In [2]:
file_path = "frases.pkl"
with open(file_path, "rb") as f:
    sentences = pickle.load(f)

print(len(sentences))


65746


In [26]:
# Load the tokenizer and pipeline from huggingface

model_name = "PlanTL-GOB-ES/roberta-base-ca"

tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model_name)
unmasker = pipeline("fill-mask", model=model_name, tokenizer=model_name)


In [27]:
# A simple exploration on what the main functions do

random.seed(42)
[text] = random.sample(sentences, 1)

print(text)
print()

encoded = tokenizer.encode(text)
print(encoded)
print()

decoded = tokenizer.decode(encoded)
print(decoded)


Es tractava d’una supernova, l’explosió d’una estrella massiva els darrers instants de la seva vida.

[0, 546, 10903, 260, 720, 251, 590, 1986, 15022, 15, 265, 720, 251, 21171, 260, 720, 251, 590, 11353, 13736, 338, 3242, 22173, 263, 280, 497, 1096, 17, 2]

<s> Es tractava d’una supernova, l’explosió d’una estrella massiva els darrers instants de la seva vida.</s>


In [6]:
MASK_ID = tokenizer.convert_tokens_to_ids("<mask>")
MASK_ID


51999

In [42]:
# Prediction helper functions
def get_masked_list(ids):
    tokenized_list = [ids[:i] + [MASK_ID] + ids[i + 1 :] for i in range(len(ids))]
    masked_list = [tokenizer.decode(i) for i in tokenized_list]
    return masked_list


def predict_masked(masked_sentence):
    unmasked = unmasker(masked_sentence)

    unmasked_words = [pred["token_str"].strip() for pred in unmasked]
    unmasked_scores = [pred["score"] for pred in unmasked]
    # [{"words": [pred1, pred2], "scores": [score1, score2]}, ...]
    return {"words": unmasked_words, "scores": unmasked_scores}


In [43]:
# Metrics helper functions
def ndcg(item, pred_items: list) -> int:
    if item in pred_items:
        index = pred_items.index(item)
        return np.reciprocal(np.log2(index + 2))
    return 0


def get_hits_dict(original: List[str], predictions: List[List[str]]) -> dict:
    TOPS = [1, 3, 5]
    tops_dict = {}
    for top in TOPS:
        top_result = [
            word in preds[:top] for (word, preds) in zip(original, predictions)
        ]
        tops_dict[top] = np.asarray(top_result).mean()
    return tops_dict


def get_metrics(ids, predictions):
    confidence = np.array([pred["scores"][0] for pred in predictions]).mean()

    decoded = [tokenizer.decode(i).strip() for i in ids]
    pred_words = [pred["words"] for pred in predictions]

    ndcg_score = np.array(list(map(ndcg, decoded, pred_words))).mean()
    hits_dict = get_hits_dict(decoded, pred_words)

    return {
        "conf": confidence,
        "ndcg_score": ndcg_score,
        "hits": hits_dict,
    }


In [44]:
def sentence_prediction(text: str):
    ids = tokenizer.encode(text)
    masked_list = get_masked_list(ids)
    predictions = [predict_masked(masked) for masked in masked_list]

    metrics = get_metrics(ids, predictions)

    return metrics


In [25]:
# Difference between raw and processed data

raw_sentence = "Es tractava d’una supernova, l’explosió d’una estrella massiva els darrers instants de la seva vida."
processed_sentence = "Es tractava d'una supernova, l'explosió d'una estrella massiva els darrers instants de la seva vida."

raw_result = sentence_prediction(raw_sentence)
print("Raw data")
print(json.dumps(raw_result, indent=4))
print()

print("Preprocessed data")
processed_result = sentence_prediction(processed_sentence)
print(json.dumps(processed_result, indent=4))


Raw data
{
    "conf": 0.641585918336079,
    "ndcg_score": 0.596054431773405,
    "hits": {
        "1": 0.3793103448275862,
        "3": 0.7586206896551724,
        "5": 0.7586206896551724
    }
}

Preprocessed data
{
    "conf": 0.7795122174116281,
    "ndcg_score": 0.7074149715659375,
    "hits": {
        "1": 0.5769230769230769,
        "3": 0.8076923076923077,
        "5": 0.8076923076923077
    }
}


In [None]:
import torch
from torch.utils.data import Dataset


class SentenceDataset(Dataset):
    def __init__(self, sentences):
        self.sentences = sentences

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        return self.sentences[idx]


In [None]:
sents = SentenceDataset([s[2] for s in sentences])

print(len(sents))
sents[0]


In [45]:
results = []


In [46]:
NUM_SAMPLES = 10000
# random.seed(42)
# sents = random.sample(sentences, len(results) + NUM_SAMPLES)

print(len(results))

for i in tqdm(range(NUM_SAMPLES)):
    index = len(results)
    results.append((index, sentence_prediction(sents[index])))
    if (i % 1000) == 0:
        with open("results_checkpoint.pkl", "wb") as f:
            print(i)
            pickle.dump(results, f)


0


  0%|          | 0/10 [00:00<?, ?it/s]

In [14]:
# results_path = "samples/3_results_10k.pkl"
# results_path = "samples/4_2_results_10k.pkl"
results_path = "samples/4_4_results_10-20k.pkl"
with open(results_path, "rb") as f:
    results = pickle.load(f)


In [15]:
print(len(results))

data = [r[1] for r in results]

averages = {
    "ndcg_score": np.array([r["ndcg_score"] for r in data]).mean(),
    "conf": np.array([r["conf"] for r in data]).mean(),
    "hit1": np.array([r["hits"][1] for r in data]).mean(),
    "hit3": np.array([r["hits"][3] for r in data]).mean(),
    "hit5": np.array([r["hits"][5] for r in data]).mean(),
}

# Printing the results
just_len = max([len(key) for key in averages.keys()]) + 1

for (k, v) in averages.items():
    print(f"{k}:".ljust(just_len, " "), v)
    if k == "conf":
        print()


10000
ndcg_score: 0.6663205763763352
conf:       0.6762789860632521

hit1:       0.5704864394340048
hit3:       0.7004400136505109
hit5:       0.7477276735828828
