In [1]:
# !pip install transformers

from transformers import pipeline
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load zero-shot classifier (multilingual)
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/734 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at joeddav/xlm-roberta-large-xnli were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Device set to use cuda:0


In [8]:
df = pd.read_csv("/content/HIPE-2022-v2.1-letemps-test-fr.tsv",
                 sep="\t",
                 comment="#",
                 quoting=3,
                 names=["TOKEN", "NE-COARSE-LIT", "NE-COARSE-METO", "NE-FINE-LIT", "NE-FINE-METO",
                        "NE-FINE-COMP", "NE-NESTED", "NEL-LIT", "NEL-METO", "MISC"])
df = df[df["TOKEN"] != "TOKEN"]

df["sentence_id"] = (df["MISC"].fillna("_").str.contains("EndOfSentence")).cumsum()

sentences_df = df.groupby("sentence_id").agg({
    "TOKEN": lambda x: " ".join(x.astype(str)),
    "NE-COARSE-LIT": list
}).reset_index()

def has_entity(labels, prefix):
    return any(l.startswith(f"B-{prefix}") or l.startswith(f"I-{prefix}") for l in labels)

sentences_df["person"] = sentences_df["NE-COARSE-LIT"].apply(lambda x: has_entity(x, "pers"))
sentences_df["location"] = sentences_df["NE-COARSE-LIT"].apply(lambda x: has_entity(x, "loc"))
sentences_df["organization"] = sentences_df["NE-COARSE-LIT"].apply(lambda x: has_entity(x, "org"))

# Limit to subset for testing speed
# sentences_df = sentences_df.iloc[:200]

In [9]:
# Run zero-shot classification
ner_labels = ["person", "location", "organization"]
zs_results = []

for text in tqdm(sentences_df["TOKEN"]):
    output = classifier(text, candidate_labels=ner_labels, multi_label=True)
    zs_results.append(dict(zip(output["labels"], output["scores"])))


100%|██████████| 2382/2382 [02:12<00:00, 17.94it/s]


In [10]:
zs_df = pd.DataFrame(zs_results)
zs_binary = (zs_df >= 0.5).astype(bool)  # threshold at 0.5

evaluation_df = pd.concat([sentences_df[["person", "location", "organization"]].reset_index(drop=True), zs_binary], axis=1, keys=["gold", "pred"])
gold = evaluation_df["gold"]
pred = evaluation_df["pred"]

# Evaluation
for label in ner_labels:
    print(f"\n Evaluation for {label.upper()}")
    print(classification_report(gold[label], pred[label], digits=3))
    print("Accuracy:", accuracy_score(gold[label], pred[label]))



 Evaluation for PERSON
              precision    recall  f1-score   support

       False      0.920     0.320     0.474      2131
        True      0.117     0.765     0.203       251

    accuracy                          0.366      2382
   macro avg      0.519     0.542     0.339      2382
weighted avg      0.836     0.366     0.446      2382

Accuracy: 0.36649874055415615

 Evaluation for LOCATION
              precision    recall  f1-score   support

       False      0.881     0.366     0.517      2008
        True      0.178     0.735     0.286       374

    accuracy                          0.424      2382
   macro avg      0.529     0.550     0.401      2382
weighted avg      0.771     0.424     0.480      2382

Accuracy: 0.4235936188077246

 Evaluation for ORGANIZATION
              precision    recall  f1-score   support

       False      0.980     0.661     0.789      2327
        True      0.028     0.418     0.053        55

    accuracy                          0.655