In [None]:
# notes
# running the suite with a prediction function does not allow for access to the scores, so if you need the scores themselves, you have to predict to a file
# and run the suite from the file

In [1]:
from checklist.editor import Editor
from checklist.test_types import MFT
from checklist.test_suite import TestSuite
from checklist.pred_wrapper import PredictorWrapper
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# initialise CheckList Editor objects
eng_editor = Editor() # default language = English
deu_editor = Editor(language="german")

# initialise CheckList TestSuite for running the tests
suite = TestSuite()

In [3]:
# linguistic resources

# parallel adjective lists for data creation
eng_pos = ["good", "nice", "great"]
deu_pos = ["gut", "schön", "super"]
bar_pos = ["guad", "schee", "subba"]

eng_neg = ["bad", "boring", "stupid"]
deu_neg = ["schlecht", "langweilig", "blöd"]
bar_neg = ["schlecht", "fad", "bled"]

# parallel noun lists for data creation
# picked from Editor suggestions in such a way that mostly all words are different
eng_noun = ["game", "site", "picture", "book", "story", "man", "world", "city", "time", "weather", "life"]

# standard German and Bavarian examples with determiners to avoid errors
deu_noun = [
    ("Das", "Spiel"), ("Die", "Seite"), ("Das", "Bild"), ("Das", "Buch"), ("Die", "Geschichte"), 
    ("Der", "Mann"), ("Die", "Welt"), ("Die", "Stadt"), ("Die", "Zeit"), ("Das", "Wetter"), ("Das", "Leben")
    ]

# bavarian determiners are with spaces to handle "d'" and "s'" determiners
bar_noun = [
    ("Des ", "Spui"), ("De ", "Seitn"), ("Des ", "Buidl"), ("Des ", "Buach"), ("De ", "Gschicht"), 
    ("Der ", "Mo"), ("D'", "Weid"), ("D'", "Stod"), ("D'", "Zeid"), ("S'", "Weda"), ("S'", "Lebm")
    ] 

# negative phrases
eng_neg_p = ["I don't think that", "I hate that", "I don't like that"]
deu_neg_p = ["Ich denke nicht, dass", "Ich hasse, dass", "Ich mag nicht, dass"]
bar_neg_p = ["I deng ned, dass", "I hass des, wenn", "I mog des ned, wenn"] # bavarian constructions work a bit differently than standard German

# positive phrases
eng_pos_p = ["I like that", "I love that", "I'm sure that"]
deu_pos_p = ["Ich mag, dass", "Ich liebe, dass", "Ich bin sicher, dass"]
bar_neg_p = ["I mog des, wenn", "I liebs, wenn", "I bin ma sicha, dass"]

In [5]:
# Sanity tests - classify sentences
data = eng_editor.template("The {noun} is {adj}.", noun=eng_noun, adj=eng_pos, labels=1) # positive samples
data += eng_editor.template("The {noun} is {adj}.", noun=eng_noun, adj=eng_neg, labels=0) # negative samples
test = MFT(**data)
suite.add(test, "Sanity Checks\nLanguage: English", "Negation", "Simple positive and negative sentences.", overwrite=True)

In [7]:
# English tests

# test for negated positive examples, expectation: negative (0)
# e.g. The game is not good.
data01 = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=0) # not pos = negative
test = MFT(**data01)
suite.add(test, "Positive Adjective Negations: Negative\nLanguage: English", "Negation", "Sentences with negated positive adjectives.", overwrite=True)

# test for negations of positive phrases, expectation: negative (0)
# e.g. I hate that the game is good.
data02 = eng_editor.template("{p} the {noun} is {adj}.", p=eng_neg_p, noun=eng_noun, adj=eng_pos, labels=0)
test = MFT(**data02)
suite.add(test, "Positive Phrase Negations: Negative\nLanguage: English", "Negation", "Negations of positive sentences.", overwrite=True)

# test for negated negative adjectives, expectation: positive (1) (or 2)
# e.g. The game is not bad.
data03 = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=1) # not neg = positive
test = MFT(**data03)
suite.add(test, "Negative Adjective Negations: Positive\nLanguage: English", "Negation", "Sentences with negated negative adjectives.", overwrite=True)

# test for positive additional phrases to negative phrases, expectation: positive (1) (or 2)
# e.g. I like that the game is bad.
data04 = eng_editor.template("{p} the {noun} is {adj}.", p=eng_pos_p, noun=eng_noun, adj=eng_neg, labels=1)
test = MFT(**data04)
suite.add(test, "Negative Phrases with Positive Additions: Positive\nLanguage: English", "Negation", "Positive additions to negative sentences.", overwrite=True)

In [8]:
# load vanilla mBERT model
model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# initialise pipeline for predictions
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, top_k=None)

lbl2idx = {"LABEL_0": 0, "LABEL_1": 1}
idx2lbl = {0: "LABEL_0", 1: "LABEL_1"}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda


In [7]:
def predict(data):
    # read data and predict
    raw_preds = pipe(data)

    preds = []
    confs = []

    # write results in correct CheckList format to a file
    for result in raw_preds:
        negative = result[0]
        positive = result[1]

        max_pred = max([negative, positive], key=lambda x: x["score"])
        max_label = max_pred["label"]

        # prediction, negative_score, neutral_score, positive_score
        preds.append(lbl2idx[max_label])
        confs.append(np.array([negative["score"], positive["score"]]))

    return preds, confs

In [89]:
suite.run(predict, overwrite=True)

def format_example(x, pred, conf, label, meta=None): 
    return f"Sentence: {x}\nGold: {pred}\t\tPredicted: {label}\np(negative) = {round(conf[0], 2)}\tp(positive) = {round(conf[1], 2)}"

suite.summary(format_example_fn = format_example)

Negation

Positive Adjective Negations: Negative
Language: English
Test cases:      33
Fails (rate):    11 (33.3%)

Example fails:
Sentence: The game is not great.
Gold: 1		Predicted: 0
p(negative) = 0.5	p(positive) = 0.5
----
Sentence: The man is not nice.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----
Sentence: The time is not great.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----


Positive Phrase Negations: Negative
Language: English
Test cases:      99
Fails (rate):    52 (52.5%)

Example fails:
Sentence: I don't think that the weather is great.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----
Sentence: I hate that the story is great.
Gold: 1		Predicted: 0
p(negative) = 0.5	p(positive) = 0.5
----
Sentence: I hate that the weather is great.
Gold: 1		Predicted: 0
p(negative) = 0.52	p(positive) = 0.48
----


Negative Adjective Negations: Positive
Language: English
Test cases:      33
Fails (rate):    22 (66.7%)

Example fails:
Senten

In [12]:
def predict_to_file(data, preds):
    # read data and predict
    with open(data, "r") as f:
        data = [line.strip() for line in f.readlines()]

    raw_preds = pipe(data)

    # write results in correct CheckList format to a file
    with open(preds, "w") as f:
        f.write("label negative positive\n")
        for result in raw_preds:
            negative = result[0]
            positive = result[1]

            max_pred = max([negative, positive], key=lambda x: x["score"])
            max_label = max_pred["label"]

            f.write(f"{lbl2idx[max_label]} {negative["score"]} {positive["score"]}\n")

In [None]:
data_file = "/tmp/data.txt"
predictions_file = "/tmp/predictions.txt"

suite.to_raw_file(data_file)
predict_to_file(data_file, predictions_file)

suite.run_from_file(predictions_file, file_format="pred_and_softmax", ignore_header=True, overwrite=True)

In [126]:
suite.summary(format_example_fn = format_example)

Negation

Positive Adjective Negations: Negative
Language: English
Test cases:      33
Fails (rate):    11 (33.3%)

Example fails:
Sentence: The life is not nice.
Gold: 1		Predicted: 0
p(negative) = 0.52	p(positive) = 0.48
----
Sentence: The world is not great.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----
Sentence: The world is not good.
Gold: 1		Predicted: 0
p(negative) = 0.5	p(positive) = 0.5
----


Positive Phrase Negations: Negative
Language: English
Test cases:      99
Fails (rate):    52 (52.5%)

Example fails:
Sentence: I don't think that the world is great.
Gold: 1		Predicted: 0
p(negative) = 0.52	p(positive) = 0.48
----
Sentence: I don't think that the life is nice.
Gold: 1		Predicted: 0
p(negative) = 0.53	p(positive) = 0.47
----
Sentence: I don't like that the time is good.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----


Negative Adjective Negations: Positive
Language: English
Test cases:      33
Fails (rate):    22 (66.7%)

Example fail

In [3]:
#!pip install "git+https://github.com/huggingface/transformers.git@6e0515e99c39444caae39472ee1b2fd76ece32f1" --upgrade

## modernbert

In [9]:
# load multilingual sentiment analysis model
model_name = "clapAI/modernBERT-base-multilingual-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# initialise pipeline for predictions
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, top_k=None)

lbl2idx = model.config.id2label
idx2lbl = model.config.label2id
print(lbl2idx)
print(idx2lbl)

Device set to use cuda


{0: 'negative', 1: 'neutral', 2: 'positive'}
{'negative': 0, 'neutral': 1, 'positive': 2}


In [10]:
suite = TestSuite()

In [11]:
# Sanity tests
data = eng_editor.template("The {noun} is {adj}.", noun=eng_noun, adj=eng_pos, labels=2) # positive samples
data += eng_editor.template("The {noun} is {adj}.", noun=eng_noun, adj=eng_neg, labels=0) # negative samples
test = MFT(**data)
suite.add(test, "Sanity Check\nLanguage: English", "Negation", "Simple positive and negative sentences.", overwrite=True)

In [12]:
# English tests

# test for negated positive examples, expectation: negative (0)
# e.g. The game is not good.
data01 = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=0) # not pos = negative
test = MFT(**data01)
suite.add(test, "Positive Adjective Negations: Negative\nLanguage: English", "Negation", "Sentences with negated positive adjectives.", overwrite=True)

# test for negations of positive phrases, expectation: negative (0)
# e.g. I hate that the game is good.
data02 = eng_editor.template("{p} the {noun} is {adj}.", p=eng_neg_p, noun=eng_noun, adj=eng_pos, labels=0)
test = MFT(**data02)
suite.add(test, "Positive Phrase Negations: Negative\nLanguage: English", "Negation", "Negations of positive sentences.", overwrite=True)

# test for negated negative adjectives, expectation: positive (1) (or 2)
# e.g. The game is not bad.
data03 = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=2) # not neg = positive
test = MFT(**data03)
suite.add(test, "Negative Adjective Negations: Positive\nLanguage: English", "Negation", "Sentences with negated negative adjectives.", overwrite=True)

# test for positive additional phrases to negative phrases, expectation: positive (1) (or 2)
# e.g. I like that the game is bad.
data04 = eng_editor.template("{p} the {noun} is {adj}.", p=eng_pos_p, noun=eng_noun, adj=eng_neg, labels=2)
test = MFT(**data04)
suite.add(test, "Negative Phrases with Positive Additions: Positive\nLanguage: English", "Negation", "Positive additions to negative sentences.", overwrite=True)

In [21]:
def predict_to_file(data, preds):
    # read data and predict
    with open(data, "r") as f:
        data = [line.strip() for line in f.readlines()]
    
    raw_preds = pipe(data)

    # write results in correct CheckList format to a file
    with open(preds, "w") as f:
        f.write("prediction positive neutral negative\n")
        for result in raw_preds:
            positive = result[0]
            neutral = result[1]
            negative = result[2]

            max_pred = max([negative, positive, neutral], key=lambda x: x["score"])
            max_label = max_pred["label"]

            f.write(f"{idx2lbl[max_label]} {positive["score"]} {neutral["score"]} {negative["score"]} \n")

In [22]:
data_file = "/tmp/data.txt"
predictions_file = "/tmp/predictions.txt"

suite.to_raw_file(data_file)
predict_to_file(data_file, predictions_file)

suite.run_from_file(predictions_file, file_format="pred_and_softmax", ignore_header=True, overwrite=True)

[{'label': 'positive', 'score': 0.7783976197242737}, {'label': 'negative', 'score': 0.11116747558116913}, {'label': 'neutral', 'score': 0.11043490469455719}]


In [23]:
!cat /tmp/predictions.txt | head -10

prediction positive neutral negative
2 0.7783976197242737 0.11116747558116913 0.11043490469455719 
2 0.539217472076416 0.3811604678630829 0.07962208241224289 
2 0.9282175898551941 0.0496378131210804 0.022144557908177376 
2 0.8421204686164856 0.10394899547100067 0.053930483758449554 
2 0.4523354470729828 0.40778809785842896 0.13987642526626587 
2 0.9213922023773193 0.05033658817410469 0.028271200135350227 
2 0.5943126082420349 0.29849109053611755 0.10719628632068634 
2 0.6096277832984924 0.2928953170776367 0.09747689217329025 
2 0.8210318684577942 0.09937994182109833 0.07958824932575226 


In [37]:
def format_example(x, pred, conf, label, meta=None): 
    return f"Sentence: {x}\nGold: {label}\t\tPredicted: {pred}\np(positive) = {round(conf[0], 2)}\tp(neutral) = {round(conf[1], 2)}\tp(negative) = {round(conf[2], 2)}"

suite.summary(format_example_fn = format_example)

Negation

Sanity Check
Language: English
Test cases:      66
Fails (rate):    3 (4.5%)

Example fails:
Sentence: The weather is bad.
Gold: 0		Predicted: 1
p(positive) = 0.57	p(neutral) = 0.42	p(negative) = 0.01
----
Sentence: The weather is boring.
Gold: 0		Predicted: 1
p(positive) = 0.51	p(neutral) = 0.48	p(negative) = 0.01
----
Sentence: The weather is good.
Gold: 2		Predicted: 1
p(positive) = 0.49	p(neutral) = 0.47	p(negative) = 0.04
----


Positive Adjective Negations: Negative
Language: English
Test cases:      33
Fails (rate):    3 (9.1%)

Example fails:
Sentence: The weather is not good.
Gold: 0		Predicted: 1
p(positive) = 0.63	p(neutral) = 0.36	p(negative) = 0.01
----
Sentence: The weather is not nice.
Gold: 0		Predicted: 1
p(positive) = 0.66	p(neutral) = 0.32	p(negative) = 0.02
----
Sentence: The weather is not great.
Gold: 0		Predicted: 1
p(positive) = 0.74	p(neutral) = 0.24	p(negative) = 0.02
----


Positive Phrase Negations: Negative
Language: English
Test cases:      99
Fa