In [None]:
# Paper: Beyond Accuracy: Behavioral Testing of NLP Models with CheckList ACL 2020

# One sentence summary:
# Method to test the behaviour of NLP models through different experiments on small
# manufactured example test sets.

# Criticism:
# The authors focus mainly on English NLP models and show that the CheckList analyses
# uncover errors in commercial and openly available models. However, there is a wide usage
# of multilingual models in the NLP community for multilingual or low-resource experiments
# which makes an error analysis of multilingual models relevant and interesting.

# Planned Implementation:
# The plan is to take a multilingual model (e.g., mBERT) and create a Minimum Functionality
# Test (MFT) for testing how the model handles negations in parallel Sentiment Analysis
# sentences in English (like "It was bad.", "It was good.", "It was not good."), German ("Es war
# schlecht.", "Es war gut.", "Es war nicht gut.") and Bavarian ("Des war schlecht.", "Des war
# guad.", "Des war ned guad."). The experiments should then show whether the CheckList
# methods can also be used in a multilingual setup and whether mBERT's behaviour changes
# for different languages.

# Feedback:
# The proposal extends CheckList to other languages and implements a Minimum Functionality Test comparing English, German and Bavarian.
# Looks good. Approved.

In [None]:
# research questions:
# can CheckList be applied to multilingual models? (only tested on English models)
# can CheckList be used to compare model capabilities across languages? (here: has the model different behaviour when handling negations in different languages)

In [None]:
# sources

# code for data creation partly taken from 
# https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/1.%20Generating%20data.ipynb
# with major changes

# code for test creation and execution partly taken from
# https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/3.%20Test%20types%2C%20expectation%20functions%2C%20running%20tests.ipynb and
# https://github.com/marcotcr/checklist/blob/master/notebooks/tutorials/4.%20The%20CheckList%20process.ipynb
# with minor changes

# code for model training taken partly from 
# https://huggingface.co/blog/sentiment-analysis-python
# with minor changes

In [None]:
# notes
# hard to find adjective pairs where standard German and Bavarian are different -> focus on only three pairs per sentiment

# test sentences only work with predicative adjektive constructions to avoid issues with inflection of the pre-chosen adjectives in standard German and Bavarian

In [19]:
import checklist
from checklist.editor import Editor
from checklist.test_types import MFT
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np

In [3]:
# initialise Editor objects
eng_editor = Editor() # default language is English
deu_editor = Editor(language="german")

In [7]:
# get adj suggestions from the CheckList Editors in German and English
print("English adj suggestions: ")
print(eng_editor.suggest("This is a {mask} {noun}.", noun=["book", "movie"])[:20], "\n") 

print("German adj suggestions: ")
print(deu_editor.suggest("Das ist ein {mask} Buch.")[:20]) 
print(deu_editor.suggest("Das ist ein {mask} Film.")[:20])

English adj suggestions: 
['great', 'good', 'wonderful', 'beautiful', 'fantastic', 'terrible', 'fascinating', 'big', 'nice', 'bad', 'long', 'fun', 'scary', 'terrific', 'tough', 'short', 'serious', 'huge', 'remarkable', 'new'] 

German adj suggestions: 
['gutes', 'neues', 'großes', 'kleines', 'starkes', 'politisches', 'anderes', 'wichtiges', 'ganzes', 'besonderes', 'altes', 'deutsches', 'eigenes', 'super', 'weiteres', 'solches', 'zweites', 'einziges', 'letztes', 'erstes']
['guter', 'schlechter', 'großer', 'schöner', 'super', 'wichtiger', 'deutscher', 'schwieriger', 'historischer', 'neuer', 'kleiner', 'politischer', 'typischer', 'anderer', 'besonderer', 'amerikanischer', 'klassischer', 'einfacher', 'internationaler', 'langer']


In [4]:
# parallel adjective lists for data creation
eng_pos = ["good", "nice", "great"]
deu_pos = ["gut", "schön", "super"]
bar_pos = ["guad", "schee", "subba"]

eng_neg = ["bad", "boring", "stupid"]
deu_neg = ["schlecht", "langweilig", "blöd"]
bar_neg = ["schlecht", "fad", "bled"]

In [9]:
# get noun suggestions from the CheckList Editors in German and English
# english suggestions
print("English noun suggestions: ")
print(eng_editor.suggest("This {mask} is {pos}.", pos=eng_pos)[:10]) 
print(eng_editor.suggest("This {mask} is {neg}.", neg=eng_neg)[:10], "\n") 

# suggestions for standard German with different determiners
print("German noun suggestions: ")
print("Der", deu_editor.suggest("Der {mask} ist {pos}.", pos=deu_pos)[:10])
print("Der", deu_editor.suggest("Der {mask} ist {neg}.", neg=deu_neg)[:10])
print("Die", deu_editor.suggest("Die {mask} ist {pos}.", pos=deu_pos)[:10])
print("Die", deu_editor.suggest("Die {mask} ist {neg}.", neg=deu_neg)[:10])
print("Das", deu_editor.suggest("Das {mask} ist {pos}.", pos=deu_pos)[:10])
print("Das", deu_editor.suggest("Das {mask} ist {neg}.", neg=deu_neg)[:10])

English noun suggestions: 
['game', 'one', 'stuff', 'book', 'article', 'video', 'post', 'movie', 'code', 'thing']
['post', 'article', 'game', 'stuff', 'video', 'movie', 'list', 'shit', 'thing', 'book'] 

German noun suggestions: 
Der ['Preis', 'Job', 'Platz', 'Service', 'Abend', 'Wind', 'Rest', 'Sieg', 'Auftritt', 'Eindruck']
Der ['Rest', 'Job', 'Film', 'Ton', 'Fußball', 'Mann', 'Preis', 'Markt', 'Anfang', 'Fall']
Die ['Stimmung', 'Atmosphäre', 'Situation', 'Sache', 'Resonanz', 'Lage', 'Idee', 'Antwort', 'Qualität', 'Zusammenarbeit']
Die ['Stimmung', 'Sache', 'Situation', 'Lage', 'Welt', 'Idee', 'Musik', 'Antwort', 'Geschichte', 'Zeit']
Das ['Ergebnis', 'Wetter', 'alles', 'Leben', 'Angebot', 'hier', 'Spiel', 'Resultat', 'Stadion', 'Essen']
Das ['Ergebnis', 'alles', 'Wetter', 'Leben', 'aber', 'Spiel', 'Ganze', 'hier', 'Auto', 'Geschäft']


In [5]:
# parallel noun lists for data creation
# picked from Editor suggestions in such a way that mostly all words are different
eng_noun = ["game", "site", "picture", "book", "story", "man", "world", "city", "time", "weather", "life"]

# standard German and Bavarian examples with determiners to avoid errors
deu_noun = [
    ("Das", "Spiel"), ("Die", "Seite"), ("Das", "Bild"), ("Das", "Buch"), ("Die", "Geschichte"), 
    ("Der", "Mann"), ("Die", "Welt"), ("Die", "Stadt"), ("Die", "Zeit"), ("Das", "Wetter"), ("Das", "Leben")
    ]

# bavarian determiners are with spaces to handle "d'" and "s'" determiners
bar_noun = [
    ("Des ", "Spui"), ("De ", "Seitn"), ("Des ", "Buidl"), ("Des ", "Buach"), ("De ", "Gschicht"), 
    ("Der ", "Mo"), ("D'", "Weid"), ("D'", "Stod"), ("D'", "Zeid"), ("S'", "Weda"), ("S'", "Lebm")
    ] 

In [6]:
# create english data
eng = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=0) # not pos = negative
eng += eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_neg, labels=1) # not neg = positive

# create german data
deu = deu_editor.template("{detnoun[0]} {detnoun[1]} ist nicht {adj}.", detnoun=deu_noun, adj=deu_pos, labels=0)
deu += deu_editor.template("{detnoun[0]} {detnoun[1]} ist nicht {adj}.", detnoun=deu_noun, adj=deu_neg, labels=1)

# create bavarian data
bar = deu_editor.template("{detnoun[0]}{detnoun[1]} is ned {adj}.", detnoun=bar_noun, adj=bar_pos, labels=0)
bar += deu_editor.template("{detnoun[0]}{detnoun[1]} is ned {adj}.", detnoun=bar_noun, adj=bar_neg, labels=1)

In [17]:
# initialise minimum functionality tests
eng_test = MFT(**eng, name="English Negations", capability="Negation", description="Litotes sentences to test negation capabilities.")
deu_test = MFT(**deu, name="Standard German Negations", capability="Negation", description="Litotes sentences to test negation capabilities.")
bar_test = MFT(**bar, name="Bavarian Negations", capability="Negation", description="Litotes sentences to test negation capabilities.")

['Des Spui is ned guad.', 'Des Spui is ned schee.', 'Des Spui is ned subba.', 'De Seitn is ned guad.', 'De Seitn is ned schee.', 'De Seitn is ned subba.', 'Des Buidl is ned guad.', 'Des Buidl is ned schee.', 'Des Buidl is ned subba.', 'Des Buach is ned guad.', 'Des Buach is ned schee.', 'Des Buach is ned subba.', 'De Gschicht is ned guad.', 'De Gschicht is ned schee.', 'De Gschicht is ned subba.', 'Der Mo is ned guad.', 'Der Mo is ned schee.', 'Der Mo is ned subba.', "D'Weid is ned guad.", "D'Weid is ned schee.", "D'Weid is ned subba.", "D'Stod is ned guad.", "D'Stod is ned schee.", "D'Stod is ned subba.", "D'Zeid is ned guad.", "D'Zeid is ned schee.", "D'Zeid is ned subba.", "S'Weda is ned guad.", "S'Weda is ned schee.", "S'Weda is ned subba.", "S'Lebm is ned guad.", "S'Lebm is ned schee.", "S'Lebm is ned subba.", 'Des Spui is ned schlecht.', 'Des Spui is ned fad.', 'Des Spui is ned bled.', 'De Seitn is ned schlecht.', 'De Seitn is ned fad.', 'De Seitn is ned bled.', 'Des Buidl is ned

In [8]:
# load finetuned mBERT for sentiment analysis
model_name = "./models/mBERT_sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# initialise pipeline for predictions
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device, return_all_scores=True)

Device set to use cuda


In [20]:
# make predictions on test data
lbl2idx = {"negative": 0, "neutral": 1, "positive": 2}

def predict(test):
    # read data and predict
    data = test.to_raw_examples() # necessary for internal CheckList structures (self.result_indexes)
    raw_preds = pipe(data)

    preds = []
    confs = []

    # write results in correct CheckList format to a file
    for result in raw_preds:
        negative = result[0]
        neutral = result[1]
        positive = result[2]

        max_pred = max([negative, neutral, positive], key=lambda x: x["score"])
        max_label = max_pred["label"]

        # prediction, negative_score, neutral_score, positive_score
        preds.append(lbl2idx[max_label])
        confs.append(np.array([negative["score"], neutral["score"], positive["score"]]))

    return preds, confs

eng_preds, eng_confs = predict(eng_test)
deu_preds, deu_confs = predict(deu_test)
bar_preds, bar_confs = predict(bar_test)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [21]:
eng_test.run_from_preds_confs(eng_preds, eng_confs, overwrite=True)

eng_test.summary()

Test cases:      66
Fails (rate):    54 (81.8%)

Example fails:
0.9 0.1 0.0 The game is not stupid.
----
0.9 0.1 0.0 The book is not stupid.
----
0.6 0.3 0.1 The world is not boring.
----


In [22]:
deu_test.run_from_preds_confs(deu_preds, deu_confs, overwrite=True)

deu_test.summary()

Test cases:      66
Fails (rate):    34 (51.5%)

Example fails:
0.8 0.2 0.1 Das Leben ist nicht schlecht.
----
0.6 0.3 0.1 Das Leben ist nicht langweilig.
----
0.5 0.4 0.1 Das Bild ist nicht langweilig.
----


In [24]:
bar_test.run_from_preds_confs(bar_preds, bar_confs, overwrite=True)

bar_test.summary()

Test cases:      66
Fails (rate):    34 (51.5%)

Example fails:
0.6 0.2 0.1 D'Weid is ned bled.
----
0.5 0.3 0.2 Des Buach is ned fad.
----
0.8 0.1 0.1 S'Lebm is ned schlecht.
----


In [18]:
# from checklist.test_types import MFT
# import numpy as np

# eng_test = MFT(**eng, name="English Negations", capability="Negation", description="Litotes sentences to test negation capabilities.")

# eng_data = eng_test.to_raw_examples()

# # with open(raw_file_path, "r") as f:
# #     eng_data = f.read().splitlines()

# raw_preds = pipe(eng_data)
# preds = []
# confs = []

# for result in raw_preds:
#     negative = result[0]
#     neutral = result[1]
#     positive = result[2]

#     max_pred = max([negative, neutral, positive], key=lambda x: x["score"])
#     max_label = max_pred["label"]

#     # prediction, negative_score, neutral_score, positive_score
#     preds.append(lbl2idx[max_label])
#     confs.append(np.array([negative["score"], neutral["score"], positive["score"]]))


# from checklist.abstract_test import read_pred_file
# from checklist.test_types import MFT

# # Assuming eng_test is already created as an MFT instance

# # Read predictions and confidences from the file
# # preds, confs = read_pred_file("/tmp/preds_eng.txt", file_format="softmax")

# # Verify the parsed predictions and confidences
# print("Predictions:", preds[:5])
# print("Confidences:", confs[:5])

# print("len preds: ", len(preds))
# print("len confs: ", len(confs))

# if preds is None or confs is None:
#     print("Error: The predictions or confidences are None. Please check the file format.")

# # Run the test using the loaded predictions and confidences
# eng_test.run_from_preds_confs(preds, confs, overwrite=True)

# # Print the summary of the results
# eng_test.summary()


Predictions: [0, 2, 2, 0, 2]
Confidences: [array([0.44630149, 0.25521377, 0.29848468]), array([0.17414553, 0.14964208, 0.67621237]), array([0.28616518, 0.242695  , 0.47113985]), array([0.46459395, 0.29296777, 0.24243827]), array([0.23632866, 0.1914082 , 0.57226312])]
len preds:  66
len confs:  66
Test cases:      66
Fails (rate):    54 (81.8%)

Example fails:
0.9 0.1 0.0 The game is not stupid.
----
0.2 0.2 0.6 The picture is not nice.
----
0.2 0.2 0.6 The city is not nice.
----
