In [None]:
# notes
# running the suite with a prediction function does not allow for access to the scores, so if you need the scores themselves, you have to predict to a file
# and run the suite from the file

In [12]:
from checklist.editor import Editor
from checklist.test_types import MFT
from checklist.test_suite import TestSuite
from checklist.pred_wrapper import PredictorWrapper
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import matplotlib.pyplot as plt

In [65]:
# initialise CheckList Editor objects
eng_editor = Editor() # default language = English
deu_editor = Editor(language="german")

# initialise CheckList TestSuite for running the tests
suite = TestSuite()

In [5]:
# linguistic resources

# parallel adjective lists for data creation
eng_pos = ["good", "nice", "great"]
deu_pos = ["gut", "schön", "super"]
bar_pos = ["guad", "schee", "subba"]

eng_neg = ["bad", "boring", "stupid"]
deu_neg = ["schlecht", "langweilig", "blöd"]
bar_neg = ["schlecht", "fad", "bled"]

# parallel noun lists for data creation
# picked from Editor suggestions in such a way that mostly all words are different
eng_noun = ["game", "site", "picture", "book", "story", "man", "world", "city", "time", "weather", "life"]

# standard German and Bavarian examples with determiners to avoid errors
deu_noun = [
    ("Das", "Spiel"), ("Die", "Seite"), ("Das", "Bild"), ("Das", "Buch"), ("Die", "Geschichte"), 
    ("Der", "Mann"), ("Die", "Welt"), ("Die", "Stadt"), ("Die", "Zeit"), ("Das", "Wetter"), ("Das", "Leben")
    ]

# bavarian determiners are with spaces to handle "d'" and "s'" determiners
bar_noun = [
    ("Des ", "Spui"), ("De ", "Seitn"), ("Des ", "Buidl"), ("Des ", "Buach"), ("De ", "Gschicht"), 
    ("Der ", "Mo"), ("D'", "Weid"), ("D'", "Stod"), ("D'", "Zeid"), ("S'", "Weda"), ("S'", "Lebm")
    ] 

# negative phrases
eng_neg_p = ["I don't think that", "I hate that", "I don't like that"]
deu_neg_p = ["Ich denke nicht, dass", "Ich hasse, dass", "Ich mag nicht, dass"]
bar_neg_p = ["I deng ned, dass", "I hass des, wenn", "I mog des ned, wenn"] # bavarian constructions work a bit differently than standard German

# positive phrases
eng_pos_p = ["I like that", "I love that", "I'm sure that"]
deu_pos_p = ["Ich mag, dass", "Ich liebe, dass", "Ich bin sicher, dass"]
bar_neg_p = ["I mog des, wenn", "I liebs, wenn", "I bin ma sicha, dass"]

In [83]:
# Sanity tests
data = eng_editor.template("The {noun} is {adj}.", noun=eng_noun, adj=eng_pos, labels=1) # positive samples
data += eng_editor.template("The {noun} is {adj}.", noun=eng_noun, adj=eng_neg, labels=0) # negative samples
test = MFT(**data)
suite.add(test, "Sanity Check\nLanguage: English", "Negation", "Simple positive and negative sentences.", overwrite=True)

In [68]:
# English tests

# test for negated positive examples, expectation: negative (0)
data = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=0) # not pos = negative
test = MFT(data.data, labels=0)
suite.add(test, "Positive Adjective Negations: Negative\nLanguage: English", "Negation", "Sentences with negated positive adjectives.", overwrite=True)

# test for negations of positive phrases, expectation: negative (0)
data = eng_editor.template("{p} the {noun} is {adj}.", p=eng_neg_p, noun=eng_noun, adj=eng_pos, labels=0)
test = MFT(data.data, labels=0)
suite.add(test, "Positive Phrase Negations: Negative\nLanguage: English", "Negation", "Negations of positive sentences.", overwrite=True)

# test for negated negative adjectives, expectation: positive (1) (for SentiMBERT 2)
data = eng_editor.template("The {noun} is not {adj}.", noun=eng_noun, adj=eng_pos, labels=1) # not neg = positive
test = MFT(data.data, labels=1)
suite.add(test, "Negative Adjective Negations: Positive\nLanguage: English", "Negation", "Sentences with negated negative adjectives.", overwrite=True)

# test for positive additional phrases to negative phrases, expectation: positive (1) (for SentiMBERT 2)
data = eng_editor.template("{p} the {noun} is {adj}.", p=eng_pos_p, noun=eng_noun, adj=eng_neg, labels=1)
test = MFT(data.data, labels=1)
suite.add(test, "Negative Phrases with Positive Additions: Positive\nLanguage: English", "Negation", "Positive additions to negative sentences.", overwrite=True)

In [71]:
# load vanilla mBERT model
model_name = "google-bert/bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# initialise pipeline for predictions
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device, top_k=None)

lbl2idx = {"LABEL_0": 0, "LABEL_1": 1}
idx2lbl = {0: "LABEL_0", 1: "LABEL_1"}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda


In [72]:
def predict(data):
    # read data and predict
    raw_preds = pipe(data)

    preds = []
    confs = []

    # write results in correct CheckList format to a file
    for result in raw_preds:
        negative = result[0]
        positive = result[1]

        max_pred = max([negative, positive], key=lambda x: x["score"])
        max_label = max_pred["label"]

        # prediction, negative_score, neutral_score, positive_score
        preds.append(lbl2idx[max_label])
        confs.append(np.array([negative["score"], positive["score"]]))

    return preds, confs

In [84]:
suite.run(predict, overwrite=True)

Running Positive Adjective Negations: Negative
Language: English
Predicting 33 examples


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Running Positive Phrase Negations: Negative
Language: English
Predicting 99 examples
Running Negative Adjective Negations: Positive
Language: English
Predicting 33 examples
Running Negative Phrases with Positive Additions: Positive
Language: English
Predicting 99 examples
Running Sanity Check
Language: English
Predicting 66 examples


In [74]:
def format_example(x, pred, conf, label, meta=None): 
    return f"Sentence: {x}\nGold: {pred}\t\tPredicted: {label}\np(negative) = {round(conf[0], 2)}\tp(positive) = {round(conf[1], 2)}"

In [89]:
suite.summary(format_example_fn = format_example)

Negation

Positive Adjective Negations: Negative
Language: English
Test cases:      33
Fails (rate):    11 (33.3%)

Example fails:
Sentence: The game is not great.
Gold: 1		Predicted: 0
p(negative) = 0.5	p(positive) = 0.5
----
Sentence: The man is not nice.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----
Sentence: The time is not great.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----


Positive Phrase Negations: Negative
Language: English
Test cases:      99
Fails (rate):    52 (52.5%)

Example fails:
Sentence: I don't think that the weather is great.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----
Sentence: I hate that the story is great.
Gold: 1		Predicted: 0
p(negative) = 0.5	p(positive) = 0.5
----
Sentence: I hate that the weather is great.
Gold: 1		Predicted: 0
p(negative) = 0.52	p(positive) = 0.48
----


Negative Adjective Negations: Positive
Language: English
Test cases:      33
Fails (rate):    22 (66.7%)

Example fails:
Senten

In [119]:
def predict_to_file(data):
    # read data and predict
    with open(data, "r") as f:
        data = [line.strip() for line in f.readlines()]

    raw_preds = pipe(data)

    # write results in correct CheckList format to a file
    with open("/tmp/predictions.txt", "w") as f:
        f.write("label negative positive\n")
        for result in raw_preds:
            negative = result[0]
            positive = result[1]

            max_pred = max([negative, positive], key=lambda x: x["score"])
            max_label = max_pred["label"]

            f.write(f"{lbl2idx[max_label]} {negative["score"]} {positive["score"]}\n")

In [None]:
suite.to_raw_file("/tmp/data.txt")

In [120]:
predict_to_file("/tmp/data.txt")

In [123]:
suite.run_from_file("/tmp/predictions.txt", file_format="pred_and_softmax", ignore_header=True, overwrite=True)

In [126]:
suite.summary(format_example_fn = format_example)

Negation

Positive Adjective Negations: Negative
Language: English
Test cases:      33
Fails (rate):    11 (33.3%)

Example fails:
Sentence: The life is not nice.
Gold: 1		Predicted: 0
p(negative) = 0.52	p(positive) = 0.48
----
Sentence: The world is not great.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----
Sentence: The world is not good.
Gold: 1		Predicted: 0
p(negative) = 0.5	p(positive) = 0.5
----


Positive Phrase Negations: Negative
Language: English
Test cases:      99
Fails (rate):    52 (52.5%)

Example fails:
Sentence: I don't think that the world is great.
Gold: 1		Predicted: 0
p(negative) = 0.52	p(positive) = 0.48
----
Sentence: I don't think that the life is nice.
Gold: 1		Predicted: 0
p(negative) = 0.53	p(positive) = 0.47
----
Sentence: I don't like that the time is good.
Gold: 1		Predicted: 0
p(negative) = 0.51	p(positive) = 0.49
----


Negative Adjective Negations: Positive
Language: English
Test cases:      33
Fails (rate):    22 (66.7%)

Example fail