In [None]:
!pip install checklist textblob 

In [None]:
# !jupyter nbextension install --py --sys-prefix checklist.viewer
# !jupyter nbextension enable --py --sys-prefix checklist.viewer

In [None]:
!python -m spacy download en_core_web_sm

In [3]:
from textblob import TextBlob
import checklist
from checklist.editor import Editor
from checklist.perturb import Perturb
from checklist.test_types import MFT, INV, DIR
from checklist.pred_wrapper import PredictorWrapper

<IPython.core.display.Javascript object>

In [4]:
editor = Editor()

<IPython.core.display.Javascript object>

# Minimum Functionality Test (MFT)

In [5]:
TextBlob("Going out to eat is not a bad option").sentiment[0]

0.3499999999999999

<IPython.core.display.Javascript object>

In [6]:
pos = [
    "good",
    "realistic",
    "healthy",
    "attractive",
    "appealing",
    "acceptable",
    "best",
    "feasible",
    "easy",
    "ideal",
    "affordable",
    "economical",
    "recommended",
    "exciting",
    "inexpensive",
    "obvious",
    "great",
    "appropriate",
    "effective",
    "excellent",
]

neg = [
    "bad",
    "unhealthy",
    "expensive",
    "boring",
    "terrible",
    "worst",
    "unfeasible",
    "unappropriate",
    "awful",
    "time-consuming",
]

<IPython.core.display.Javascript object>

In [7]:
editor.template(
    "Staying at home is not {a:pos} option.", pos=["good", "great"], nsamples=2
).data

['Staying at home is not a great option.',
 'Staying at home is not a great option.']

<IPython.core.display.Javascript object>

In [8]:
editor.template("{mask} is not {a:pos} option.", pos=["good", "great"], nsamples=2).data

  return torch._C._cuda_getDeviceCount() > 0
  to_pred = torch.tensor(to_pred, device=self.device).to(torch.int64)


['sleep is not a good option.', 'Rest is not a great option.']

<IPython.core.display.Javascript object>

In [45]:
samples = editor.template(
    "{mask} is not {a:pos} option.", pos=pos, labels=0, save=True, nsamples=100
)
samples += editor.template(
    "{mask} is not {a:neg} option.", neg=neg, labels=1, save=True, nsamples=100
)

<IPython.core.display.Javascript object>

In [48]:
test = MFT(
    samples.data, labels=samples.labels, name="Test negation", capability="Negation"
)

<IPython.core.display.Javascript object>

In [51]:
TextBlob("good").sentiment[0]

0.7

<IPython.core.display.Javascript object>

In [56]:
(-0.3 + 1) / 2

0.35

<IPython.core.display.Javascript object>

In [11]:
import numpy as np


def predict_proba(inputs):
    p1 = np.array([(TextBlob(x).sentiment[0] + 1) / 2.0 for x in inputs]).reshape(-1, 1)
    p0 = 1 - p1
    return np.hstack((p0, p1))


wrapped_pp = PredictorWrapper.wrap_softmax(predict_proba)

<IPython.core.display.Javascript object>

In [12]:
predict_proba(["good"])

array([[0.15, 0.85]])

<IPython.core.display.Javascript object>

In [59]:
wrapped_pp(["Training is not an unfeasible option."])

(array([0]), array([[0.5, 0.5]]))

<IPython.core.display.Javascript object>

In [49]:
test.run(wrapped_pp)

Predicting 200 examples


<IPython.core.display.Javascript object>

In [50]:
test.summary()

Test cases:      200
Fails (rate):    115 (57.5%)

Example fails:
0.0 Shipping is not an awful option.
----
0.5 Pinterest is not a time-consuming option.
----
0.5 Training is not an unfeasible option.
----


<IPython.core.display.Javascript object>

In [16]:
test.visual_summary()

TestSummarizer(stats={'npassed': 85, 'nfailed': 115, 'nfiltered': 0}, summarizer={'name': 'Test negation', 'de…

<IPython.core.display.Javascript object>

# Invariance Tests

In [17]:
import spacy

nlp = spacy.load("en_core_web_sm")

<IPython.core.display.Javascript object>

In [18]:
data = [
    "The cake is great.",
    "awful",
    "Michael had fun traveling to Mexico",
    "Anna hates party.",
    "This laptop is not very good",
]

<IPython.core.display.Javascript object>

In [19]:
pdata = list(nlp.pipe(data))

<IPython.core.display.Javascript object>

In [20]:
pdata

[The cake is great.,
 awful,
 Michael had fun traveling to Mexico,
 Anna hates party.,
 This laptop is not very good]

<IPython.core.display.Javascript object>

In [21]:
def test_invariant(data: list, method: callable, wrapped_predict: callable):
    t = Perturb.perturb(data, method)
    print("First sample before and after pertubation:")
    print("\n".join(t.data[0]))
    print("\nSummary:")
    test = INV(**t)
    test.run(wrapped_predict)
    test.summary()

<IPython.core.display.Javascript object>

## Add Punctuation

In [22]:
test_invariant(pdata, Perturb.punctuation, wrapped_pp)

First sample before and after pertubation:
The cake is great.
The cake is great

Summary:
Predicting 10 examples
Test cases:      5
Fails (rate):    0 (0.0%)


<IPython.core.display.Javascript object>

## Add Typos

In [23]:
test_invariant(data, Perturb.add_typos, wrapped_pp)

First sample before and after pertubation:
The cake is great.
The cake is rgeat.

Summary:
Predicting 10 examples
Test cases:      5
Fails (rate):    2 (40.0%)

Example fails:
0.4 This laptop is not very good
1.0 This laptop i snot very good

----
0.9 The cake is great.
0.5 The cake is rgeat.

----


<IPython.core.display.Javascript object>

## Change Names

In [24]:
test_invariant(pdata, Perturb.change_names, wrapped_pp)

First sample before and after pertubation:
Michael had fun traveling to Mexico
George had fun traveling to Mexico
Kenneth had fun traveling to Mexico
Kevin had fun traveling to Mexico
Dustin had fun traveling to Mexico
Zachary had fun traveling to Mexico
Luis had fun traveling to Mexico
Patrick had fun traveling to Mexico
Antonio had fun traveling to Mexico
Ryan had fun traveling to Mexico
James had fun traveling to Mexico

Summary:
Predicting 22 examples
Test cases:      2
Fails (rate):    0 (0.0%)


<IPython.core.display.Javascript object>

## Change Location

In [25]:
test_invariant(pdata, Perturb.change_location, wrapped_pp)

First sample before and after pertubation:
Michael had fun traveling to Mexico
Michael had fun traveling to Philippines
Michael had fun traveling to Brazil
Michael had fun traveling to Myanmar
Michael had fun traveling to Sudan
Michael had fun traveling to Nigeria
Michael had fun traveling to Algeria
Michael had fun traveling to Iran
Michael had fun traveling to Turkey
Michael had fun traveling to France
Michael had fun traveling to Uganda

Summary:
Predicting 11 examples
Test cases:      1
Fails (rate):    0 (0.0%)


<IPython.core.display.Javascript object>

## Change to related nouns

In [26]:
import nltk

nltk.download("brown")
nltk.download("punkt")

[nltk_data] Downloading package brown to /home/khuyen/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /home/khuyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<IPython.core.display.Javascript object>

In [27]:
def find_first_noun(text: str):
    spacy_text = nlp(text)
    nouns = [word.text for word in spacy_text if word.tag_ == "NN"]
    if len(nouns) >= 1:
        return nouns[0]
    return nouns

<IPython.core.display.Javascript object>

In [28]:
def change_to_related_nouns(sent: str, num_words: int = 5):
    noun = find_first_noun(sent)
    if noun:
        related_nouns = editor.related_words(sent, noun)[:num_words]
        return [sent.replace(noun, new_word) for new_word in related_nouns]
    return sent

<IPython.core.display.Javascript object>

In [30]:
text = data[0]
find_first_noun(text)

'cake'

<IPython.core.display.Javascript object>

In [31]:
test_invariant(data, change_to_related_nouns, wrapped_pp)

First sample before and after pertubation:
The cake is great.
The game is great.
The movie is great.
The book is great.
The food is great.
The story is great.

Summary:
Predicting 20 examples
Test cases:      4
Fails (rate):    0 (0.0%)


<IPython.core.display.Javascript object>

# Directional Expectation Tests

In [32]:
from checklist.expect import Expect

<IPython.core.display.Javascript object>

## Expect Changes

In [33]:
def changed_pred(orig_pred, pred, orig_conf, conf, labels=None, meta=None):
    return pred != orig_pred


expect_fn = Expect.pairwise(changed_pred)

<IPython.core.display.Javascript object>

In [35]:
t = Perturb.perturb(pdata, Perturb.add_negation)
t.data[0:2]

[['The cake is great.', 'The cake is not great.'],
 ['Michael had fun traveling to Mexico',
  "Michael didn't have fun traveling to Mexico"]]

<IPython.core.display.Javascript object>

In [36]:
test = DIR(**t, expect=expect_fn)
test.run(wrapped_pp)
test.summary()

Predicting 6 examples
Test cases:      3
Fails (rate):    2 (66.7%)

Example fails:
0.5 Anna hates party.
0.1 Anna doesn't hate party.

----
0.7 Michael had fun traveling to Mexico
0.7 Michael didn't have fun traveling to Mexico

----


<IPython.core.display.Javascript object>

## Expect Monotone Decrease

In [37]:
data2 = [
    "The cheesecake is great.",
    "awesome",
    "Michael had fun traveling to Mexico",
    "Anna loves party.",
    "This laptop is very good",
]

pdata2 = list(nlp.pipe(data2))

<IPython.core.display.Javascript object>

In [38]:
monotonic_decreasing = Expect.monotonic(label=1, increasing=False, tolerance=0.1)

<IPython.core.display.Javascript object>

In [42]:
t = Perturb.perturb(pdata2, Perturb.add_negation)

print("First sample before and after pertubation:")
print("\n".join(t.data[0]))
print("\nSummary:")

test = DIR(**t, expect=monotonic_decreasing)
test.run(wrapped_pp)
test.summary()

First sample before and after pertubation:
The cheesecake is great.
The cheesecake is not great.

Summary:
Predicting 8 examples
Test cases:      4
After filtering: 2 (50.0%)
Fails (rate):    1 (50.0%)

Example fails:
0.5 Anna loves party.
0.8 Anna doesn't love party.

----


<IPython.core.display.Javascript object>