# Bad explanations

In [1]:
import json
import numpy as np
import sys
from IPython.display import display, HTML
from pathlib import Path


sys.path.append("..")

from experiments.datasets import HateXplain
from experiments.explainers import LimeExplainer
from experiments.metrics import alternative_auprc
from experiments.models import DistilBertLogisticRegression

In [2]:
# Identify the experiment run analysed by this notebook.
date = "2023-03-14-22-37-07"
experiment = "hatexplain-lime-distilbert-2"

# Paths are resolved from the current working directory:
#   <parent of cwd>/data/experiments/<date>/<experiment>/results.jsonl
here_path = Path.cwd()
experiments_path = here_path.parent.joinpath("data", "experiments")
experiment_path = experiments_path.joinpath(date, experiment)
results_path = experiment_path.joinpath("results.jsonl")

In [3]:
# Load every experiment result from the JSON Lines file (one JSON document
# per line). The encoding is pinned so parsing does not depend on the
# platform default, and blank lines (e.g. a trailing empty line) are skipped
# instead of raising json.JSONDecodeError.
results = []
with open(results_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            results.append(json.loads(line))

In [4]:
def select_results_with_weight(weight):
    """Return the first entry of the global ``results`` whose ``'weight'`` equals ``weight``.

    Entries are scanned in order and compared with ``==``. ``None`` is
    returned when no entry matches.
    """
    matches = (entry for entry in results if entry['weight'] == weight)
    return next(matches, None)

In [5]:
# Select the result whose mean per-sample alternative AUPRC is highest.
# Missing (None) and NaN entries are ignored when averaging.
greatest_alternative_auprc = float("-inf")
preferred_result = None
for result in results:
    valid_scores = [
        x for x in result['explainability']['alternative_auprc']
        if x is not None and not np.isnan(x)
    ]
    if not valid_scores:
        # np.mean([]) would be NaN (with a RuntimeWarning) and could never be
        # selected by the comparison below, so skip such results explicitly.
        continue
    mean_score = np.mean(valid_scores)
    if mean_score > greatest_alternative_auprc:
        greatest_alternative_auprc = mean_score
        preferred_result = result

if preferred_result is None:
    # Fail here with a clear message instead of a TypeError on the print below.
    raise ValueError("no result contains a valid alternative_auprc value")

print(greatest_alternative_auprc)
print(preferred_result['weight'])

0.821847995830308
0.19537185696468617


In [6]:
# "good": the result selected for its highest mean alternative AUPRC.
good_result = preferred_result
# "bad": the first loaded result whose stored weight equals 1.
bad_result = select_results_with_weight(weight=1)

In [7]:
# Build the HateXplain dataset with shuffling enabled and a fixed seed.
# NOTE(review): the meaning of negative_rationales=2 is defined by the
# HateXplain class, which is not visible here; confirm before changing it.
dataset_options = {
    "negative_rationales": 2,
    "shuffle": True,
    "random_state": 42,
}
dataset = HateXplain(**dataset_options)

In [None]:
# Checkpoint name and auth flag; both are reused when building model_2.
model_name = "visual-ds/distilbert-base-uncased-hatexplain"
use_auth_token = True

# First model: built with cross_val=True and an initial lr_C of 1.0.
model_1_options = dict(
    model_name=model_name,
    device=0,
    batch_size=128,
    random_state=46,
    lr_tol=1e-4,
    lr_C=1.0,
    lr_max_iter=1e3,
    n_jobs=1,
    cross_val=True,
    use_auth_token=use_auth_token,
)
model_1 = DistilBertLogisticRegression(**model_1_options)

# NOTE(review): the two positional floats are presumably objective weights
# (1.0 and 0.0 here); confirm against DistilBertLogisticRegression.fit.
model_1.fit(dataset.train_dataset, 1.0, 0.0)

In [None]:
# Second model: same settings as model_1, except that cross_val is off and
# lr_C is taken from model_1's classifier (model_1.clf.C).
model_2_options = dict(
    model_name=model_name,
    device=0,
    batch_size=128,
    random_state=46,
    lr_tol=1e-4,
    lr_C=model_1.clf.C,
    lr_max_iter=1e3,
    n_jobs=1,
    cross_val=False,
    use_auth_token=use_auth_token,
)
model_2 = DistilBertLogisticRegression(**model_2_options)

# Fit with the weight of the selected result and its complement to 1.
good_weight = good_result['weight']
model_2.fit(dataset.train_dataset, good_weight, 1.0 - good_weight)

In [10]:
# good_alternative_auprc = good_result['explainability']['alternative_auprc']
# good_alternative_auprc = [x if x is not None and not np.isnan(x) else 0 for x in good_alternative_auprc]

# bad_alternative_auprc = bad_result['explainability']['alternative_auprc']
# bad_alternative_auprc = [x if x is not None and not np.isnan(x) else 0 for x in bad_alternative_auprc]

# difference = np.array(good_alternative_auprc) - np.array(bad_alternative_auprc)
# suggestions = np.argsort(difference)[::-1]

In [11]:
def explanation_in_html(scores, sample):
    """Render a sample's tokens as HTML, shading each token by its score.

    Scores are normalised by the largest absolute score. A non-negative score
    blends the token background from white towards MediumSeaGreen, a negative
    score from white towards Crimson; the blend strength is the normalised
    magnitude. Tokens are paired with scores positionally and pairing stops at
    the shorter of the two.

    Parameters
    ----------
    scores : array-like of float
        One score per token. NaN entries are treated as 0 (no shading).
    sample : mapping
        Must provide ``sample['tokens']``, an iterable of tokens.

    Returns
    -------
    str
        Space-separated ``<span>`` elements, one per token, wrapped in an
        outer ``<span>`` with black text on a white background.
    """
    # Local import: the name ``html`` is commonly used for strings in notebooks,
    # so only the needed function is brought in under an unambiguous name.
    from html import escape

    scores = np.nan_to_num(np.asarray(scores, dtype=float))
    max_abs_score = np.max(np.abs(scores)) if scores.size else 0.0
    if max_abs_score > 0:
        scores = scores / max_abs_score
    # When every score is 0 there is nothing to normalise (and dividing would
    # yield NaN colours); all tokens are then rendered on plain white.

    green = np.array([60, 179, 113])  # MediumSeaGreen
    white = np.array([255, 255, 255])  # White
    red = np.array([220, 20, 60])  # Crimson

    spans = []
    for token, score in zip(sample['tokens'], scores):
        target = green if score >= 0 else red
        strength = abs(score)
        blended = strength * target + (1 - strength) * white
        # Emit integer channels so the CSS is valid in every renderer.
        r, g, b = (int(channel) for channel in np.rint(blended))
        # Escape the token so text such as "<user>" or "&" is shown literally
        # instead of being interpreted as markup.
        spans.append(
            f"<span style=\"background-color:rgb({r},{g},{b})\">"
            f"{escape(str(token))}</span>"
        )

    body = " ".join(spans)
    return f"<span style=\"color:Black;background-color:White;\">{body}</span>"

In [12]:
def rescale_explanation(scores):
    """Replace scores by evenly spaced values that preserve their rank order.

    Negative scores are first clipped to 0. The clipped scores are then ranked
    and the k-th smallest one is replaced by the k-th point of an evenly
    spaced grid running from the clipped minimum to the clipped maximum.
    Ties are ranked by position (stable sort), so the result is deterministic.

    Parameters
    ----------
    scores : array-like of float
        One-dimensional sequence of scores. It is not modified.

    Returns
    -------
    numpy.ndarray
        Float array with the same length as ``scores``; empty input yields an
        empty array.
    """
    # np.clip returns a new array, so the caller's data is left untouched.
    clipped = np.clip(np.asarray(scores, dtype=float), 0, None)
    length = len(clipped)
    if length == 0:
        # np.max / np.min raise on empty input; there is nothing to rescale.
        return np.zeros(0)

    steps = np.linspace(np.min(clipped), np.max(clipped), length)
    # A stable sort keeps tied scores in their original relative order.
    order = np.argsort(clipped, kind="stable")

    rescaled_scores = np.zeros(length)
    rescaled_scores[order] = steps
    return rescaled_scores

In [13]:
# Explain one fixed test sample (index 688) with both models, using a single
# LIME explainer instance for the two calls.
sample = dataset.test_dataset[688]
explainer = LimeExplainer(num_samples=1000, random_state=46, split_token=dataset.split_token)

# model_1: explain, rank-rescale, then show the raw values and the shaded tokens.
bad_explanation = rescale_explanation(explainer.explain_sample(model_1, sample))
print(bad_explanation)
display(HTML(explanation_in_html(bad_explanation, sample)))

# model_2: same steps on the same sample.
good_explanation = rescale_explanation(explainer.explain_sample(model_2, sample))
print(good_explanation)
display(HTML(explanation_in_html(good_explanation, sample)))

In [14]:
def _valid_or_zero(values):
    """Return ``values`` with missing (None) or NaN entries replaced by 0."""
    return [x if x is not None and not np.isnan(x) else 0 for x in values]


def _latex_colorboxes(tokens, mask):
    r"""Wrap each token in ``\colorbox``: ``white`` where the mask is 0, ``movie`` otherwise."""
    return "".join(
        r"\colorbox{white}{" + token + "}" if flag == 0 else
        r"\colorbox{movie}{" + token + "}"
        for token, flag in zip(tokens, mask)
    )


def _show_model_explanation(title, model, explanation, auprc, sample, n_rationales):
    """Print and display one model's prediction and explanation for ``sample``.

    Shows, in order: the AUPRC and predicted label, the rank-rescaled
    explanation, the explanation binarised to its ``n_rationales`` top-ranked
    tokens, and the LaTeX ``\\colorbox`` rendering of that binary mask.
    Returns the binary mask.
    """
    probabilities = model.predict_proba([sample])[0]
    predicted_label = model.classes[np.argmax(probabilities)]
    print(f"{title} explanation: {auprc}; prediction: {predicted_label}")

    rescaled = rescale_explanation(explanation)
    display(HTML(explanation_in_html(rescaled, sample)))

    # Keep only the n_rationales highest-ranked tokens (1) and zero the rest.
    ranking = np.argsort(rescaled)[::-1]
    rescaled[ranking[:n_rationales]] = 1
    rescaled[ranking[n_rationales:]] = 0
    display(HTML(explanation_in_html(rescaled, sample)))

    print(_latex_colorboxes(sample['tokens'], rescaled))
    return rescaled


# Per-sample AUPRC stored in the two result records; missing/NaN count as 0.
good_auprc = _valid_or_zero(good_result['explainability']['alternative_auprc'])
bad_auprc = _valid_or_zero(bad_result['explainability']['alternative_auprc'])
auprc_diff = np.array(good_auprc) - np.array(bad_auprc)

# Visit test samples from the largest to the smallest stored AUPRC gain.
for i in map(int, np.argsort(auprc_diff)[::-1]):
    sample = dataset.test_dataset[i]
    if sample['label'] == "normal":
        continue

    # NOTE(review): the explainer is deliberately re-created for every sample.
    # If LimeExplainer consumes its random_state across calls, hoisting it out
    # of the loop would change the explanations; confirm before moving it.
    explainer = LimeExplainer(
        num_samples=1000,
        random_state=46,
        split_token=dataset.split_token,
    )

    # Both explanations are computed and scored before anything is displayed.
    # The per-sample scores use their own names so the lists above keep their
    # meaning after the loop.
    bad_explanation = explainer.explain_sample(model_1, sample)
    sample_bad_auprc = alternative_auprc(bad_explanation, sample['rationales'], None, None, None, None)
    good_explanation = explainer.explain_sample(model_2, sample)
    sample_good_auprc = alternative_auprc(good_explanation, sample['rationales'], None, None, None, None)

    # Optional filter (disabled): skip samples without a clear improvement.
    # if sample_good_auprc - sample_bad_auprc <= 0.5:
    #     print(f"Sample {i} has no good explanation: {sample_good_auprc}, {sample_bad_auprc}")
    #     continue

    print(f"Sample {i}, label {sample['label']}")
    display(HTML(dataset.sample_in_html(sample)))
    # Cast to int because the count is used as a slice bound.
    n_rationales = int(sum(sample['rationales']))

    bad_explanation = _show_model_explanation(
        "Bad", model_1, bad_explanation, sample_bad_auprc, sample, n_rationales
    )
    good_explanation = _show_model_explanation(
        "Good", model_2, good_explanation, sample_good_auprc, sample, n_rationales
    )