<a href="https://colab.research.google.com/github/totminaekaterina/RUSSE-2022-Detoxification/blob/main/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
use_cuda = False

In [None]:
!pip install --upgrade transformers==4.6.0

In [3]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch
from tqdm.auto import tqdm, trange
from nltk.translate.chrf_score import corpus_chrf

In [4]:
def prepare_target_label(model, target_label):
    if target_label in model.config.id2label:
        pass
    elif target_label in model.config.label2id:
        target_label = model.config.label2id.get(target_label)
    elif target_label.isnumeric() and int(target_label) in model.config.id2label:
        target_label = int(target_label)
    else:
        raise ValueError(f'target_label "{target_label}" is not in model labels or ids: {model.config.id2label}.')
    return target_label


def classify_texts(model, tokenizer, texts, second_texts=None, target_label=None, batch_size=32, verbose=False):
    target_label = prepare_target_label(model, target_label)
    res = []
    if verbose:
        tq = trange
    else:
        tq = range
    for i in tq(0, len(texts), batch_size):
        inputs = [texts[i:i+batch_size]]
        if second_texts is not None:
            inputs.append(second_texts[i:i+batch_size])
        inputs = tokenizer(*inputs, return_tensors='pt', padding=True, truncation=True, max_length=512).to(model.device)
        with torch.no_grad():
            preds = torch.softmax(model(**inputs).logits, -1)[:, target_label].cpu().numpy()
        res.append(preds)
    return np.concatenate(res)


def evaluate_style(
    model,
    tokenizer,
    texts,
    target_label=1,  # 1 is toxic, 0 is neutral
    batch_size=32, 
    verbose=False
):
    target_label = prepare_target_label(model, target_label)
    scores = classify_texts(
        model,
        tokenizer,
        texts, 
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    return rotation_calibration(scores, 0.90)


def evaluate_meaning(
    model,
    tokenizer,
    original_texts, 
    rewritten_texts,
    target_label='entailment', 
    bidirectional=True, 
    batch_size=32, 
    verbose=False, 
    aggregation='prod'
):
    target_label = prepare_target_label(model, target_label)
    scores = classify_texts(
        model, tokenizer,
        original_texts, rewritten_texts, 
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    if bidirectional:
        reverse_scores = classify_texts(
            model, tokenizer,
            rewritten_texts, original_texts,
            batch_size=batch_size, verbose=verbose, target_label=target_label
        )
        if aggregation == 'prod':
            scores = reverse_scores * scores
        elif aggregation == 'mean':
            scores = (reverse_scores + scores) / 2
        elif aggregation == 'f1':
            scores = 2 * reverse_scores * scores / (reverse_scores + scores)
        else:
            raise ValueError('aggregation should be one of "mean", "prod", "f1"')
    return scores


def encode_cls(texts, model, tokenizer, batch_size=32, verbose=False):
    results = []
    if verbose:
        tq = trange
    else:
        tq = range
    for i in tq(0, len(texts), batch_size):
        batch = texts[i: i + batch_size]
        with torch.no_grad():
            out = model(**tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(model.device))
            embeddings = out.pooler_output
            embeddings = torch.nn.functional.normalize(embeddings).cpu().numpy()
            results.append(embeddings)
    return np.concatenate(results)


def evaluate_cosine_similarity(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    batch_size=32,
    verbose=False,
):
    scores = (
        encode_cls(original_texts, model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose)
        * encode_cls(rewritten_texts, model=model, tokenizer=tokenizer, batch_size=batch_size, verbose=verbose)
    ).sum(1)
    return rotation_calibration(scores, 1.50)


def evaluate_cola(
    model,
    tokenizer,
    texts,
    target_label=1,
    batch_size=32, 
    verbose=False
):
    target_label = prepare_target_label(model, target_label)
    scores = classify_texts(
        model, tokenizer,
        texts, 
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    return scores


def evaluate_cola_relative(
    model,
    tokenizer,
    original_texts,
    rewritten_texts,
    target_label=1,
    batch_size=32,
    verbose=False,
    maximum=0,
):
    target_label = prepare_target_label(model, target_label)
    original_scores = classify_texts(
        model, tokenizer,
        original_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    rewritten_scores = classify_texts(
        model, tokenizer,
        rewritten_texts,
        batch_size=batch_size, verbose=verbose, target_label=target_label
    )
    scores = rewritten_scores - original_scores
    if maximum is not None:
        scores = np.minimum(0, scores)
    return rotation_calibration(scores, 1.15, px=0)


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def rotation_calibration(data, coef=1.0, px=1, py=1, minimum=0, maximum=1):
    result = (data - px) * coef + py
    if minimum is not None:
        result = np.maximum(minimum, result)
    if maximum is not None:
        result = np.minimum(maximum, result)
    return result


def evaluate_style_transfer(
    original_texts,
    rewritten_texts,
    style_model,
    style_tokenizer,
    meaning_model,
    meaning_tokenizer,
    cola_model,
    cola_tokenizer,
    style_target_label=1,
    batch_size=32,
    verbose=True,
    aggregate=False,
    style_calibration=None,
    meaning_calibration=None,
    fluency_calibration=None,
):
    if verbose: print('Style evaluation')
    accuracy = evaluate_style(
        style_model,
        style_tokenizer,
        rewritten_texts,
        target_label=style_target_label, batch_size=batch_size, verbose=verbose
    )
    if verbose: print('Meaning evaluation')
    similarity = evaluate_cosine_similarity(
        meaning_model,
        meaning_tokenizer,
        original_texts, 
        rewritten_texts,
        batch_size=batch_size, verbose=verbose
    )
    if verbose: print('Fluency evaluation')
    fluency = evaluate_cola_relative(
        cola_model,
        cola_tokenizer,
        rewritten_texts=rewritten_texts,
        original_texts=original_texts,
        batch_size=batch_size, verbose=verbose,
    )

    joint = accuracy * similarity * fluency
    if verbose and (style_calibration or meaning_calibration or fluency_calibration):
        print('Scores:')
        print(f'Style transfer accuracy (STA):  {np.mean(accuracy)}')
        print(f'Meaning preservation (SIM):     {np.mean(similarity)}')
        print(f'Fluency score (FL):             {np.mean(fluency)}')
        print(f'Joint score (J):                {np.mean(joint)}')

    result = dict(
        accuracy=accuracy,
        similarity=similarity,
        fluency=fluency,
        joint=joint
    )
    if aggregate:
        return {k: float(np.mean(v)) for k, v in result.items()}
    return result

In [5]:
def evaluate(original, preds, batch_size):
    return evaluate_style_transfer(
        original_texts=original,
        rewritten_texts=preds,
        style_model=style_model,
        style_tokenizer=style_tokenizer,
        meaning_model=meaning_model,
        meaning_tokenizer=meaning_tokenizer,
        cola_model=fluency_model,
        cola_tokenizer=fluency_tolenizer,
        style_target_label=0,
        batch_size=batch_size,
        aggregate=True
    )

In [6]:
def load_model(model_name=None, model=None, tokenizer=None,
               model_class=AutoModelForSequenceClassification, use_cuda=True):
    if model is None:
        if model_name is None:
            raise ValueError('Either model or model_name should be provided')
        model = model_class.from_pretrained(model_name)
        if torch.cuda.is_available() and use_cuda:
            model.cuda()
    if tokenizer is None:
        if model_name is None:
            raise ValueError('Either tokenizer or model_name should be provided')
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [None]:
!wget https://raw.githubusercontent.com/totminaekaterina/RUSSE-2022-Detoxification/main/prepared_data/dev_cleaned.csv

In [None]:
!wget https://raw.githubusercontent.com/totminaekaterina/RUSSE-2022-Detoxification/main/prepared_data/test_cleaned.csv

In [None]:
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/dev.tsv

In [10]:
df = pd.read_csv('dev.tsv', sep='\t')
df = df.fillna('')

In [11]:
df.head(2)

Unnamed: 0,toxic_comment,neutral_comment1,neutral_comment2,neutral_comment3
0,пиздеж! температуры горения хватит чтобы её ра...,Враньё! Температуры горения хватит чтобы ее ра...,"неправда,температуры горения хватит чтобы расп...",Враньё! Температуры горения хватит на чтобы её...
1,а ты чмо там был.ты вообще служил.гандон,А ты там был? Ты вообще служил?,,


In [12]:
toxic_inputs = df['toxic_comment'].tolist()

neutral_references = []
for index, row in df.iterrows():
    neutral_references.append([row['neutral_comment1'], row['neutral_comment2'], row['neutral_comment3']])

In [13]:
df_dev = pd.read_csv('dev_cleaned.csv',sep='\t')
preds = df_dev['neutral_comment'].tolist()

In [14]:
print(preds[:5])

['Теплые температуры горения хватит чтобы её расплавить', 'а ты что там был.ты вообще служил', 'а сам где кормишься', 'этому сайту я давно не доверяю пишут разную ерунду', 'у меня сегодня подобный звонок был. достали']


In [15]:
df_dev.head()

Unnamed: 0,neutral_comment
0,Теплые температуры горения хватит чтобы её рас...
1,а ты что там был.ты вообще служил
2,а сам где кормишься
3,этому сайту я давно не доверяю пишут разную ер...
4,у меня сегодня подобный звонок был. достали


#Style Transfer Accuracy (STA)

In [16]:
style_model, style_tokenizer = load_model('SkolkovoInstitute/russian_toxicity_classifier', use_cuda=use_cuda)

Downloading:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/585 [00:00<?, ?B/s]

In [17]:
accuracy = evaluate_style(
    model = style_model,
    tokenizer = style_tokenizer,
    texts = preds,
    target_label=0,
    batch_size=32, 
    verbose=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

In [18]:
print(f'Style transfer accuracy (STA):  {np.mean(accuracy)}')

Style transfer accuracy (STA):  0.7891517877578735


#Meaning Preservation Score (SIM)

In [19]:
meaning_model, meaning_tokenizer = load_model('cointegrated/LaBSE-en-ru', use_cuda=use_cuda, model_class=AutoModel)

Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/516M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

In [20]:
similarity = evaluate_cosine_similarity(
    model = meaning_model,
    tokenizer = meaning_tokenizer,
    original_texts = toxic_inputs,
    rewritten_texts = preds,
    batch_size=32,
    verbose=True,
    )

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [21]:
print(f'Meaning preservation (SIM):  {np.mean(similarity)}')

Meaning preservation (SIM):  0.7298054695129395


#Fluency score (FL)

In [22]:
fluency_model, fluency_tolenizer = load_model('SkolkovoInstitute/rubert-base-corruption-detector', use_cuda=use_cuda)

Downloading:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/712M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

In [23]:
fluency = evaluate_cola_relative(
    model = fluency_model,
    tokenizer = fluency_tolenizer,
    original_texts = toxic_inputs,
    rewritten_texts = preds,
    target_label=1,
    batch_size=32,
    verbose=True
)

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [24]:
print(f'Fluency score (FL):  {np.mean(fluency)}')

Fluency score (FL):  0.7285358905792236


#Joint score (J)

In [25]:
joint = accuracy * similarity * fluency

In [26]:
print(f'Joint score (J):   {np.mean(joint)}')

Joint score (J):   0.4215254485607147


#ChrF1 with references

In [None]:
!pip install sacrebleu

In [76]:
from sacrebleu.metrics import CHRF

In [30]:
results = evaluate(toxic_inputs, preds, batch_size = 32)

Style evaluation


  0%|          | 0/25 [00:00<?, ?it/s]

Meaning evaluation


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Fluency evaluation


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [74]:
with open('results.md', 'w') as f:
  f.writelines('| Model | ACC | SIM | FL | J | ChrF1 |\n')
  f.writelines('| ----- | --- | --- | -- | - | ---- |\n')

In [32]:
print(neutral_references[0])

['Враньё! Температуры горения хватит чтобы ее расплавить', 'неправда,температуры горения хватит чтобы расплавить её', 'Враньё! Температуры горения хватит на чтобы её расплавить полностью.']


In [33]:
len(neutral_references) == len(preds)

True

In [35]:
chrf = CHRF()

In [36]:
print(neutral_references[0])

['Враньё! Температуры горения хватит чтобы ее расплавить', 'неправда,температуры горения хватит чтобы расплавить её', 'Враньё! Температуры горения хватит на чтобы её расплавить полностью.']


In [37]:
print(preds[0])

Теплые температуры горения хватит чтобы её расплавить


In [54]:
chrf.corpus_score(str(neutral_references), preds)

chrF2 = 55.56

In [75]:
with open('results.md', 'a') as res_file:
  res_file.writelines(f"{'e.totmina_model'}|{results['accuracy']:.4f}|{results['similarity']:.4f}|"
  f"{results['fluency']:.4f}|{results['joint']:.4f}|{chrf.corpus_score(str(neutral_references), preds)}|\n")