In [1]:
from IPython.display import clear_output

In [2]:
!pip install -U gensim numpy torch transformers ufal.udpipe
clear_output()

In [32]:
import random
from typing import List, Optional
from warnings import filterwarnings

import gensim
import numpy as np
import torch
from sklearn.metrics import pairwise
from tqdm.notebook import tqdm
from transformers import BertForSequenceClassification, BertTokenizer
from ufal.udpipe import Model, Pipeline

In [33]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides warnings that may
# point at real problems (deprecations, numerical RuntimeWarnings, ...) —
# consider narrowing it to the specific categories that are known noise.
filterwarnings("ignore")

In [4]:
# Seed the Python, NumPy and PyTorch random generators with one fixed value.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
!wget http://vectors.nlpl.eu/repository/20/213.zip
!wget https://rusvectores.org/static/models/udpipe_syntagrus.model
!mkdir ru_fasttext
!unzip 213.zip -d ru_fasttext
!rm -rf 213.zip

--2022-03-21 13:19:00--  http://vectors.nlpl.eu/repository/20/213.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1485270300 (1.4G) [application/zip]
Saving to: ‘213.zip’


2022-03-21 13:20:05 (22.0 MB/s) - ‘213.zip’ saved [1485270300/1485270300]

--2022-03-21 13:20:05--  https://rusvectores.org/static/models/udpipe_syntagrus.model
Resolving rusvectores.org (rusvectores.org)... 116.203.104.23
Connecting to rusvectores.org (rusvectores.org)|116.203.104.23|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40616122 (39M)
Saving to: ‘udpipe_syntagrus.model’


2022-03-21 13:20:08 (17.0 MB/s) - ‘udpipe_syntagrus.model’ saved [40616122/40616122]

Archive:  213.zip
  inflating: ru_fasttext/meta.json   
  inflating: ru_fasttext/model.model  
  inflating: ru_fasttext/model.model.vectors_ngrams.npy  
  inflating: ru_fas

In [5]:
def classify_texts(
    model: BertForSequenceClassification,
    tokenizer: BertTokenizer,
    texts: List[str],
    batch_size: int = 32,
    desc: Optional[str] = None,
) -> np.ndarray:
    """
    Computes confidences of a BERT-based text classifier
    for a list of texts.

    The texts are tokenized and scored batch by batch on the device the
    model lives on; gradients are disabled during the forward passes.

    Parameters:
        model: BertForSequenceClassification
            classifier whose logits are turned into probabilities
        tokenizer: BertTokenizer
            tokenizer used to encode the texts for the model
        texts: List[str]
            texts to score; may be empty
        batch_size: int
            number of texts per forward pass, must be positive
        desc: str or None
            label shown next to the progress bar

    Returns:
        np.ndarray
            one value per text: the softmax probability
            of the class with index 1

    Raises:
        ValueError: if batch_size is not positive
    """
    if batch_size < 1:
        # A zero step makes range() fail and a negative one yields no batches
        # at all; report the real cause instead of an obscure downstream error.
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    if len(texts) == 0:
        # np.concatenate raises ValueError on an empty list of arrays,
        # so an empty input is answered with an empty result directly.
        return np.array([], dtype=np.float32)

    ans = []

    for i in tqdm(range(0, len(texts), batch_size), desc=desc):
        # Pad to the longest text of the batch and cut everything
        # beyond 512 tokens.
        batch = tokenizer(
            texts[i : i + batch_size],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(model.device)
        with torch.no_grad():
            probs = torch.softmax(model(**batch).logits, -1)
        # Keep only the probability of class 1 for every text of the batch.
        ans.append(probs[:, 1].cpu().numpy())

    return np.concatenate(ans)

In [12]:
def style_transfer_accuracy(
    preds: List[str], batch_size: int = 32
) -> np.ndarray:
    """
    Scores every prediction with the complement of the confidence
    returned by the toxicity classifier.

    Parameters:
        preds: List[str]
            model outputs to evaluate
        batch_size: int
            number of texts classified per batch

    Returns:
        np.ndarray
            for each prediction, one minus the score
            computed by classify_texts
    """
    checkpoint = "SkolkovoInstitute/russian_toxicity_classifier"
    toxicity_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    toxicity_model = BertForSequenceClassification.from_pretrained(checkpoint)
    toxicity_model = toxicity_model.to(DEVICE)

    toxicity = classify_texts(
        toxicity_model,
        toxicity_tokenizer,
        preds,
        batch_size=batch_size,
        desc="Calculating predictions' toxicity...",
    )
    # The complement is taken element by element on purpose: this keeps the
    # dtype of the result exactly as NumPy's scalar arithmetic produces it.
    return np.array([1 - score for score in toxicity])

In [17]:
print("Loading FastText model...")
model = gensim.models.KeyedVectors.load("ru_fasttext/model.model")
print("Loading UDPipe model...")
model_udpipe = Model.load("udpipe_syntagrus.model")
# Model.load reports a failure by returning a falsy value instead of raising,
# so check it here rather than crash later inside the pipeline.
if not model_udpipe:
    raise RuntimeError(
        "Cannot load UDPipe model from 'udpipe_syntagrus.model'"
    )
# Tokenize raw text, run the model's default tagger and parser,
# and emit the result in CoNLL-U format.
pipeline = Pipeline(
    model_udpipe, "tokenize", Pipeline.DEFAULT, Pipeline.DEFAULT, "conllu"
)

Loading FastText model...
Loading UDPipe model...


In [21]:
def get_sentence_vector(text: str, model, pipeline) -> np.ndarray:
    """
    Builds one vector for a text by averaging the embeddings of its tokens.

    The text is run through the pipeline, whose output is read as
    tab-separated rows (presumably CoNLL-U — verify against the pipeline
    passed by the caller). Rows whose fourth column is "PUNCT" are dropped,
    the third column of every remaining row is looked up in the model and
    the resulting vectors are averaged.

    Parameters:
        text: str
        model: a model used for word embeddings, indexed as model[token]
        pipeline: pipeline used for text preprocessing,
            called as pipeline.process(text)

    Returns:
        np.ndarray
            the averaged vector as a single-row 2-D array
            with NaN values replaced by zeros
    """
    kept = []

    for row in pipeline.process(text).split("\n"):
        # Blank separator lines and "#" comment lines carry no token.
        if not row or row.startswith("#"):
            continue
        columns = row.split("\t")
        if columns[3] == "PUNCT":
            continue
        kept.append(columns[2])

    vectors = [model[item] for item in kept]
    averaged = np.mean(vectors, axis=0)
    return np.nan_to_num(averaged.reshape(1, -1))

In [49]:
def cosine_similarity(
    inputs: List[str],
    preds: List[str],
    model=None,
    pipeline=None,
) -> np.ndarray:
    """
    Computes cosine similarities between vectors of texts' embeddings.

    Texts are compared pairwise: inputs[i] against preds[i]. If the lists
    differ in length, only the pairs produced by zip are scored.

    Parameters:
        inputs: List[str]
        preds: List[str]
        model: word embedding model or None
            when None, the FastText vectors are loaded
            from "ru_fasttext/model.model"
        pipeline: text preprocessing pipeline or None
            when None, a UDPipe pipeline is built
            from "udpipe_syntagrus.model"

    Returns:
        np.ndarray
            one similarity per scored pair; 0.0 for a pair
            whose similarity cannot be computed

    Raises:
        RuntimeError: if the UDPipe model file cannot be loaded
    """
    # Loading is skipped when the caller hands in already loaded objects,
    # which avoids re-reading the large vector file on every call.
    if model is None:
        print("Loading FastText model...")
        model = gensim.models.KeyedVectors.load("ru_fasttext/model.model")

    if pipeline is None:
        print("Loading UDPipe model...")
        model_udpipe = Model.load("udpipe_syntagrus.model")
        # Model.load reports a failure by returning a falsy value.
        if not model_udpipe:
            raise RuntimeError(
                "Cannot load UDPipe model from 'udpipe_syntagrus.model'"
            )
        pipeline = Pipeline(
            model_udpipe,
            "tokenize",
            Pipeline.DEFAULT,
            Pipeline.DEFAULT,
            "conllu",
        )

    ans = []

    # zip has no length, so the number of pairs is passed explicitly;
    # otherwise the progress bar cannot show a percentage.
    for text_1, text_2 in tqdm(
        zip(inputs, preds),
        total=min(len(inputs), len(preds)),
        desc="Calculating cosine similarities...",
    ):
        try:
            ans.append(
                pairwise.cosine_similarity(
                    get_sentence_vector(text_1, model, pipeline),
                    get_sentence_vector(text_2, model, pipeline),
                )[0][0]
            )
        except ValueError:
            # Deliberate best effort: a pair that cannot be compared
            # (e.g. the two vectors differ in width because one text
            # yielded no tokens) counts as completely dissimilar.
            ans.append(0.0)

    return np.array(ans)

In [35]:
def fluency_score(
    inputs: List[str], preds: List[str], batch_size: int = 32
) -> np.ndarray:
    """
    Rates how the fluency of each prediction compares
    to the fluency of its original sentence.

    Both lists are scored with the same classifier; the score is
    1 + 1.15 * (prediction score - original score), limited to [0, 1].

    Parameters:
        inputs: List[str]
            original sentences
        preds: List[str]
            predicted sentences, parallel to inputs
        batch_size: int
            number of texts classified per batch

    Returns:
        np.ndarray
            one fluency score in [0, 1] per sentence pair
    """
    checkpoint = "SkolkovoInstitute/rubert-base-corruption-detector"
    fluency_tokenizer = BertTokenizer.from_pretrained(checkpoint)
    fluency_model = BertForSequenceClassification.from_pretrained(checkpoint)
    fluency_model = fluency_model.to(DEVICE)

    source_scores = classify_texts(
        fluency_model,
        fluency_tokenizer,
        inputs,
        batch_size=batch_size,
        desc="Calculating original sentences' fluency...",
    )
    output_scores = classify_texts(
        fluency_model,
        fluency_tokenizer,
        preds,
        batch_size=batch_size,
        desc="Calculating predictions' fluency...",
    )

    # Scale the difference, shift it by one, then bound it from below by 0
    # and from above by 1.
    rescaled = (output_scores - source_scores) * 1.15 + 1
    return np.minimum(1, np.maximum(0, rescaled))

In [7]:
!wget https://raw.githubusercontent.com/vyhuholl/russian_detoxification/master/data/test.txt

--2022-03-21 13:07:31--  https://raw.githubusercontent.com/vyhuholl/russian_detoxification/master/data/test.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1941971 (1.9M) [text/plain]
Saving to: ‘test.txt’


2022-03-21 13:07:32 (33.7 MB/s) - ‘test.txt’ saved [1941971/1941971]



In [8]:
!wget https://raw.githubusercontent.com/vyhuholl/russian_detoxification/master/data/preds_delete.txt

--2022-03-21 13:07:36--  https://raw.githubusercontent.com/vyhuholl/russian_detoxification/master/data/preds_delete.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1775173 (1.7M) [text/plain]
Saving to: ‘preds_delete.txt’


2022-03-21 13:07:37 (31.2 MB/s) - ‘preds_delete.txt’ saved [1775173/1775173]



In [10]:
# Read the source sentences and the model outputs, one sentence per line.
# The encoding is stated explicitly so that non-ASCII text is decoded the same
# way on every platform, and the trailing newline is dropped from each line.
with open("test.txt", encoding="utf-8") as inputs_file, open(
    "preds_delete.txt", encoding="utf-8"
) as preds_file:
    inputs = [line.rstrip("\n") for line in inputs_file]
    preds = [line.rstrip("\n") for line in preds_file]

# The metrics are combined element by element, so the files must be parallel.
assert len(inputs) == len(preds), "inputs and predictions differ in length"

In [13]:
# Style transfer accuracy: one score per line of preds.
sta = style_transfer_accuracy(preds)

Calculating predictions' toxicity...:   0%|          | 0/387 [00:00<?, ?it/s]

In [14]:
# Corpus-level style transfer accuracy: mean of the per-prediction scores.
np.mean(sta)

0.34900120309976107

In [50]:
# Content similarity: one cosine similarity per (input, prediction) pair.
cs = cosine_similarity(inputs, preds)

Loading FastText model...
Loading UDPipe model...


Calculating cosine similarities...: 0it [00:00, ?it/s]

In [51]:
 np.mean(cs)

0.9666603768298948

In [36]:
# Fluency: one score per (input, prediction) pair.
fl = fluency_score(inputs, preds)

Downloading:   0%|          | 0.00/1.57M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679M [00:00<?, ?B/s]

Calculating original sentences' fluency...:   0%|          | 0/387 [00:00<?, ?it/s]

Calculating predictions' fluency...:   0%|          | 0/387 [00:00<?, ?it/s]

In [37]:
# Corpus-level fluency: mean of the per-pair fluency scores.
np.mean(fl)

0.83641124

In [52]:
# Joint score: element-wise product of the three per-sentence metrics.
js = sta * cs * fl

In [53]:
# Corpus-level joint score: mean of the per-sentence products.
np.mean(js)

0.25762027191186176