<a href="https://colab.research.google.com/github/totminaekaterina/RUSSE-2022-Detoxification/blob/main/prepare_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install --upgrade transformers==4.6.0

Collecting transformers==4.6.0
  Downloading transformers-4.6.0-py3-none-any.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 5.3 MB/s 
Collecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.1 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.8 sacremoses-0.0.47 tokenizers-0.10.3 transformers-4.6.0


In [6]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [24]:
import json
from collections import Counter
import re
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
import random

In [25]:
# Upper bound on the tokenized sequence length: get_similarities passes it to
# the tokenizer as max_length together with truncation=True.
MAX_LENGTH = 200


def get_word_tokens(text):
    """Split ``text`` into lowercase word tokens.

    Every character that is neither a word character nor whitespace is
    removed first; the remainder is split on whitespace and lowercased.

    Args:
        text: string to tokenize.

    Returns:
        List of lowercase tokens (empty list for an empty string).
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    return [word.lower() for word in without_punctuation.split()]


def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings along dim 1, weighting by the attention mask.

    Args:
        model_output: indexable model result; element 0 is taken as the
            token embeddings.
        attention_mask: mask tensor; a trailing dimension is appended and it
            is expanded to the embeddings' size, so positions with mask 0
            contribute nothing to the average.

    Returns:
        The mask-weighted sum of embeddings over dim 1 divided by the mask
        sum over dim 1.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    weighted_total = (embeddings * mask).sum(dim=1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total


def get_similarities(model, tokenizer, input_texts, output_texts):
    """Cosine similarity between mean-pooled embeddings of two text inputs.

    Both inputs are tokenized with padding and truncation to ``MAX_LENGTH``,
    moved to ``model.device``, encoded by ``model`` without gradient
    tracking, mean-pooled with their attention masks and moved to the CPU.

    Args:
        model: encoder called as ``model(**encoded)``; must expose ``device``.
        tokenizer: callable tokenizer returning PyTorch tensors.
        input_texts: text(s) for the first side of the comparison.
        output_texts: text(s) for the second side of the comparison.

    Returns:
        The result of ``cosine_similarity(input_embeddings, output_embeddings)``.
    """
    tokenizer_options = dict(padding=True, truncation=True,
                             max_length=MAX_LENGTH, return_tensors="pt")

    # Tokenize both sides (input first, then output).
    batches = [tokenizer(texts, **tokenizer_options).to(model.device)
               for texts in (input_texts, output_texts)]

    # Encode without building a computation graph.
    with torch.no_grad():
        encoded = [model(**batch) for batch in batches]

    # Mean pooling over tokens, then back to the CPU for the similarity step.
    pooled_input, pooled_output = (
        mean_pooling(result, batch["attention_mask"]).cpu()
        for result, batch in zip(encoded, batches)
    )
    return cosine_similarity(pooled_input, pooled_output)


def get_rougel(input_text, output_text):
    """Return the ROUGE-L F-score for a pair of texts.

    Args:
        input_text: passed as the first argument of ``Rouge.get_scores``.
        output_text: passed as the second argument of ``Rouge.get_scores``.

    Returns:
        The ``["rouge-l"]["f"]`` value of the first score entry, or ``0.0``
        when scoring raises ``ValueError``.
    """
    rouge = Rouge()
    try:
        return rouge.get_scores(input_text, output_text)[0]["rouge-l"]["f"]
    except ValueError:
        # Raised for an empty output or one that is just a dot (seen in dev_sents).
        return 0.0


def set_random_seed(seed_val):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed_val: seed value handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_val)

In [None]:
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/train.tsv
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/dev.tsv
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/test.tsv

In [27]:
# Location of the downloaded .tsv files.
# NOTE(review): assumes the download cell saved them into /content — confirm
# when running outside Colab.
DATA_DIR = Path("/content")

# Input files. TRAIN_DIR holds the path of the training *file* despite its name.
TRAIN_DIR = DATA_DIR.joinpath("train.tsv")
DEV_PATH = DATA_DIR.joinpath("dev.tsv")
TEST_PATH = DATA_DIR.joinpath("test.tsv")

# Directory receiving the CSVs written by this notebook; created if missing.
OUTPUT_DIR = DATA_DIR.joinpath("prepared_data")
OUTPUT_DIR.mkdir(exist_ok=True)

# model for embeddings

In [None]:
# One checkpoint name is shared by the tokenizer and the encoder so the two
# cannot drift apart.
EMBEDDING_MODEL_NAME = "sberbank-ai/sbert_large_nlu_ru"

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

# validation dataset

In [33]:
# Load the dev split and keep only the toxic sentence with its first neutral rewrite.
dev_df = pd.read_csv(DEV_PATH, sep="\t").drop(columns=["neutral_comment2", "neutral_comment3"])
dev_df.columns = ["toxic_comment", "neutral_comment1"]

# get_similarities returns a similarity matrix for every row; with a single
# text on each side the pair's score is its [0][0] entry.
dev_sim_matrices = dev_df.apply(
    lambda row: get_similarities(model, tokenizer, row["toxic_comment"], row["neutral_comment1"]),
    axis=1,
)
dev_df["cosine_sim"] = dev_sim_matrices.apply(lambda matrix: matrix[0][0])

# ROUGE-L F-score between the toxic sentence and its rewrite.
dev_df["rouge_l"] = dev_df.apply(
    lambda row: get_rougel(row["toxic_comment"], row["neutral_comment1"]),
    axis=1,
)

# Lengths in word tokens (punctuation stripped by get_word_tokens).
dev_df["input_len"] = dev_df["toxic_comment"].apply(lambda text: len(get_word_tokens(text)))
dev_df["output_len"] = dev_df["neutral_comment1"].apply(lambda text: len(get_word_tokens(text)))

dev_df.to_csv(OUTPUT_DIR / "dev_df_metrics.csv", index=False)

# train dataset

In [44]:
# Load the train split; drop the index column and the alternative rewrites so
# only the toxic sentence and its first neutral rewrite remain.
train_df = pd.read_csv(TRAIN_DIR, sep="\t").drop(
    columns=["index", "neutral_comment2", "neutral_comment3"]
)
train_df.columns = ["toxic_comment", "neutral_comment1"]

# get_similarities returns a similarity matrix for every row; with a single
# text on each side the pair's score is its [0][0] entry.
train_sim_matrices = train_df.apply(
    lambda row: get_similarities(model, tokenizer, row["toxic_comment"], row["neutral_comment1"]),
    axis=1,
)
train_df["cosine_sim"] = train_sim_matrices.apply(lambda matrix: matrix[0][0])

# ROUGE-L F-score between the toxic sentence and its rewrite.
train_df["rouge_l"] = train_df.apply(
    lambda row: get_rougel(row["toxic_comment"], row["neutral_comment1"]),
    axis=1,
)

# Lengths in word tokens (punctuation stripped by get_word_tokens).
train_df["input_len"] = train_df["toxic_comment"].apply(lambda text: len(get_word_tokens(text)))
train_df["output_len"] = train_df["neutral_comment1"].apply(lambda text: len(get_word_tokens(text)))

# select data

In [45]:
# Row filter for the training pairs (all bounds on the scores are strict):
#   * 0.6 < cosine_sim < 0.99
#   * 0.1 < rouge_l < 0.8
#   * the rewrite has no more word tokens than the toxic input
# NOTE(review): presumably the bounds drop near-copies and unrelated pairs — confirm.
selection_mask = (
    (train_df["cosine_sim"] < 0.99)
    & (train_df["cosine_sim"] > 0.6)
    & (train_df["rouge_l"] < 0.8)
    & (train_df["rouge_l"] > 0.1)
    & (train_df["output_len"] <= train_df["input_len"])
)
train_df_part = train_df[selection_mask]

# Saved under its own file name: train_df_metrics.csv is the name the full
# metrics table is exported to, so sharing that path would lose this selection.
train_df_part[["toxic_comment", "neutral_comment1"]].to_csv(OUTPUT_DIR / "train_df_part.csv", index=False)

In [47]:
# Save the full, unfiltered training frame together with its metric columns.
train_df.to_csv(OUTPUT_DIR / "train_df_metrics.csv", index=False)