<a href="https://colab.research.google.com/github/totminaekaterina/RUSSE-2022-Detoxification/blob/main/prepare_datasets_with_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the transformers library pinned to version 4.6.0; it provides the
# AutoModel / AutoTokenizer classes imported further down.
!pip install --upgrade transformers==4.6.0

Collecting transformers==4.6.0
  Downloading transformers-4.6.0-py3-none-any.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 15.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 31.0 MB/s 
Collecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.4 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.8 sacremoses-0.0.49 tokenizers-0.10.3 transformers-4.6.0


In [2]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [3]:
import json
from collections import Counter
import re
from pathlib import Path
import pandas as pd
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from rouge import Rouge
from sklearn.metrics.pairwise import cosine_similarity
import random

In [4]:
# Value passed as `max_length` to the tokenizer (together with truncation=True)
# when texts are encoded in get_similarities.
MAX_LENGTH = 200


def get_word_tokens(text):
    """Split ``text`` into lowercase word tokens.

    Every character that is neither a word character (``\\w``) nor whitespace
    is deleted first (so "a-b" becomes "ab"), then the remainder is split on
    whitespace and each piece is lowercased.

    Returns a list of strings; an empty string yields an empty list.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    return [word.lower() for word in without_punctuation.split()]


def mean_pooling(model_output, attention_mask):
    """Mask-weighted mean of token embeddings along dimension 1.

    Args:
        model_output: indexable object whose element 0 holds the token
            embeddings (assumed shape (batch, tokens, hidden) - TODO confirm).
        attention_mask: tensor with one entry per token; it gets a trailing
            axis and is expanded to the embeddings' shape, so positions with
            mask 0 contribute nothing to the sum.

    Returns:
        Tensor of the summed masked embeddings divided by the summed mask.
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = (embeddings * mask).sum(1)
    # Clamp the denominator so an all-zero mask cannot cause a division by zero.
    mask_total = mask.sum(1).clamp(min=1e-9)
    return masked_total / mask_total


def get_similarities(model, tokenizer, input_texts, output_texts):
    """Cosine similarity between mean-pooled embeddings of two texts / text batches.

    Each side is tokenized (padded, truncated to MAX_LENGTH tokens), moved to
    the model's device, run through ``model`` without gradient tracking, and
    mean-pooled using the attention mask. The pooled embeddings are moved to
    the CPU and compared with ``cosine_similarity``.

    Args:
        model: embedding model; called as ``model(**encoded_batch)``.
        tokenizer: tokenizer matching ``model``.
        input_texts: a string or list of strings.
        output_texts: a string or list of strings.

    Returns:
        The matrix produced by ``cosine_similarity(input_emb, output_emb)``;
        for a single pair of strings the score is at ``[0][0]``.
    """
    def _encode(texts):
        # Tokenize and place the batch on the same device as the model.
        return tokenizer(texts, padding=True, truncation=True,
                         max_length=MAX_LENGTH, return_tensors="pt").to(model.device)

    input_batch = _encode(input_texts)
    output_batch = _encode(output_texts)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        input_hidden = model(**input_batch)
        output_hidden = model(**output_batch)

    pooled = [
        mean_pooling(hidden, batch["attention_mask"]).cpu()
        for hidden, batch in ((input_hidden, input_batch), (output_hidden, output_batch))
    ]
    return cosine_similarity(pooled[0], pooled[1])


def get_rougel(input_text, output_text):
    """
    Returns rouge-l f-score

    Args:
        input_text: first text; passed as the first argument of
            ``Rouge.get_scores``.
        output_text: second text; passed as the second argument of
            ``Rouge.get_scores``.

    Returns:
        The ``["rouge-l"]["f"]`` value of the first score entry, or ``0.0``
        when ``Rouge.get_scores`` raises ``ValueError``.
    """
    try:
        # Only the scoring call is guarded; the lookups below cannot raise ValueError.
        scores = Rouge().get_scores(input_text, output_text)
    except ValueError:
        # NOTE(review): presumably raised by the rouge package for empty text -
        # confirm. Such pairs are scored as 0.0 instead of aborting the run.
        return 0.0
    return scores[0]["rouge-l"]["f"]


def set_random_seed(seed_val):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices).

    Args:
        seed_val: integer seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_val)

In [5]:
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/train.tsv
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/dev.tsv
!wget https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/test.tsv

--2022-03-25 02:26:23--  https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/train.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1902888 (1.8M) [text/plain]
Saving to: ‘train.tsv’


2022-03-25 02:26:23 (82.3 MB/s) - ‘train.tsv’ saved [1902888/1902888]

--2022-03-25 02:26:23--  https://raw.githubusercontent.com/skoltech-nlp/russe_detox_2022/main/data/input/dev.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 200691 (196K) [text/plain]
Saving to: ‘dev.tsv’


2022-03-25 02:26:23 (16.8 MB/s) - ‘dev

In [6]:
# Directory the input .tsv files are read from.
DATA_DIR = Path("/content")

# Input files. NOTE: TRAIN_DIR is the path of the training *file*, not a
# directory, despite its name.
TRAIN_DIR, DEV_PATH, TEST_PATH = (
    DATA_DIR / file_name for file_name in ("train.tsv", "dev.tsv", "test.tsv")
)

# Folder receiving the CSV files written by this notebook; created if missing.
OUTPUT_DIR = DATA_DIR / "prepared_data"
OUTPUT_DIR.mkdir(exist_ok=True)

# model for embeddings

In [7]:
# Checkpoint whose tokenizer and weights are used to embed the comments
# for the cosine-similarity metric.
EMBEDDING_MODEL_NAME = "sberbank-ai/sbert_large_nlu_ru"

tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)

Downloading:   0%|          | 0.00/655 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

# validation dataset

In [8]:
# Load the dev split and keep only the toxic comment and its first neutral rewrite.
dev_df = pd.read_csv(DEV_PATH, sep="\t").drop(columns=["neutral_comment2", "neutral_comment3"])
dev_df.columns = ["toxic_comment", "neutral_comment1"]


def _dev_pair_similarity(row):
    """Similarity matrix for one (toxic, neutral) pair of the dev frame."""
    return get_similarities(model, tokenizer, row["toxic_comment"], row["neutral_comment1"])


def _dev_pair_rouge(row):
    """ROUGE-L f-score for one (toxic, neutral) pair of the dev frame."""
    return get_rougel(row["toxic_comment"], row["neutral_comment1"])


def _dev_token_count(text):
    """Number of word tokens in ``text``."""
    return len(get_word_tokens(text))


# get_similarities returns a matrix per row; the pair's score sits at [0][0].
dev_df["cosine_sim"] = dev_df.apply(_dev_pair_similarity, axis=1)
dev_df["cosine_sim"] = dev_df["cosine_sim"].apply(lambda sim_matrix: sim_matrix[0][0])

dev_df["rouge_l"] = dev_df.apply(_dev_pair_rouge, axis=1)
dev_df["input_len"] = dev_df["toxic_comment"].apply(_dev_token_count)
dev_df["output_len"] = dev_df["neutral_comment1"].apply(_dev_token_count)

# Persist the dev pairs together with their metrics.
dev_df.to_csv(OUTPUT_DIR / "dev_df_metrics.csv", index=False)

# train dataset

In [9]:
# Load the training split and keep only the toxic comment and its first neutral rewrite.
train_df = pd.read_csv(TRAIN_DIR, sep="\t").drop(
    columns=["index", "neutral_comment2", "neutral_comment3"]
)
train_df.columns = ["toxic_comment", "neutral_comment1"]


def _train_pair_similarity(row):
    """Similarity matrix for one (toxic, neutral) pair of the train frame."""
    return get_similarities(model, tokenizer, row["toxic_comment"], row["neutral_comment1"])


def _train_pair_rouge(row):
    """ROUGE-L f-score for one (toxic, neutral) pair of the train frame."""
    return get_rougel(row["toxic_comment"], row["neutral_comment1"])


def _train_token_count(text):
    """Number of word tokens in ``text``."""
    return len(get_word_tokens(text))


# get_similarities returns a matrix per row; the pair's score sits at [0][0].
train_df["cosine_sim"] = train_df.apply(_train_pair_similarity, axis=1)
train_df["cosine_sim"] = train_df["cosine_sim"].apply(lambda sim_matrix: sim_matrix[0][0])

train_df["rouge_l"] = train_df.apply(_train_pair_rouge, axis=1)

train_df["input_len"] = train_df["toxic_comment"].apply(_train_token_count)
train_df["output_len"] = train_df["neutral_comment1"].apply(_train_token_count)

# select data

In [10]:
# A pair is kept only when all three conditions hold (numeric bounds are exclusive):
#   0.6 < cosine_sim < 0.99, 0.1 < rouge_l < 0.8, and the neutral text has
#   no more word tokens than the toxic one.
similarity_ok = (train_df["cosine_sim"] > 0.6) & (train_df["cosine_sim"] < 0.99)
overlap_ok = (train_df["rouge_l"] > 0.1) & (train_df["rouge_l"] < 0.8)
not_longer = train_df["output_len"] <= train_df["input_len"]

train_df_part = train_df[similarity_ok & overlap_ok & not_longer]

# Only the text columns go into the filtered training file.
selected_pairs = train_df_part[["toxic_comment", "neutral_comment1"]]
selected_pairs.to_csv(OUTPUT_DIR / "train_part.csv", index=False)

In [11]:
# Save the full (unfiltered) training frame, including the metric columns,
# to OUTPUT_DIR without the index column.
train_df.to_csv(OUTPUT_DIR / "train_df_metrics.csv", index=False)

# results

In [13]:
# Reload the metrics table from the location it was saved to (OUTPUT_DIR).
# A bare relative file name resolves against the working directory, which is
# not where the file is written, so on a fresh run it is missing or stale.
train_df_metrics = pd.read_csv(OUTPUT_DIR / "train_df_metrics.csv", sep=",")

In [14]:
# Preview the first 8 rows of the reloaded metrics table.
train_df_metrics.head(8)

Unnamed: 0,toxic_comment,neutral_comment1,cosine_sim,rouge_l,input_len,output_len
0,"и,чё,блядь где этот херой был до этого со свои...","Ну и где этот герой был,со своими доказательст...",0.716566,0.444444,10,8
1,"О, а есть деанон этого петуха?","О, а есть деанон",0.640666,0.8,6,4
2,"херну всякую пишут,из-за этого лайка.долбоебизм.","Чушь всякую пишут, из- за этого лайка.",0.896456,0.461538,5,7
3,из за таких пидоров мы и страдаем,из за таких плохих людей мы и страдаем,0.827146,0.8,7,8
4,гондон путинский он а не артист,"Человек Путина он, а не артист",0.868615,0.5,6,6
5,как урод такую херню пишет???,кто такую ерунду пишет,0.623283,0.222222,5,4
6,в гсвг за такие сапоги пиздюлей получил бы от ...,В ГСВГ за такие сапоги наказали бы сослуживцы,0.89283,0.444444,10,8
7,Скудоумие это свойство личности проявлять умст...,Слабоумие это свойство личности проявлять умст...,0.927118,0.909091,11,11


In [15]:
# Mean cosine similarity over all rows of the metrics table.
cosine = train_df_metrics["cosine_sim"].mean()

In [16]:
# Mean ROUGE-L f-score over all rows of the metrics table.
rouge = train_df_metrics["rouge_l"].mean()

In [17]:
# Report the mean cosine similarity at full precision.
print(f'Cosine similarity (CS): {cosine}')

Cosine similarity (CS): 0.7980690782843975


In [18]:
# Report the mean ROUGE-L f-score at full precision.
print(f'Rouge-l (R): {rouge}')

Rouge-l (R): 0.5354918039272097


In [19]:
# Start a fresh Markdown results table: a header row followed by the separator
# row. Opening with 'w' discards any previous content of the file.
table_header_rows = (
    '| Cosine similarity | ROUGE-L | \n',
    '| ----- | --- | \n',
)
with open('results_R_CS_train.md', 'w') as table_file:
    table_file.writelines(table_header_rows)

In [20]:
# Append one row with both mean metrics, each rounded to four decimals.
with open('results_R_CS_train.md', 'a') as res_file:
    res_file.write(f"{cosine:.4f}|{rouge:.4f}|\n")