In [1]:
# Install third-party dependencies into the kernel's environment (versions are not pinned).
%pip install pandas numpy seaborn matplotlib spacy nltk datasketch sentence-transformers scikit-learn unisim gensim simhash Levenshtein tensorflow-hub transformers torch plotly nbformat datasets ipywidgets pymorphy3

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
# Fetch the NLTK resources used below: the Punkt tokenizer data plus the 'words' and 'wordnet' packages.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/zuzuka28/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zuzuka28/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
from datasets import load_dataset

# Load the "en" configuration of the google/wiki40b dataset.
ds = load_dataset("google/wiki40b", "en")

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [5]:
# Show one raw training article; note the structural markers (_START_ARTICLE_, _NEWLINE_, ...) in the text.
ds["train"][1]["text"]

'\n_START_ARTICLE_\nÉmile Dubonnet\n_START_PARAGRAPH_\nÉmile Dubonnet (18 October 1883 - 1940) was a French balloonist active from 1908 to 1913. He participated in the 1908, 1909, and 1911 Gordon Bennett Cup in ballooning and was a member of the Aéro-Club de France. He won the La Grande Medialle de Aéro-Club de France in 1912. He holds a Fédération Aéronautique Internationale record from 1912-1913.\n_START_SECTION_\nBiography\n_START_PARAGRAPH_\nHe was born on 18 October 1883 in Paris to a winemaker. In 1910 he flew over Paris in his Tellier brothers aircraft. He started from the Juvisy-sur-Orge field and made a landing at Bois de Boulogne _NEWLINE_He also helped form the first professional baseball league in France, the French Baseball Union, in 1912.'

In [6]:
import nltk
from nltk.tokenize import sent_tokenize
from typing import List
from datasets import DatasetDict

# Punkt data for sent_tokenize; an earlier cell requests the same package, so this only matters
# when this cell is run on its own.
nltk.download('punkt')

def clean_example(sentence: str) -> str:
    """Replace wiki40b structural markers and newlines with spaces.

    Each marker occurrence becomes a single space (runs of spaces are not
    collapsed), and the result is stripped of leading/trailing whitespace.
    """
    markers = ("_START_ARTICLE_", "_START_SECTION_", "_START_PARAGRAPH_", "_NEWLINE_", "\n")
    cleaned = sentence
    for marker in markers:
        # Splitting on the marker and re-joining with a space is a plain substitution.
        cleaned = " ".join(cleaned.split(marker))
    return cleaned.strip()

def split_into_sentences(text: str) -> List[str]:
    """Return the sentences of ``text`` as produced by NLTK's English sentence tokenizer."""
    sentences = sent_tokenize(text, language='english')
    return sentences

def truncate_at_word_boundary(text: str, max_length: int) -> str:
    """Shorten ``text`` to at most ``max_length`` characters without splitting a word.

    Args:
        text: The text to shorten.
        max_length: Maximum number of characters to keep.

    Returns:
        ``text`` unchanged when it already fits. Otherwise the longest prefix
        that ends on a word boundary (a space), stripped of surrounding
        whitespace. If the first ``max_length`` characters contain no space,
        the hard cut is returned as-is.
    """
    if len(text) <= max_length:
        return text

    truncated = text[:max_length]

    # If the character right after the cut is a space, the cut already sits on a
    # word boundary; keep the whole prefix instead of dropping its last word.
    # Slicing (rather than indexing) keeps this safe for any max_length.
    if text[max_length:max_length + 1] == ' ':
        return truncated.strip()

    last_space = truncated.rfind(' ')
    if last_space == -1:
        # No boundary available: fall back to the hard cut.
        return truncated
    return truncated[:last_space].strip()

def collect_texts_by_size(dataset: DatasetDict, target_size: int, num_texts: int) -> List[str]:
    """Collect up to ``num_texts`` cleaned texts truncated to about ``target_size`` characters.

    Iterates ``dataset`` in order. Each example's ``'text'`` is split into
    sentences, re-joined with single spaces and cleaned; examples whose cleaned
    text is shorter than ``target_size`` are skipped. The remaining texts are cut
    at a word boundary, and iteration stops once enough texts were gathered.
    """
    collected: List[str] = []

    for record in dataset:
        joined = " ".join(split_into_sentences(record['text']))
        cleaned = clean_example(joined)

        if len(cleaned) >= target_size:
            snippet = truncate_at_word_boundary(cleaned, target_size)
            if snippet:
                collected.append(snippet)

            # The quota is only checked after a long-enough example was seen.
            if len(collected) >= num_texts:
                break

    return collected

num_texts = 100
target_sizes = [100, 250, 500, 750, 1000, 2000]

# Map every target length to its list of truncated training articles.
result = {
    size: collect_texts_by_size(ds["train"], size, num_texts)
    for size in target_sizes
}

# Flatten into one record per text so it can be written as a two-column table.
data = [
    {'size': size, 'text': text}
    for size, texts in result.items()
    for text in texts
]

df = pd.DataFrame(data)
df.to_csv('source_texts.csv', index=False)

[nltk_data] Downloading package punkt to /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from text_augmentor import SentenceAugmentor, WordAugmentor, CharacterAugmentor

# One augmentor per granularity (sentence / word / character), each created with language="english".
sentence_augmentor = SentenceAugmentor(language="english")
word_augmentor = WordAugmentor(language="english")
character_augmentor = CharacterAugmentor(language="english")

# Registry name -> augmentation callable. The part of each key before the first underscore
# ("sent", "word", "char") is what the plotting cells later use to group results.
sentence_methods = {
    "sent_random_deletion": sentence_augmentor.random_deletion,
    # "sent_random_insertion": sentence_augmentor.random_insertion,
    "sent_repeat_sentence": sentence_augmentor.repeat_sentence,
    # "sent_random_substitution": sentence_augmentor.random_substitution,
    "sent_neighboring_swap": sentence_augmentor.neighboring_swap,
}

word_methods = {
    "word_random_deletion": word_augmentor.random_deletion,
    "word_random_insertion": word_augmentor.random_insertion,
    "word_random_substitution": word_augmentor.random_substitution,
    "word_repeat_word": word_augmentor.repeat_word,
    "word_neighboring_swap": word_augmentor.neighboring_swap,
}

character_methods = {
    "char_random_deletion": character_augmentor.random_deletion,
    "char_qwerty_typo_substitution": character_augmentor.qwerty_typo_substitution,
    "char_character_repetition": character_augmentor.character_repetition,
    "char_random_character_from_language_alphabet_insertion": character_augmentor.random_character_from_language_alphabet_insertion,
    "char_neighboring_swap": character_augmentor.neighboring_swap,
}

# Single lookup table covering every enabled augmentation.
all_methods = {**sentence_methods, **word_methods, **character_methods}

[nltk_data] Downloading package punkt to /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/zuzuka28/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
from tqdm.notebook import tqdm

df = pd.read_csv('source_texts.csv')
# Reference (unmodified) rows carry an empty method and a rate of 0. The empty string is
# written as an empty CSV field, which the comparison cell reads back as NaN.
df['method'] = ""
df['rate'] = 0

# Noise rates 0.05, 0.10, ..., 0.95.
rates = [i / 100 for i in range(5, 100, 5)]

# Collect every frame and concatenate once at the end; concatenating inside the loop
# re-copies all accumulated rows on each iteration.
frames = [df]

for method_name, f in tqdm(all_methods.items()):
    for rate in tqdm(rates, desc=method_name, leave=False):
        rate_df = df.copy()
        rate_df["method"] = method_name
        rate_df["rate"] = rate
        rate_df["text"] = df["text"].apply(lambda text: f(text, rate))
        frames.append(rate_df)

result = pd.concat(frames)

result.to_csv('augmented_texts.csv', index=False)
result

In [1]:
from sentence_transformers import SentenceTransformer
import gensim.downloader as gensim_api
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
import tensorflow_hub as hub

# Pretrained models referenced as module-level globals by the similarity functions below.
st_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
# Static word-embedding models fetched through gensim's downloader.
word2vec_model = gensim_api.load("word2vec-google-news-300")
fasttext_model = gensim_api.load("fasttext-wiki-news-subwords-300")
glove_model = gensim_api.load("glove-wiki-gigaword-50")
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
# NOTE(review): with this line disabled, `use_model` is never defined, so use_similarity()
# cannot run as written.
# use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from datasketch import MinHash
from simhash import Simhash
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unisim import TextSim
from Levenshtein import ratio
import torch
from functools import wraps
import time
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

# Pick the torch device for the transformer models: Apple MPS first, then CUDA, else CPU.
# torch.backends.mps.is_available() is used because it exists in every torch release
# that ships MPS support.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# NLTK resources for tokenisation and the stop-word list; quiet=True suppresses the download log.
nltk.download("punkt", quiet=True)
nltk.download("stopwords", quiet=True)
stemmer = PorterStemmer()
# NOTE(review): stop_words is built here but not referenced by any visible code —
# confirm whether stop-word removal was intended in preprocess_text.
stop_words = set(stopwords.words("english"))

def preprocess_word(word):
    """Return ``word`` with every character other than ASCII letters removed."""
    return re.sub(r'[^a-zA-Z]', '', word)

def safe_stem(word):
    """Stem ``word`` with the module-level stemmer, returning it unchanged on RecursionError."""
    try:
        stemmed = stemmer.stem(word)
    except RecursionError:
        # Keep the raw word rather than failing the whole comparison.
        return word
    return stemmed

def preprocess_text(text):
    """Turn ``text`` into a list of stemmed, letters-only, lower-case tokens.

    Punctuation and digit runs are removed first; the remainder is lower-cased
    and tokenised with ``nltk.word_tokenize``. Tokens that are empty after
    dropping non-ASCII-letter characters are discarded.
    """
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    without_digits = re.sub(r"\d+", "", without_punctuation)

    raw_tokens = nltk.word_tokenize(without_digits.lower())

    # Lazy generator: each token is cleaned and then stemmed before the next one is touched.
    letters_only = (preprocess_word(token) for token in raw_tokens)
    return [safe_stem(token) for token in letters_only if token]


def create_shingles(tokens, shingle_size=3):
    """Return all runs of ``shingle_size`` consecutive tokens, each joined by a space.

    Yields an empty list when there are fewer tokens than ``shingle_size``.
    """
    shingles = []
    last_start = len(tokens) - shingle_size
    for start in range(last_start + 1):
        window = tokens[start : start + shingle_size]
        shingles.append(" ".join(window))
    return shingles


def minhash_similarity(text1, text2, num_perm=256, shingle_size=3):
    """Estimate Jaccard similarity of two texts from MinHash signatures of word shingles.

    Returns 1.0 when neither text yields a shingle and 0.0 when exactly one
    of them does not.
    """
    first, second = (
        create_shingles(preprocess_text(text), shingle_size)
        for text in (text1, text2)
    )

    if not (first and second):
        # At least one side is empty: equal only if both are.
        return 1.0 if not (first or second) else 0.0

    signatures = []
    for shingles in (first, second):
        signature = MinHash(num_perm=num_perm)
        for shingle in shingles:
            signature.update(shingle.encode("utf-8"))
        signatures.append(signature)

    return signatures[0].jaccard(signatures[1])


def simhash_similarity(text1, text2):
    """Compute similarity using SimHash.

    Both texts are preprocessed into tokens and fingerprinted; the result is
    one minus the fingerprint distance divided by 64.
    """
    fingerprint_a, fingerprint_b = (
        Simhash(preprocess_text(text)) for text in (text1, text2)
    )
    hamming = fingerprint_a.distance(fingerprint_b)
    return 1 - hamming / 64


def sentence_transformers_similarity(text1, text2):
    """Compute similarity using Sentence Transformers.

    Encodes both texts with the module-level ``st_model`` and returns the
    cosine similarity of the two embeddings.
    """
    first_vec, second_vec = st_model.encode([text1, text2])
    return cosine_similarity([first_vec], [second_vec])[0][0]


def count_vectorizer_similarity(text1, text2):
    """Compute similarity using CountVectorizer.

    Both texts are preprocessed, vectorised as term counts over their joint
    vocabulary, and compared by cosine similarity.
    """
    docs = [" ".join(preprocess_text(doc)) for doc in (text1, text2)]
    bag_of_words = CountVectorizer()
    bag_of_words.fit(docs)
    matrix = bag_of_words.transform(docs)
    return cosine_similarity(matrix[0], matrix[1])[0][0]


def tfidf_vectorizer_similarity(text1, text2):
    """Compute similarity using TfidfVectorizer.

    Both texts are preprocessed, vectorised with TF-IDF weights fitted on the
    two documents themselves, and compared by cosine similarity.
    """
    docs = [" ".join(preprocess_text(doc)) for doc in (text1, text2)]
    tfidf = TfidfVectorizer()
    tfidf.fit(docs)
    matrix = tfidf.transform(docs)
    return cosine_similarity(matrix[0], matrix[1])[0][0]


def word2vec_similarity(text1, text2):
    """Compute similarity using Word2Vec.

    Each text is represented by the mean vector of its whitespace-separated
    words found in ``word2vec_model``. Returns 0 when either text has no word
    in the model's vocabulary.
    """
    def embed(text):
        return [word2vec_model[token] for token in text.split() if token in word2vec_model]

    vectors_a = embed(text1)
    vectors_b = embed(text2)
    if not vectors_a or not vectors_b:
        return 0

    centroid_a = np.mean(vectors_a, axis=0)
    centroid_b = np.mean(vectors_b, axis=0)
    return cosine_similarity([centroid_a], [centroid_b])[0][0]

def fasttext_similarity(text1, text2):
    """Compute similarity using FastText.

    Each text is represented by the mean vector of its whitespace-separated
    words found in ``fasttext_model``. Returns 0 when either text has no word
    in the model's vocabulary.
    """
    def embed(text):
        return [fasttext_model[token] for token in text.split() if token in fasttext_model]

    vectors_a = embed(text1)
    vectors_b = embed(text2)
    if not vectors_a or not vectors_b:
        return 0

    centroid_a = np.mean(vectors_a, axis=0)
    centroid_b = np.mean(vectors_b, axis=0)
    return cosine_similarity([centroid_a], [centroid_b])[0][0]


def glove_similarity(text1, text2):
    """Compute similarity using GloVe.

    Each text is represented by the mean vector of its whitespace-separated
    words found in ``glove_model``. Returns 0 when either text has no word in
    the model's vocabulary.
    """
    def embed(text):
        return [glove_model[token] for token in text.split() if token in glove_model]

    vectors_a = embed(text1)
    vectors_b = embed(text2)
    if not vectors_a or not vectors_b:
        return 0

    centroid_a = np.mean(vectors_a, axis=0)
    centroid_b = np.mean(vectors_b, axis=0)
    return cosine_similarity([centroid_a], [centroid_b])[0][0]



def bert_similarity(text1, text2):
    """Compute similarity using BERT.

    Each text is tokenised (with truncation) and run through ``bert_model`` on
    ``device``; the last hidden states are averaged over the token dimension
    and the two pooled vectors are compared by cosine similarity.
    """
    encoder = bert_model.to(device)

    def tokenize(text):
        return bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    batch_a = tokenize(text1)
    batch_b = tokenize(text2)

    # Inference only: no gradients needed.
    with torch.no_grad():
        pooled_a = encoder(**batch_a).last_hidden_state.mean(dim=1)
        pooled_b = encoder(**batch_b).last_hidden_state.mean(dim=1)

    return cosine_similarity(pooled_a.cpu().numpy(), pooled_b.cpu().numpy())[0][0]


def use_similarity(text1, text2):
    """Compute similarity using the Universal Sentence Encoder (USE).

    The encoder is loaded on first use and cached in the module-level
    ``use_model``, because the eager load in the model-loading cell is
    commented out and the name would otherwise be undefined here.

    Returns:
        Cosine similarity of the two sentence embeddings.
    """
    global use_model
    if "use_model" not in globals():
        use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    embeddings = use_model([text1, text2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]


def jaccard_similarity(text1, text2):
    """Compute Jaccard similarity over the sets of whitespace-separated words.

    Returns:
        ``|A ∩ B| / |A ∪ B|``. When neither text contains a word the union is
        empty; 1.0 is returned in that case (two empty texts are identical),
        matching the convention used by ``minhash_similarity``.
    """
    set1 = set(text1.split())
    set2 = set(text2.split())

    union = set1 | set2
    if not union:
        # Avoid dividing by zero for two empty / whitespace-only texts.
        return 1.0

    return len(set1 & set2) / len(union)


def levenshtein_similarity(text1, text2):
    """Compute similarity using the Levenshtein package's ``ratio`` on the raw texts."""
    similarity = ratio(text1, text2)
    return similarity


def roberta_similarity(text1, text2):
    """Compute similarity using RoBERTa.

    Each text is tokenised (with truncation) and run through ``roberta_model``
    on ``device``; the last hidden states are averaged over the token dimension
    and the two pooled vectors are compared by cosine similarity.
    """
    encoder = roberta_model.to(device)

    def tokenize(text):
        return roberta_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    batch_a = tokenize(text1)
    batch_b = tokenize(text2)

    # Inference only: no gradients needed.
    with torch.no_grad():
        pooled_a = encoder(**batch_a).last_hidden_state.mean(dim=1)
        pooled_b = encoder(**batch_b).last_hidden_state.mean(dim=1)

    return cosine_similarity(pooled_a.cpu().numpy(), pooled_b.cpu().numpy())[0][0]


def unisim_similarity(text1, text2):
    """Compute similarity using UniSim.

    The ``TextSim`` instance is created on the first call and cached on the
    function object. Building it on every call would add the construction cost
    to every timing recorded by ``measure_time``.
    """
    text_sim = getattr(unisim_similarity, "_text_sim", None)
    if text_sim is None:
        text_sim = TextSim(store_data=False)
        unisim_similarity._text_sim = text_sim
    return text_sim.similarity(text1, text2)


def compare_texts(text1, text2, methods):
    """Compare two texts with every method in ``methods`` and tabulate the outcome.

    Args:
        text1: First text.
        text2: Second text.
        methods: Mapping of display name -> callable ``(text1, text2)`` that
            returns a ``(similarity, elapsed)`` pair, e.g. a function wrapped
            with ``measure_time``.

    Returns:
        DataFrame indexed by method name with columns ``similarity`` and
        ``time_ms``. A method that raises gets ``"Error: <message>"`` in both
        columns instead of aborting the comparison.
    """
    columns = ["similarity", "time_ms"]
    # NOTE(review): measure_time reports seconds, while the column is labelled
    # "time_ms" — confirm the intended unit.

    results = {}
    for name, func in methods.items():
        try:
            results[name] = func(text1, text2)
        except Exception as e:
            # Store the message once per column so each entry has the same length
            # as a successful (similarity, time) pair. A bare string makes the
            # DataFrame constructor fail when every method errors out.
            message = f"Error: {str(e)}"
            results[name] = (message, message)

    if not results:
        # Nothing to transpose; return an empty table with the expected columns.
        return pd.DataFrame(columns=columns)

    results_df = pd.DataFrame(results).T
    results_df.columns = columns

    return results_df


def measure_time(func):
    """Decorate ``func`` so that it returns ``(result, elapsed_seconds)``.

    Elapsed time is taken with ``time.perf_counter()``, a monotonic
    high-resolution clock. ``time.time()`` follows the system clock, which can
    be adjusted mid-run and is too coarse for short calls on some platforms.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        return result, execution_time

    return wrapper


# Display name -> timed similarity function. Every entry returns (similarity, elapsed_time)
# because it is wrapped with measure_time. USE is disabled (its model load is commented out).
similarity_functions = {
    "MinHash": measure_time(minhash_similarity),
    "SimHash": measure_time(simhash_similarity),
    "Sentence Transformers": measure_time(sentence_transformers_similarity),
    "Count Vectorizer": measure_time(count_vectorizer_similarity),
    "Tfidf Vectorizer": measure_time(tfidf_vectorizer_similarity),
    "Word2Vec": measure_time(word2vec_similarity),
    "GLoVe": measure_time(glove_similarity),
    "FastText": measure_time(fasttext_similarity),
    "BERT": measure_time(bert_similarity),
    # "USE": measure_time(use_similarity),
    "Jaccard": measure_time(jaccard_similarity),
    "Levenshtein": measure_time(levenshtein_similarity),
    "RoBERTa": measure_time(roberta_similarity),
    "UniSim": measure_time(unisim_similarity),
}

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm.notebook import tqdm

# NOTE(review): this reads from ./data/pub2-noise/input/, while the export cell writes
# 'source_texts.csv' to the working directory — presumably the file is moved by hand; confirm.
df = pd.read_csv('./data/pub2-noise/input/source_texts.csv')

# Number of disjoint text pairs compared per size bucket.
n = 10

comparison_methods = similarity_functions

results = []

# Baseline: score pairs of different source texts of the same size with every method.
for size, group in tqdm(df.groupby('size')):
    texts = group['text'].tolist()
    if len(texts) < 2:
        print("Not enough texts to compare.")
        continue

    # n pairs need 2 * n distinct texts.
    if len(texts) < 2 * n:
        print(f"Not enough texts to compare {n} pairs. Available texts: {len(texts)}")
        continue

    similarity_results = {method_name: [] for method_name in comparison_methods.keys()}

    for i in range(n):
        # Consecutive, non-overlapping pairs: (0, 1), (2, 3), ...
        text1, text2 = texts[2 * i], texts[2 * i + 1]

        for method_name, method_func in comparison_methods.items():
            # Each method returns (similarity, elapsed); the timing is ignored here.
            similarity, _ = method_func(text1, text2)
            similarity_results[method_name].append(similarity)

    for method_name, similarities in similarity_results.items():
        avg_similarity = np.mean(similarities)
        results.append({
            "Size": size,
            "Method": method_name,
            "Average Similarity": avg_similarity
        })

results_df = pd.DataFrame(results)

# One line per method: average similarity of unrelated pairs as a function of text size.
fig = px.line(
    results_df,
    x="Size",
    y="Average Similarity",
    color="Method",
    title="Average Similarity by Text Size and Method",
    labels={"Size": "Text Size", "Average Similarity": "Average Similarity Score"},
    markers=True
)

fig.update_layout(
    xaxis_title="Text Size",
    yaxis_title="Average Similarity Score",
    legend_title="Method",
    template="plotly_white"
)

fig.show()

In [16]:
import pandas as pd
from tqdm.notebook import tqdm

data = pd.read_csv('augmented_texts.csv')

# Reference rows were written with an empty method, which read_csv returns as NaN.
reference_texts = data[data['method'].isna()].reset_index(drop=True)['text']
modified_texts = data[data['method'].notna()]

# Similarity methods to run in this pass; the commented entries are disabled.
methods = {
    "MinHash": measure_time(minhash_similarity),
    # "SimHash": measure_time(simhash_similarity),
    # "Sentence Transformers": measure_time(sentence_transformers_similarity),
    # "Count Vectorizer": measure_time(count_vectorizer_similarity),
    # "Tfidf Vectorizer": measure_time(tfidf_vectorizer_similarity),
    # "Word2Vec": measure_time(word2vec_similarity),
    # "GLoVe": measure_time(glove_similarity),
    # "FastText": measure_time(fasttext_similarity),
    # "BERT": measure_time(bert_similarity),
    # "USE": measure_time(use_similarity),
    # "Jaccard": measure_time(jaccard_similarity),
    # "Levenshtein": measure_time(levenshtein_similarity),
    # "RoBERTa": measure_time(roberta_similarity),
    # "UniSim": measure_time(unisim_similarity),
}

results = []

grouped = modified_texts.groupby(['method', 'rate'])

for method_name, similarity_function in tqdm(methods.items(), desc="Similarity Methods"):
    current_results = []
    
    for (method, rate), group in tqdm(grouped, desc=f"Processing {method_name}"):
        # After reset_index, position idx within the group is used to look up the reference
        # text. This assumes every (method, rate) group lists the texts in the same order
        # as the reference rows.
        for idx, row in group.reset_index(drop=True).iterrows():
            modified_text = row['text']
            reference_text = reference_texts[idx]
            
            similarity, comparison_time = similarity_function(reference_text, modified_text)
            
            current_results.append({
                'size': row['size'],
                'comparison_method': method_name,
                'augmentation_method': method,
                'rate': rate,
                'time': comparison_time,
                'similarity': similarity
            })
    
    current_results_df = pd.DataFrame(current_results)

    # One CSV per similarity method; a later cell merges them by glob pattern.
    output_file = f"comparison_results_{method_name}.csv"
    current_results_df.to_csv(output_file, index=False)

    print(f"Results for {method_name} saved to {output_file}.")

Similarity Methods:   0%|          | 0/1 [00:00<?, ?it/s]

Processing MinHash:   0%|          | 0/247 [00:00<?, ?it/s]

Results for MinHash saved to comparison_results_MinHash.csv.


In [None]:
import pandas as pd
import glob


# Merge the per-method result files into one table.
file_pattern = 'comparison_results_*.csv'
# glob returns files in arbitrary order; sort so the merged CSV is reproducible.
files = sorted(glob.glob(file_pattern))

# Read everything first and concatenate once instead of growing a frame inside the loop.
frames = [pd.read_csv(file) for file in files]
# pd.concat raises on an empty list, so fall back to an empty frame when nothing matched.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

combined_df.to_csv('comparison_results.csv', index=False)

In [None]:
import plotly.express as px
import pandas as pd

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')

# The noise granularity ("sent", "word", "char") is the part of the augmentation name
# before the first underscore.
df['prefix'] = [name.split('_')[0] for name in df['augmentation_method']]

# Similarity distribution per noise granularity, pooled over all comparison methods.
fig = px.box(
    df,
    x='prefix',
    y='similarity',
    color='prefix',
    title='Similarity distribution',
    labels={'prefix': 'noise type', 'similarity': 'similarity'},
)

fig.show()

In [None]:
import plotly.express as px
import pandas as pd

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')
# The noise granularity ("sent", "word", "char") is the part of the augmentation name
# before the first underscore.
df['prefix'] = df['augmentation_method'].apply(lambda x: x.split('_')[0])

comparison_methods = df['comparison_method'].unique()

# One box plot per comparison method, grouped by noise granularity.
for method in comparison_methods:
    method_data = df[df['comparison_method'] == method]

    fig = px.box(
        method_data,
        x='prefix',
        y='similarity',
        color='prefix',
        title=f'Similarity distribution: {method}',
        # Axis label spelling fixed ("noize" -> "noise") to match the pooled box plot.
        labels={'prefix': 'noise type', 'similarity': 'similarity'}
    )
    fig.show()

In [None]:
import pandas as pd
import plotly.express as px

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')

# Synthetic baseline: at rate 0 the text is unmodified, so similarity is set to 1 for every
# (comparison method, size, augmentation method) combination.
new_rows = [
    {
        'comparison_method': comparison_method,
        'size': size,
        'augmentation_method': augmentation_method,
        'rate': 0,
        'similarity': 1
    }
    for comparison_method in df['comparison_method'].unique()
    for size in df['size'].unique()
    for augmentation_method in df['augmentation_method'].unique()
]

new_df = pd.DataFrame(new_rows)

df = pd.concat([df, new_df], ignore_index=True)

# Mean similarity for each curve point.
group_keys = ['comparison_method', 'size', 'augmentation_method', 'rate']
agg_data = df.groupby(group_keys, as_index=False)['similarity'].mean()

augmentation_methods = agg_data['augmentation_method'].unique()
sizes = agg_data['size'].unique()

# One figure per (text size, augmentation method); one line per comparison method.
for size in sizes:
    size_mask = agg_data['size'] == size
    for aug_method in augmentation_methods:
        method_mask = agg_data['augmentation_method'] == aug_method
        filtered_data = agg_data[method_mask & size_mask]

        fig = px.line(
            filtered_data,
            x='rate',
            y='similarity',
            color='comparison_method',
            title=f'Similarity vs Noise Level (rate) for Augmentation Method: {aug_method}, Text Size: {size}',
            labels={
                'rate': 'Noise Level (rate)',
                'similarity': 'Similarity',
                'comparison_method': 'Comparison Method',
            },
        )
        fig.show()

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')

# Noise granularity = part of the augmentation name before the first underscore.
df['prefix'] = df['augmentation_method'].apply(lambda x: x.split('_')[0])

prefixes = df['prefix'].unique()
comparison_methods = df['comparison_method'].unique()

# One heatmap (text size x noise rate) per noise granularity and comparison method.
for prefix in prefixes:
    for comp_method in comparison_methods:
        filtered_data = df[(df['prefix'] == prefix) & 
                           (df['comparison_method'] == comp_method)]
        
        # Average over all augmentation methods that share the prefix.
        grouped_data = filtered_data.groupby(['size', 'rate'], as_index=False)['similarity'].mean()
        
        heatmap_data = grouped_data.pivot_table(index='size', columns='rate', values='similarity')
        
        # Rows are drawn at integer positions and labelled with the sizes, so the sizes
        # are evenly spaced on the axis instead of numerically scaled.
        y_values = heatmap_data.index
        y_tickvals = np.arange(len(y_values))
        
        fig = go.Figure(data=go.Heatmap(
            z=heatmap_data.values,
            x=heatmap_data.columns,
            y=y_tickvals, 
            colorscale='Viridis',
            colorbar=dict(title='Similarity'),
            # Fixed colour range so heatmaps are comparable across figures.
            zmin=0,
            zmax=1,
            hoverongaps=False,
            text=heatmap_data.values.round(2), 
            texttemplate="%{text}",
            xgap=1, 
            ygap=1, 
        ))
        
        fig.update_layout(
            title=f'Heatmap: Similarity by Noise Level (rate) and Text Size (size)<br>Prefix: {prefix}, Comparison Method: {comp_method}',
            xaxis_title='Noise Level (rate)',
            yaxis_title='Text Size (size)',
            xaxis=dict(tickangle=45, tickmode='array', tickvals=heatmap_data.columns, ticktext=heatmap_data.columns),
            yaxis=dict(
                tickmode='array',
                tickvals=y_tickvals, 
                ticktext=y_values, 
            ),
            autosize=False,
            width=800,
            height=600,
        )
        
        fig.update_xaxes(tickangle=45)
        
        fig.show()

In [33]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')

# Synthetic baseline: rate 0 with similarity 1 for every
# (comparison method, size, augmentation method) combination.
new_rows = []
for comparison_method in df['comparison_method'].unique():
    for size in df['size'].unique():
        for augmentation_method in df['augmentation_method'].unique():
            new_rows.append({
                'comparison_method': comparison_method,
                'size': size,
                'augmentation_method': augmentation_method,
                'rate': 0,
                'similarity': 1
            })

new_df = pd.DataFrame(new_rows)

df = pd.concat([df, new_df], ignore_index=True)

# Noise granularity = part of the augmentation name before the first underscore.
df['prefix'] = df['augmentation_method'].apply(lambda x: x.split('_')[0])

# Keep character-level noise only.
df = df[df["prefix"] == "char"]

prefixes = df['prefix'].unique()
comparison_methods = df['comparison_method'].unique()

# Mean similarity over all character-level augmentations.
combined_data = df.groupby(['size', 'rate', 'comparison_method'], as_index=False)['similarity'].mean()

# Rows are grouped by comparison method first, then by text size.
heatmap_data = combined_data.pivot_table(index=['comparison_method', 'size'], 
                                         columns='rate', 
                                         values='similarity')

# Rows are drawn at integer positions and labelled with "method, size".
y_values = heatmap_data.index
y_tickvals = np.arange(len(y_values))

fig = go.Figure(data=go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=y_tickvals, 
    colorscale='Viridis',
    colorbar=dict(title='Similarity'),
    zmin=0,
    zmax=1,
    hoverongaps=False,
    text=heatmap_data.values.round(2), 
    texttemplate="%{text}",
    xgap=1, 
    ygap=1, 
))

fig.update_layout(
    title='Similarity by Noise Level, Text Size, and Comparison Method',
    xaxis_title='Noise Level',
    yaxis_title='Comparison Method and Text Size',
    xaxis=dict(tickangle=45, tickmode='array', tickvals=heatmap_data.columns, ticktext=heatmap_data.columns),
    yaxis=dict(
        tickmode='array',
        tickvals=y_tickvals, 
        ticktext=[f"{idx[0]}, {idx[1]}" for idx in y_values], 
    ),
    autosize=False,
    width=1200,
    # Fixed, very tall canvas so every (method, size) row stays readable.
    height=6400,
)

fig.update_xaxes(tickangle=45)

fig.show()

In [34]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')

# Synthetic baseline: rate 0 with similarity 1 for every
# (comparison method, size, augmentation method) combination.
new_rows = []
for comparison_method in df['comparison_method'].unique():
    for size in df['size'].unique():
        for augmentation_method in df['augmentation_method'].unique():
            new_rows.append({
                'comparison_method': comparison_method,
                'size': size,
                'augmentation_method': augmentation_method,
                'rate': 0,
                'similarity': 1
            })

new_df = pd.DataFrame(new_rows)

df = pd.concat([df, new_df], ignore_index=True)

# Noise granularity = part of the augmentation name before the first underscore.
df['prefix'] = df['augmentation_method'].apply(lambda x: x.split('_')[0])

# Keep character-level noise only.
df = df[df["prefix"] == "char"]

prefixes = df['prefix'].unique()

comparison_methods = df['comparison_method'].unique()

# Mean similarity over all character-level augmentations. (The placeholder
# `combined_data = pd.DataFrame()` that used to precede this line was a dead store.)
combined_data = df.groupby(['size', 'rate', 'comparison_method'], as_index=False)['similarity'].mean()

combined_data = combined_data.sort_values(by=['size', 'comparison_method'])

# Rows are grouped by text size first, then by comparison method.
heatmap_data = combined_data.pivot_table(index=['size', 'comparison_method'],
                                         columns='rate',
                                         values='similarity')

# Rows are drawn at integer positions and labelled with "size, method".
y_values = heatmap_data.index
y_tickvals = np.arange(len(y_values))

fig = go.Figure(data=go.Heatmap(
    z=heatmap_data.values,
    x=heatmap_data.columns,
    y=y_tickvals,
    colorscale='Viridis',
    colorbar=dict(title='Similarity'),
    zmin=0,
    zmax=1,
    hoverongaps=False,
    text=heatmap_data.values.round(2),
    texttemplate="%{text}",
    xgap=1,
    ygap=1,
))

fig.update_layout(
    title='Similarity by Noise Level, Text Size, and Comparison Method',
    xaxis_title='Noise Level (rate)',
    yaxis_title='Text Size and Comparison Method',
    xaxis=dict(tickangle=45, tickmode='array', tickvals=heatmap_data.columns, ticktext=heatmap_data.columns),
    yaxis=dict(
        tickmode='array',
        tickvals=y_tickvals,
        ticktext=[f"{idx[0]}, {idx[1]}" for idx in y_values],
    ),
    autosize=False,
    width=1200,
    # Fixed, very tall canvas so every (size, method) row stays readable.
    height=6400,
)

fig.update_xaxes(tickangle=45)

fig.show()

Нарезка

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from math import ceil

df = pd.read_csv('./data/pub2-noise/output/comparison_results.csv')

# Add the zero-noise baseline: rate 0 with similarity 1 for every
# (comparison method, size, augmentation method) combination.
new_rows = [
    {
        'comparison_method': comparison_method,
        'size': size,
        'augmentation_method': augmentation_method,
        'rate': 0,
        'similarity': 1
    }
    for comparison_method in df['comparison_method'].unique()
    for size in df['size'].unique()
    for augmentation_method in df['augmentation_method'].unique()
]

new_df = pd.DataFrame(new_rows)
df = pd.concat([df, new_df], ignore_index=True)
# Noise granularity = part of the augmentation name before the first underscore.
df['prefix'] = [name.split('_')[0] for name in df['augmentation_method']]
# Keep character-level noise only.
df = df[df["prefix"] == "char"]

# Fixed display order of the comparison methods (as required).
# Methods that are absent from the data are filtered out further down.
CUSTOM_ORDER = [
    "Count Vectorizer",
    "Processed Count Vectorizer",
    "Tfidf Vectorizer",
    "Processed Tfidf Vectorizer",
    # "Jaccard",
    
    "MinHash",
    "3-shingle MinHash",
    "SimHash",
    "Processed SimHash",

    "Levenshtein",
    "Word2Vec",
    "GLoVe",
    "FastText",

    "USE",
    "Sentence Transformers",
    "BERT",
    "UniSim",
]

# Parameter: how many comparison methods to show on one heatmap
COMPARISON_METHODS_PER_PLOT = 4

# Keep only the methods present in the data, preserving the fixed order
existing_methods = set(df['comparison_method'].unique())
comparison_methods = [m for m in CUSTOM_ORDER if m in existing_methods]

# Warn about methods that are listed in the order but missing from the data
missing_in_data = set(CUSTOM_ORDER) - existing_methods
if missing_in_data:
    print(f"Warning: These methods are in CUSTOM_ORDER but missing in data: {missing_in_data}")

# Warn about the opposite case too: methods that have results but no slot in CUSTOM_ORDER
# would otherwise vanish from every heatmap without any notice.
missing_in_order = existing_methods - set(CUSTOM_ORDER)
if missing_in_order:
    print(f"Warning: These methods are in data but not in CUSTOM_ORDER and will not be plotted: {sorted(missing_in_order)}")

# Number of heatmaps needed to show every selected method
num_plots = ceil(len(comparison_methods) / COMPARISON_METHODS_PER_PLOT)

# Build one heatmap per group of comparison methods
for i in range(num_plots):
    start_idx = i * COMPARISON_METHODS_PER_PLOT
    end_idx = start_idx + COMPARISON_METHODS_PER_PLOT
    current_methods = comparison_methods[start_idx:end_idx]

    # Restrict the data to the current group
    current_df = df[df['comparison_method'].isin(current_methods)].copy()

    # Ordered categorical so that rows follow the fixed method order
    current_df['comparison_method'] = pd.Categorical(
        current_df['comparison_method'],
        categories=current_methods,
        ordered=True
    )

    # Sort strictly by the fixed order (returns a new frame instead of mutating in place)
    current_df = current_df.sort_values(['comparison_method', 'size', 'rate'])

    # Aggregate. observed=True restricts grouping on the categorical column to combinations
    # that actually occur, and does not depend on the pandas default for this option.
    combined_data = current_df.groupby(
        ['size', 'rate', 'comparison_method'], as_index=False, observed=True
    )['similarity'].mean()

    # Pivot to (method, size) rows x rate columns
    heatmap_data = combined_data.pivot_table(
        index=['comparison_method', 'size'],
        columns='rate',
        values='similarity',
        observed=True,
    )

    # Rows are drawn at integer positions and labelled with "method, size"
    y_values = heatmap_data.index
    y_tickvals = np.arange(len(y_values))

    # Draw the heatmap
    fig = go.Figure(data=go.Heatmap(
        z=heatmap_data.values,
        x=heatmap_data.columns,
        y=y_tickvals,
        colorscale='Viridis',
        colorbar=dict(title='Similarity'),
        zmin=0,
        zmax=1,
        hoverongaps=False,
        text=heatmap_data.values.round(2),
        texttemplate="%{text}",
        xgap=1,
        ygap=1,
    ))

    fig.update_layout(
        title='Similarity by Noise Level, Text Size, and Comparison Method',
        xaxis_title='Noise Level',
        yaxis_title='Comparison Method and Text Size',
        xaxis=dict(tickangle=45, tickmode='array', tickvals=heatmap_data.columns, ticktext=heatmap_data.columns),
        yaxis=dict(
            tickmode='array',
            tickvals=y_tickvals,
            ticktext=[f"{idx[0]}, {idx[1]}" for idx in y_values],
        ),
        autosize=False,
        width=1200,
        height=500 + 100 * len(current_methods),  # Height grows with the number of methods shown
    )

    fig.update_xaxes(tickangle=45)
    fig.show()