# Pub 2

In [1]:
%pip install pandas numpy seaborn matplotlib spacy nltk datasketch sentence-transformers scikit-learn unisim gensim simhash Levenshtein tensorflow-hub transformers torch plotly nbformat

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('words')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/zuzuka28/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/zuzuka28/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zuzuka28/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Входные данные

В качестве примера был выбран небольшой отрывок из книги Приключения Тома Сойера

In [4]:
source_text = """
Then her conscience reproached her, and she yearned to say something kind and loving; but she judged that this would be construed into a confession that she had been in the wrong, and discipline forbade that. So she kept silence, and went about her affairs with a troubled heart. Tom sulked in a corner and exalted his woes. He knew that in her heart his aunt was on her knees to him, and he was morosely gratified by the consciousness of it. He would hang out no signals, he would take notice of none. He knew that a yearning glance fell upon him, now and then, through a film of tears, but he refused recognition of it. He pictured himself lying sick unto death and his aunt bending over him beseeching one little forgiving word, but he would turn his face to the wall, and die with that word unsaid. Ah, how would she feel then? And he pictured himself brought home from the river, dead, with his curls all wet, and his sore heart at rest. How she would throw herself upon him, and how her tears would fall like rain, and her lips pray God to give her back her boy and she would never, never abuse him any more! But he would lie there cold and white and make no sign—a poor little sufferer, whose griefs were at an end. He so worked upon his feelings with the pathos of these dreams, that he had to keep swallowing, he was so like to choke; and his eyes swam in a blur of water, which overflowed when he winked, and ran down and trickled from the end of his nose. And such a luxury to him was this petting of his sorrows, that he could not bear to have any worldly cheeriness or any grating delight intrude upon it; it was too sacred for such contact; and so, presently, when his cousin Mary danced in, all alive with the joy of seeing home again after an age-long visit of one week to the country, he got up and moved in clouds and darkness out at one door as she brought song and sunshine in at the other.
"""

Количество элементов текста

In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize

def text_element_counters(text: str, lang="english"):
    character_count = len(text)

    words = word_tokenize(text)
    word_count = len(words)

    sentences = sent_tokenize(text)
    sentence_count = len(sentences)
    
    return character_count, word_count, sentence_count


character_count, word_count, sentence_count = text_element_counters(source_text)

print(f"Количество символов: {character_count}")
print(f"Количество слов: {word_count}")
print(f"Количество предложений: {sentence_count}")

Количество символов: 1912
Количество слов: 413
Количество предложений: 13


## Модели для исследования

Определим модели, которые будем исследовать

In [None]:
from datasketch import MinHash
from simhash import Simhash
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Doc2Vec, Word2Vec, FastText
from gensim.models.doc2vec import TaggedDocument
from unisim import TextSim
from Levenshtein import ratio
from transformers import BertTokenizer, BertModel, RobertaTokenizer, RobertaModel
import torch
import tensorflow_hub as hub
from functools import wraps
import time

# Функция для вычисления сходства с использованием MinHash
def minhash_similarity(text1, text2):
    m1 = MinHash()
    m2 = MinHash()
    for word in text1.split():
        m1.update(word.encode('utf8'))
    for word in text2.split():
        m2.update(word.encode('utf8'))
    return m1.jaccard(m2)

# Функция для вычисления сходства с использованием SimHash
def simhash_similarity(text1, text2):
    hash1 = Simhash(text1.split())
    hash2 = Simhash(text2.split())
    return 1 - hash1.distance(hash2) / 64  # Нормализация расстояния в диапазон [0, 1]

st_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Функция для вычисления сходства с использованием Sentence Transformers
def sentence_transformers_similarity(text1, text2):
    model = st_model
    embeddings = model.encode([text1, text2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

# Функция для вычисления сходства с использованием CountVectorizer
def count_vectorizer_similarity(text1, text2):
    vectorizer = CountVectorizer().fit([text1, text2])
    vectors = vectorizer.transform([text1, text2])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

# Функция для вычисления сходства с использованием TfidfVectorizer
def tfidf_vectorizer_similarity(text1, text2):
    vectorizer = TfidfVectorizer().fit([text1, text2])
    vectors = vectorizer.transform([text1, text2])
    return cosine_similarity(vectors[0], vectors[1])[0][0]

# Функция для вычисления сходства с использованием Doc2Vec
def doc2vec_similarity(text1, text2):
    documents = [TaggedDocument(doc.split(), [i]) for i, doc in enumerate([text1, text2])]
    model = Doc2Vec(documents, vector_size=50, window=5, min_count=1, workers=4)
    vector1 = model.infer_vector(text1.split())
    vector2 = model.infer_vector(text2.split())
    return cosine_similarity([vector1], [vector2])[0][0]

# Функция для вычисления сходства с использованием Word2Vec
def word2vec_similarity(text1, text2):
    sentences = [text1.split(), text2.split()]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    vector1 = np.mean([model.wv[word] for word in text1.split() if word in model.wv], axis=0)
    vector2 = np.mean([model.wv[word] for word in text2.split() if word in model.wv], axis=0)
    return cosine_similarity([vector1], [vector2])[0][0]

# Функция для вычисления сходства с использованием FastText
def fasttext_similarity(text1, text2):
    sentences = [text1.split(), text2.split()]
    model = FastText(sentences, vector_size=100, window=5, min_count=1, workers=4)
    vector1 = np.mean([model.wv[word] for word in text1.split() if word in model.wv], axis=0)
    vector2 = np.mean([model.wv[word] for word in text2.split() if word in model.wv], axis=0)
    return cosine_similarity([vector1], [vector2])[0][0]

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Функция для вычисления сходства с использованием BERT
def bert_similarity(text1, text2):
    tokenizer =bert_tokenizer
    model = bert_model
    inputs1 = tokenizer(text1, return_tensors='pt', truncation=True, padding=True)
    inputs2 = tokenizer(text2, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs1 = model(**inputs1).last_hidden_state.mean(dim=1)
        outputs2 = model(**inputs2).last_hidden_state.mean(dim=1)
    return cosine_similarity(outputs1.numpy(), outputs2.numpy())[0][0]

use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Функция для вычисления сходства с использованием Universal Sentence Encoder (USE)
def use_similarity(text1, text2):
    model = use_model
    embeddings = model([text1, text2])
    return cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

# Функция для вычисления сходства с использованием Jaccard Similarity
def jaccard_similarity(text1, text2):
    set1 = set(text1.split())
    set2 = set(text2.split())
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

# Функция для вычисления сходства с использованием Levenshtein Distance
def levenshtein_similarity(text1, text2):
    return ratio(text1, text2)

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')

# Функция для вычисления сходства с использованием RoBERTa
def roberta_similarity(text1, text2):
    tokenizer = roberta_tokenizer
    model = roberta_model
    inputs1 = tokenizer(text1, return_tensors='pt', truncation=True, padding=True)
    inputs2 = tokenizer(text2, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs1 = model(**inputs1).last_hidden_state.mean(dim=1)
        outputs2 = model(**inputs2).last_hidden_state.mean(dim=1)
    return cosine_similarity(outputs1.numpy(), outputs2.numpy())[0][0]

# Функция для вычисления сходства с использованием UniSim
def unisim_similarity(text1, text2):
    text_sim = TextSim()
    return text_sim.similarity(text1, text2)

# Функция для сравнения двух текстов с использованием заданных методов
def compare_texts(text1, text2, methods):
    results = {}
    for name, func in methods.items():
        
        try:
            results[name] = func(text1, text2)
        except Exception as e:
            results[name] = f"Error: {str(e)}"

    results_df = pd.DataFrame(results).T
    results_df.columns = ["similarity", "time_ms"]

    return results_df

def measure_time(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        return result, execution_time
    return wrapper

similarity_functions = {
    "MinHash": measure_time(minhash_similarity),
    "SimHash": measure_time(simhash_similarity),
    "Sentence Transformers": measure_time(sentence_transformers_similarity),
    "Count Vectorizer": measure_time(count_vectorizer_similarity),
    "Tfidf Vectorizer": measure_time(tfidf_vectorizer_similarity),
    "Word2Vec": measure_time(word2vec_similarity),
    "FastText": measure_time(fasttext_similarity),
    "BERT": measure_time(bert_similarity),
    "USE": measure_time(use_similarity),
    "Jaccard": measure_time(jaccard_similarity),
    "Levenshtein": measure_time(levenshtein_similarity),
    "RoBERTa": measure_time(roberta_similarity),
    "UniSim": measure_time(unisim_similarity),
}

  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
compare_texts(source_text, source_text, similarity_functions)

Unnamed: 0,similarity,time_ms
MinHash,1.0,0.005789
SimHash,1.0,0.001503
Sentence Transformers,1.0,0.186364
Count Vectorizer,1.0,0.003704
Tfidf Vectorizer,1.0,0.001496
Word2Vec,1.0,0.00763
FastText,1.0,0.639573
BERT,1.0,0.292421
USE,1.0,0.203069
Jaccard,1.0,8.9e-05


## Эксперимент

In [29]:
exp_results = {}

### На уровне символов

#### Проверка устойчивости к замене символов

In [30]:
from text_augmentor import CharacterAugmentor

char_aug = CharacterAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = char_aug.qwerty_typo_substitution(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["char_qwerty_typo_substitution"] = results

                       similarity   time_ms
MinHash                  0.843750  0.007410
SimHash                  0.968750  0.001082
Sentence Transformers    0.884734  0.068876
Count Vectorizer         0.992550  0.015477
Tfidf Vectorizer         0.986873  0.001800
Word2Vec                 0.996546  0.015549
FastText                 0.999065  0.611537
BERT                     0.979775  0.256388
USE                      0.987813  0.004292
Jaccard                  0.899123  0.000102
Levenshtein              0.992678  0.000094
RoBERTa                  0.998773  0.326380
UniSim                   0.997376  0.057321
                       similarity   time_ms
MinHash                  0.835938  0.004860
SimHash                  0.937500  0.000698
Sentence Transformers    0.837851  0.038899
Count Vectorizer         0.986946  0.001218
Tfidf Vectorizer         0.976910  0.001196
Word2Vec                 0.994489  0.008179
FastText                 0.997372  0.614231
BERT                     0.96437

In [31]:
import plotly.express as px
import plotly.graph_objects as go

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к изменению символов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)


#### Проверка устойчивости к удалению символов

In [39]:
from text_augmentor import CharacterAugmentor

char_aug = CharacterAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = char_aug.random_deletion(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["char_random_deletion"] = results

                       similarity   time_ms
MinHash                  0.835938  0.008626
SimHash                  0.937500  0.001579
Sentence Transformers    0.961242  0.121170
Count Vectorizer         0.990762  0.001406
Tfidf Vectorizer         0.984739  0.001171
Word2Vec                 0.994363  0.007633
FastText                 0.996557  0.645811
BERT                     0.978562  0.233500
USE                      0.978256  0.003948
Jaccard                  0.858369  0.000094
Levenshtein              0.995007  0.000058
RoBERTa                  0.998806  0.234059
UniSim                   0.992651  0.060930
                       similarity   time_ms
MinHash                  0.757812  0.004765
SimHash                  0.937500  0.000879
Sentence Transformers    0.941787  0.038103
Count Vectorizer         0.982417  0.001486
Tfidf Vectorizer         0.969738  0.001247
Word2Vec                 0.987620  0.008111
FastText                 0.994834  0.617566
BERT                     0.96510

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к удалению символов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Проверка устойчивости к смене мест

In [None]:
from text_augmentor import CharacterAugmentor

char_aug = CharacterAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = char_aug.neighboring_swap(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["char_neighboring_swap"] = results

                       similarity   time_ms
MinHash                  0.789062  0.014152
SimHash                  0.921875  0.019189
Sentence Transformers    0.928980  0.050172
Count Vectorizer         0.986076  0.001220
Tfidf Vectorizer         0.975855  0.001500
Word2Vec                 0.993037  0.007907
FastText                 0.997403  0.613554
BERT                     0.971531  0.296800
USE                      0.973609  0.005505
Jaccard                  0.814815  0.000096
Levenshtein              0.990063  0.000079
RoBERTa                  0.997895  0.277507
UniSim                   0.995929  0.048929
                       similarity   time_ms
MinHash                  0.656250  0.004710
SimHash                  0.953125  0.000819
Sentence Transformers    0.687758  0.038599
Count Vectorizer         0.973819  0.001202
Tfidf Vectorizer         0.955322  0.001187
Word2Vec                 0.984303  0.009003
FastText                 0.988263  0.610960
BERT                     0.92921

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость смене мест символов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Проверка устойчивости к изменению кейса символов

In [None]:
from text_augmentor import CharacterAugmentor

char_aug = CharacterAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = char_aug.case_substitution(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["char_case_substitution"] = results

                       similarity   time_ms
MinHash                  0.906250  0.005831
SimHash                  0.968750  0.002270
Sentence Transformers    1.000000  0.040412
Count Vectorizer         1.000000  0.001181
Tfidf Vectorizer         1.000000  0.001168
Word2Vec                 0.996804  0.008240
FastText                 0.998871  0.600063
BERT                     1.000000  0.354513
USE                      1.000000  0.004607
Jaccard                  0.873913  0.000112
Levenshtein              0.991109  0.000075
RoBERTa                  0.998635  0.348352
UniSim                   0.996482  0.054999
                       similarity   time_ms
MinHash                  0.851562  0.004587
SimHash                  0.953125  0.000742
Sentence Transformers    1.000000  0.040620
Count Vectorizer         1.000000  0.002045
Tfidf Vectorizer         1.000000  0.001284
Word2Vec                 0.995154  0.008327
FastText                 0.998522  0.612521
BERT                     1.00000

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к смене кейса символов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Проверка устойчивости к дублированию симоволов

In [None]:
from text_augmentor import CharacterAugmentor

char_aug = CharacterAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = char_aug.character_repetition(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["char_character_repetition"] = results

                       similarity   time_ms
MinHash                  0.898438  0.040320
SimHash                  0.984375  0.002453
Sentence Transformers    0.911698  0.046293
Count Vectorizer         0.993688  0.001208
Tfidf Vectorizer         0.988931  0.001194
Word2Vec                 0.997521  0.007502
FastText                 0.999342  0.609342
BERT                     0.986841  0.289585
USE                      0.990321  0.004497
Jaccard                  0.907895  0.000099
Levenshtein              0.988880  0.000160
RoBERTa                  0.997132  0.299143
UniSim                   0.993858  0.050742
                       similarity   time_ms
MinHash                  0.796875  0.004385
SimHash                  0.937500  0.000648
Sentence Transformers    0.911447  0.037990
Count Vectorizer         0.984454  0.001208
Tfidf Vectorizer         0.971695  0.001187
Word2Vec                 0.992810  0.007776
FastText                 0.998119  0.609739
BERT                     0.94699

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к дублированию символов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

### На уровне слов

#### Устойчивость к смене мест слов (соседи)

In [None]:
from text_augmentor import WordAugmentor

word_aug = WordAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = word_aug.neighboring_swap(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["word_neighboring_swap"] = results

                       similarity   time_ms
MinHash                  0.718750  0.004391
SimHash                  0.812500  0.000703
Sentence Transformers    0.996602  0.036884
Count Vectorizer         1.000000  0.001134
Tfidf Vectorizer         1.000000  0.001148
Word2Vec                 0.912921  0.007309
FastText                 0.828510  0.584669
BERT                     0.998949  0.294000
USE                      0.998118  0.004108
Jaccard                  0.700405  0.000092
Levenshtein              0.983445  0.000076
RoBERTa                  0.994151  0.269703
UniSim                   0.988504  0.047944
                       similarity   time_ms
MinHash                  0.718750  0.004651
SimHash                  0.812500  0.000772
Sentence Transformers    0.997090  0.018839
Count Vectorizer         1.000000  0.001148
Tfidf Vectorizer         1.000000  0.001162
Word2Vec                 0.905373  0.007811
FastText                 0.817074  0.610039
BERT                     0.99745

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к смене мест слов (соседи)",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к замене слов

In [40]:
from text_augmentor import WordAugmentor

word_aug = WordAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = word_aug.random_substitution(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["word_random_substitution"] = results

                       similarity   time_ms
MinHash                  0.671875  0.004723
SimHash                  0.828125  0.000734
Sentence Transformers    0.976350  0.045298
Count Vectorizer         0.998500  0.001334
Tfidf Vectorizer         0.997042  0.001549
Word2Vec                 0.914657  0.007999
FastText                 0.827268  0.670816
BERT                     0.997904  0.277666
USE                      0.996754  0.005072
Jaccard                  0.681275  0.000121
Levenshtein              0.978395  0.000082
RoBERTa                  0.994531  0.259304
UniSim                   0.986796  0.052762
                       similarity   time_ms
MinHash                  0.703125  0.004779
SimHash                  0.796875  0.000807
Sentence Transformers    0.958204  0.038250
Count Vectorizer         0.995639  0.001246
Tfidf Vectorizer         0.993131  0.001503
Word2Vec                 0.900145  0.008361
FastText                 0.794841  0.609474
BERT                     0.99138

In [41]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к замене слов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к удалению слов

In [None]:
from text_augmentor import WordAugmentor

word_aug = WordAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = word_aug.random_deletion(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["word_random_deletion"] = results

                       similarity   time_ms
MinHash                  0.718750  0.004500
SimHash                  0.812500  0.000766
Sentence Transformers    0.993771  0.039678
Count Vectorizer         0.999262  0.001203
Tfidf Vectorizer         0.999018  0.001136
Word2Vec                 0.908054  0.006992
FastText                 0.831316  0.609086
BERT                     0.998571  0.259454
USE                      0.997337  0.003988
Jaccard                  0.696356  0.000094
Levenshtein              0.984671  0.000076
RoBERTa                  0.994534  0.294213
UniSim                   0.990447  0.048649
                       similarity   time_ms
MinHash                  0.695312  0.004577
SimHash                  0.812500  0.000713
Sentence Transformers    0.992961  0.036575
Count Vectorizer         0.998130  0.001141
Tfidf Vectorizer         0.996915  0.001162
Word2Vec                 0.898464  0.007873
FastText                 0.810329  0.647204
BERT                     0.99854

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к удалению слов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к добавлению слов

In [None]:
from text_augmentor import WordAugmentor

word_aug = WordAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = word_aug.random_insertion(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["word_random_insertion"] = results

                       similarity   time_ms
MinHash                  0.679688  0.004554
SimHash                  0.796875  0.001006
Sentence Transformers    0.960883  0.018026
Count Vectorizer         0.999001  0.001209
Tfidf Vectorizer         0.998030  0.001154
Word2Vec                 0.902010  0.007210
FastText                 0.812892  0.606038
BERT                     0.997124  0.246261
USE                      0.997337  0.004061
Jaccard                  0.689243  0.000106
Levenshtein              0.976233  0.000078
RoBERTa                  0.994164  0.261889
UniSim                   0.984577  0.089773
                       similarity   time_ms
MinHash                  0.687500  0.004812
SimHash                  0.812500  0.001105
Sentence Transformers    0.951362  0.039034
Count Vectorizer         0.998005  0.001187
Tfidf Vectorizer         0.996071  0.001175
Word2Vec                 0.897659  0.008562
FastText                 0.807748  0.609164
BERT                     0.99243

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к добавлению слов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к дублированию слов

In [None]:
from text_augmentor import WordAugmentor

word_aug = WordAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = word_aug.repeat_word(text=source_text, augmentation_rate=rate)
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["word_repeat_word"] = results

                       similarity   time_ms
MinHash                  0.710938  0.004523
SimHash                  0.812500  0.000885
Sentence Transformers    1.000000  0.041134
Count Vectorizer         0.998501  0.001285
Tfidf Vectorizer         0.997522  0.001236
Word2Vec                 0.908099  0.007850
FastText                 0.828923  0.605641
BERT                     0.997272  0.279938
USE                      0.996741  0.004271
Jaccard                  0.685259  0.000105
Levenshtein              0.984536  0.000077
RoBERTa                  0.994490  0.286129
UniSim                   0.986111  0.051179
                       similarity   time_ms
MinHash                  0.671875  0.004644
SimHash                  0.812500  0.000795
Sentence Transformers    0.976067  0.044694
Count Vectorizer         0.996567  0.001206
Tfidf Vectorizer         0.993846  0.001160
Word2Vec                 0.903482  0.007992
FastText                 0.823163  0.608835
BERT                     0.99010

In [None]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к дублированию слов",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

### Проверка на уровне предложений

#### Устойчивость к изменению порядка предложений (соседи)

In [44]:
from text_augmentor import SentenceAugmentor

sent_aug = SentenceAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed =  " ".join(sent_aug.neighboring_swap(sentences=sent_aug.split_into_sentences(source_text), augmentation_rate=rate))
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["sent_neighboring_swap"] = results

                       similarity   time_ms
MinHash                  1.000000  0.007038
SimHash                  1.000000  0.002334
Sentence Transformers    1.000000  0.078617
Count Vectorizer         1.000000  0.001597
Tfidf Vectorizer         1.000000  0.001435
Word2Vec                 1.000000  0.007394
FastText                 1.000000  0.630901
BERT                     1.000000  0.234735
USE                      0.999535  0.004490
Jaccard                  1.000000  0.000101
Levenshtein              0.999738  0.000006
RoBERTa                  0.999937  0.234381
UniSim                   0.999960  0.052560
                       similarity   time_ms
MinHash                  1.000000  0.004612
SimHash                  1.000000  0.000787
Sentence Transformers    1.000000  0.017743
Count Vectorizer         1.000000  0.001199
Tfidf Vectorizer         1.000000  0.001168
Word2Vec                 1.000000  0.007700
FastText                 1.000000  0.621601
BERT                     1.00000

In [45]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к изменению порядка предложений (соседи)",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к замене предложений

In [46]:
from text_augmentor import SentenceAugmentor

sent_aug = SentenceAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed = " ".join(sent_aug.random_substitution(sentences=sent_aug.split_into_sentences(source_text), augmentation_rate=rate))
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["sent_random_substitution"] = results

                       similarity   time_ms
MinHash                  1.000000  0.006264
SimHash                  1.000000  0.002726
Sentence Transformers    1.000000  0.059111
Count Vectorizer         1.000000  0.001297
Tfidf Vectorizer         1.000000  0.001207
Word2Vec                 1.000000  0.020184
FastText                 1.000000  0.647372
BERT                     1.000000  0.255665
USE                      0.999535  0.004044
Jaccard                  1.000000  0.000109
Levenshtein              0.999738  0.000007
RoBERTa                  0.999937  0.235630
UniSim                   0.999960  0.047582
                       similarity   time_ms
MinHash                  1.000000  0.004371
SimHash                  1.000000  0.001067
Sentence Transformers    1.000000  0.018560
Count Vectorizer         1.000000  0.001256
Tfidf Vectorizer         1.000000  0.001257
Word2Vec                 1.000000  0.007989
FastText                 1.000000  0.660166
BERT                     1.00000

In [47]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к замене предложений",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к добавлению предложений

In [48]:
from text_augmentor import SentenceAugmentor

sent_aug = SentenceAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed =  " ".join(sent_aug.random_insertion(sentences=sent_aug.split_into_sentences(source_text), augmentation_rate=rate))
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["sent_random_insertion"] = results

                       similarity   time_ms
MinHash                  1.000000  0.006752
SimHash                  1.000000  0.001566
Sentence Transformers    1.000000  0.050151
Count Vectorizer         1.000000  0.001321
Tfidf Vectorizer         1.000000  0.003054
Word2Vec                 1.000000  0.008847
FastText                 1.000000  0.710815
BERT                     1.000000  0.235410
USE                      0.999535  0.004694
Jaccard                  1.000000  0.000114
Levenshtein              0.999738  0.000022
RoBERTa                  0.999937  0.227075
UniSim                   0.999960  0.051889
                       similarity   time_ms
MinHash                  1.000000  0.004526
SimHash                  1.000000  0.001007
Sentence Transformers    1.000000  0.041588
Count Vectorizer         1.000000  0.001319
Tfidf Vectorizer         1.000000  0.001508
Word2Vec                 1.000000  0.010273
FastText                 1.000000  0.618147
BERT                     1.00000

In [49]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к добавлению предложений",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

#### Устойчивость к удалению предложений

In [50]:
from text_augmentor import SentenceAugmentor

sent_aug = SentenceAugmentor()

results = {}

for rate in [i / 100 for i in range(1, 100, 1)]:
    changed =  " ".join(sent_aug.random_deletion(sentences=sent_aug.split_into_sentences(source_text), augmentation_rate=rate))
    compared = compare_texts(source_text, changed, similarity_functions)
    results[rate] = compared
    print(compared)

results = pd.concat(results).reset_index()
results.rename(columns={"level_1": "method", "level_0": "rate"}, inplace=True)

exp_results["sent_random_deletion"] = results

                       similarity   time_ms
MinHash                  1.000000  0.006349
SimHash                  1.000000  0.001959
Sentence Transformers    1.000000  0.060714
Count Vectorizer         1.000000  0.001214
Tfidf Vectorizer         1.000000  0.001145
Word2Vec                 1.000000  0.023007
FastText                 1.000000  0.616832
BERT                     1.000000  0.247189
USE                      0.999535  0.004560
Jaccard                  1.000000  0.000117
Levenshtein              0.999738  0.000007
RoBERTa                  0.999937  0.312849
UniSim                   0.999960  0.056685
                       similarity   time_ms
MinHash                  1.000000  0.004683
SimHash                  1.000000  0.001329
Sentence Transformers    1.000000  0.044822
Count Vectorizer         1.000000  0.001211
Tfidf Vectorizer         1.000000  0.001166
Word2Vec                 1.000000  0.008533
FastText                 1.000000  0.635990
BERT                     1.00000

In [51]:
import plotly.express as px

fig = px.line(
    results,
    x="rate",
    y="similarity",
    color="method",
    title="Устойчивость к удалению предложений",
    labels={"rate": "Rate", "similarity": "Similarity", "method": "Method"},
    line_shape="linear",
)

fig.update_layout(
    xaxis_title="Rate",
    yaxis_title="Similarity",
    legend_title="Method",
    hovermode="x unified",
    template="plotly_white", 
)

### Проверка с другим текстом

In [52]:
another_text = """We are told by historians of old, that Ingild had four sons, of whom three perished in war,
while OLAF alone resome say that Olaf was the son of Ingild's sister,
though this opinion iin knowledge of his deeds, which
are dim with the dust of antiquifirst eto have control of the affairs
of the sea, earned disgrace from his continual defeats in roving."""

compare_texts(source_text, another_text, similarity_functions)

Unnamed: 0,similarity,time_ms
MinHash,0.046875,0.005234
SimHash,0.609375,0.001038
Sentence Transformers,0.156701,0.0223
Count Vectorizer,0.351136,0.001089
Tfidf Vectorizer,0.231454,0.001059
Word2Vec,0.414463,0.006961
FastText,0.541282,0.687603
BERT,0.759803,0.166444
USE,0.207784,0.003742
Jaccard,0.06,6.5e-05
