In [84]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by the notebook; most_similar() calls
# sbert_model.encode() on the sentences of each paragraph.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

In [85]:
import spacy

# Load English tokenizer, tagger, parser, NER, and word vectors.
# Only download the model when it is not installed yet: downloading on every
# run is slow, needs network access, and triggers the "restart kernel" warning.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the package cannot be found
    spacy.cli.download('en_core_web_sm')
    nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [86]:
from pathlib import Path
import itertools

def _problem_sort_key(problem_file):
    """Order problem files by numeric id; non-numeric ids sort last, alphabetically."""
    number = problem_file.name[len("problem-") : -len(".txt")]
    try:
        return (0, int(number), number)
    except ValueError:
        return (1, 0, number)


def read_problem_files(problem_folder, n=None):
    """
    reads problem files into dict
    :param problem_folder: path to folder holding the ``problem-<id>.txt`` files
    :param n: maximum number of files to read, taken in ascending problem-id
              order; ``None`` (default) reads every matching file
    :return: dict with problem-id (str) as key and the list of lines of the
             file (as returned by ``readlines``, line endings kept) as value
    """
    problems = {}
    # glob() yields files in filesystem order, which differs between machines;
    # sort first so that limiting to ``n`` files selects the same problems every run.
    files = sorted(Path(problem_folder).glob('problem-*.txt'), key=_problem_sort_key)
    for problem_file in itertools.islice(files, n):
        number = problem_file.name[len("problem-") : -len(".txt")]
        with open(problem_file, 'r', encoding="utf8") as fh:
            problems[number] = fh.readlines()
    return problems

In [87]:
from evaluation.evaluator import read_ground_truth_files

# Ground truth loaded from "pan21/train". The comparison loop further down
# indexes it as ground_truth["problem-<id>"]["changes"][i].
ground_truth = read_ground_truth_files("pan21/train")

In [93]:
from sklearn.metrics.pairwise import cosine_similarity

def most_similar(para1, para2):
    """
    Find the pair of sentences, one from each paragraph, whose SBERT
    embeddings have the highest cosine similarity.

    :param para1: sentences of the first paragraph (indexable sequence)
    :param para2: sentences of the second paragraph (indexable sequence)
    :return: tuple ``((sentence_from_para1, sentence_from_para2), matrix)`` where
             ``matrix[i, j]`` is the cosine similarity of ``para1[i]`` and ``para2[j]``
    """
    # Embed the sentences of each paragraph (para1 first, then para2)
    vectors_a, vectors_b = (
        sbert_model.encode(sentences, convert_to_numpy=True)
        for sentences in (para1, para2)
    )

    # All-pairs cosine similarity: rows follow para1, columns follow para2
    pairwise = cosine_similarity(vectors_a, vectors_b)

    # argmax() indexes the flattened matrix; dividing by the column count
    # recovers the (row, column) position of the best pair
    n_cols = pairwise.shape[1]
    row, col = divmod(pairwise.argmax(), n_cols)

    return (para1[row], para2[col]), pairwise


In [89]:
from transformers import BertTokenizer, BertModel
import torch

# Pretrained 'bert-base-uncased' tokenizer and encoder; get_word_embeddings()
# uses them to turn a sentence into per-token embeddings.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [90]:
def get_word_embeddings(sentence):
    """
    Compute contextual BERT embeddings for every word piece of a sentence.

    The sentence is encoded the way BERT expects its input, i.e. wrapped in
    the ``[CLS]`` / ``[SEP]`` special tokens, and truncated to the model's
    maximum input length so that very long sentences do not crash the
    forward pass. The rows belonging to the two special tokens are removed
    again before returning, so the result still has one row per word piece.

    :param sentence: the sentence text (str)
    :return: numpy array of shape ``(1, n_word_pieces, hidden_size)``;
             ``n_word_pieces`` is 0 for an empty sentence
    """
    # Tokenize, add special tokens and build the input tensors in one call
    encoded = bert_tokenizer(
        sentence,
        add_special_tokens=True,
        truncation=True,
        return_tensors="pt",
    )

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = bert_model(**encoded)

    # outputs[0] is the last hidden state (batch, seq, hidden);
    # drop the [CLS] (first) and [SEP] (last) positions
    return outputs[0][:, 1:-1, :].numpy()

In [91]:
import numpy as np


def _as_token_matrix(embeddings):
    """Flatten model output to a 2D ``(n_tokens, hidden)`` float array, dropping any batch axis."""
    embeddings = np.asarray(embeddings, dtype=float)
    return embeddings.reshape(-1, embeddings.shape[-1])


def _fold_excess(tokens, n_keep):
    """Keep the first ``n_keep`` token vectors and add the summed surplus / ``n_keep`` to each."""
    if len(tokens) <= n_keep:
        return tokens
    excess = tokens[n_keep:].sum(axis=0) / n_keep
    return tokens[:n_keep] + excess


def compare_sentences(sent1, sent2):
    """
    Token-level similarity of two sentences based on their word embeddings.

    Both sentences are embedded with ``get_word_embeddings``. The longer
    sentence (measured in tokens, not characters) is cut to the token count
    of the shorter one, and the embeddings of its surplus tokens are summed,
    divided by the kept token count and added to every kept token so their
    information is not lost. The result is the mean cosine similarity of the
    position-aligned token pairs.

    :param sent1: first sentence (str)
    :param sent2: second sentence (str)
    :return: mean cosine similarity as a float; ``0.0`` when either sentence
             yields no tokens
    """
    tokens1 = _as_token_matrix(get_word_embeddings(sent1))
    tokens2 = _as_token_matrix(get_word_embeddings(sent2))

    n_aligned = min(len(tokens1), len(tokens2))
    if n_aligned == 0:
        # Nothing to align, so there is no evidence of similarity
        return 0.0

    tokens1 = _fold_excess(tokens1, n_aligned)
    tokens2 = _fold_excess(tokens2, n_aligned)

    # Row-wise cosine similarity of aligned token pairs
    dots = np.einsum("ij,ij->i", tokens1, tokens2)
    norms = np.linalg.norm(tokens1, axis=1) * np.linalg.norm(tokens2, axis=1)
    # A zero vector has no direction; treat its similarity as 0 instead of dividing by zero
    sims = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    return float(sims.mean())



In [94]:
# Our sentences to encode
problems = read_problem_files("pan21/train", n=4)

for num, problem in problems.items():
    problem_num = f"problem-{num}"
    # Each element of `problem` is treated as one paragraph and split into sentence strings
    paragraphs = [[sent.text for sent in nlp(paragraph).sents] for paragraph in problem]
    for i in range(len(paragraphs) - 1):
        para1, para2 = (paragraphs[i], paragraphs[i + 1])
        if not para1 or not para2:
            # A paragraph without sentences cannot be encoded or compared
            print(f"{i}, {i+1}: skipped (empty paragraph)")
            continue

        most_similar_pair, similarity = most_similar(para1, para2)
        highest_similarity = compare_sentences(*most_similar_pair)
        # `similarity` is the full sentence-by-sentence matrix; report its maximum,
        # i.e. the SBERT similarity of the selected pair
        print(f"{i}, {i+1}: {similarity.max():.2f} {highest_similarity:.2f} {ground_truth[problem_num]['changes'][i]}")
    print(f"{ground_truth[problem_num]=}")

ValueError: Expected 2D array, got 1D array instead:
array=[ 7.85434470e-02  1.96468760e-03  3.33422869e-02  8.77521001e-03
 -4.53390777e-02 -4.55802456e-02 -7.99855664e-02  2.06299219e-02
 -4.89074476e-02  1.21216655e-01  4.43624742e-02 -1.71473436e-02
 -7.34435543e-02  1.75592247e-02  7.47129507e-03 -5.92222326e-02
  4.30323742e-02  1.35327969e-02 -4.57067341e-02 -4.03561480e-02
 -4.45564911e-02 -2.74355561e-02  4.07087915e-02 -4.56368327e-02
 -8.04605894e-03  4.93553914e-02 -4.01860364e-02  4.55614291e-02
 -1.91965618e-03 -4.65979287e-03  3.09489272e-03  5.99665269e-02
  8.00852664e-03 -8.86062346e-03 -4.91302721e-02  4.52211499e-02
  8.80126667e-04  2.64338590e-02 -5.85633405e-02 -3.89617234e-02
 -9.33800861e-02  3.04856151e-03  4.27806303e-02  7.59484544e-02
  5.80741279e-02  6.20034225e-02 -1.62492190e-02 -7.10250214e-02
 -4.57938202e-03 -1.61669217e-02 -5.50461598e-02 -6.02390580e-02
  9.49239917e-03 -2.60158647e-02  1.05940878e-01  4.16961201e-02
  1.44326501e-02 -4.01123660e-03 -5.42610418e-03 -9.34219081e-03
  2.69243550e-02 -7.97608793e-02 -5.06606773e-02  1.87236760e-02
  7.49365846e-03 -2.96333935e-02  2.77719670e-03  3.36912647e-02
  7.53014581e-04 -1.88033469e-02  2.33384967e-02  4.69999239e-02
  1.99357234e-02  1.26560451e-02  5.30578271e-02 -6.74495324e-02
 -3.71372998e-02  2.94899847e-02 -1.68878818e-03  1.33050531e-01
 -1.51929045e-02 -5.38188256e-02 -7.86362514e-02  1.45922247e-02
  4.23583202e-02  1.87905282e-02 -1.50500266e-02  4.95835356e-02
  8.02558009e-03 -2.71955431e-02 -1.75129455e-02 -2.10103225e-02
  6.90583661e-02  7.02152960e-03  2.76244935e-02  2.06379946e-02
 -2.98221726e-02 -3.23068611e-02 -1.10694207e-01  5.57552848e-04
 -7.71440491e-02  4.15511988e-02  3.56166959e-02  1.62933722e-01
  1.19610112e-02  9.28719118e-02 -3.36066633e-03 -2.67960280e-02
  4.52782400e-03  1.38919996e-02 -2.59832945e-02  2.32658833e-02
  7.45013654e-02  1.80887952e-02 -1.03554064e-02  2.45259833e-02
 -1.09235626e-02  5.72465770e-02 -4.35287207e-02  6.75813109e-02
  3.15894820e-02 -3.43150795e-02  5.34788333e-02  6.86918423e-02
  4.34934348e-02 -7.91294649e-02 -4.53179149e-04 -3.26011713e-33
 -5.32614440e-03  7.27346390e-02  8.19814354e-02  1.27394041e-02
 -5.71853062e-03 -2.27642860e-02  4.99541163e-02 -1.74720790e-02
  8.11892450e-02  1.89954340e-02 -2.75897961e-02  4.34273481e-02
  9.81900189e-03  1.30913891e-02  8.31644312e-02  5.91988862e-02
  1.00057535e-02  2.72928867e-02  2.16337536e-02 -2.97174919e-02
  7.40056261e-02 -7.29571432e-02  3.39088991e-04 -9.68732089e-02
 -4.22424003e-02  9.64537486e-02  4.06973250e-02  4.40896116e-02
  2.75625009e-02 -1.21332463e-02 -7.87702203e-02 -4.69022896e-03
 -6.68891147e-02 -4.55598376e-04  3.21712941e-02  3.68975252e-02
  1.87761465e-03 -7.53280818e-02  1.78162602e-03 -7.46820075e-03
  1.58666857e-02  2.80366894e-02 -4.30848114e-02 -2.91757416e-02
 -2.58724429e-02 -3.05009913e-02  9.17648152e-02 -5.93264289e-02
 -1.03117675e-01  3.42186652e-02 -8.67569521e-02 -1.44418003e-03
  4.55304570e-02 -7.79464990e-02  7.90045932e-02  4.46647592e-03
  4.48911339e-02 -1.17840664e-02 -5.46260960e-02  7.18143210e-02
 -9.25383195e-02  1.49806133e-02  5.37165888e-02  3.97401713e-02
 -3.47823910e-02  9.38686579e-02 -6.01636432e-02  5.66456653e-02
 -2.15999726e-02 -3.72781721e-03 -3.26290703e-03  2.98855528e-02
 -2.82956418e-02 -3.77835892e-02  4.54343781e-02  3.02150082e-02
  1.35732079e-02 -8.69066119e-02  6.83113337e-02 -2.32446026e-02
  5.76125719e-02 -7.47763440e-02 -4.13494594e-02 -2.75478028e-02
 -5.19266203e-02 -4.51220432e-04  4.42005694e-02 -2.92017069e-02
 -2.61825752e-02 -5.87460585e-02 -1.11021794e-01 -6.60078153e-02
 -2.05489714e-03  2.08891705e-02  1.66825689e-02  2.23292407e-33
 -1.09043598e-01 -7.33560696e-02 -9.07503739e-02  8.27418119e-02
  2.50198524e-02  6.24227412e-02 -1.62544306e-02 -5.05761132e-02
 -1.92256868e-02  1.27760554e-02  1.18429717e-02  2.26155836e-02
 -1.05573326e-01 -1.34549262e-02  1.21301496e-02  3.07270512e-02
 -4.24367711e-02 -2.55128182e-02  1.48091279e-02 -7.61433598e-03
 -5.98534755e-03  4.63716462e-02 -6.02022521e-02  8.41712207e-03
  5.51153906e-03  2.70036515e-02 -6.03280440e-02 -1.16188645e-01
 -7.35428184e-02  1.06609799e-01  2.57754121e-02 -1.44128818e-02
 -1.87259521e-02 -1.55468415e-02 -2.72010304e-02  5.04146377e-03
  8.03458467e-02  4.08201702e-02 -1.78139471e-02 -2.00522728e-02
 -4.88092117e-02  5.29215634e-02 -3.11869718e-02 -6.28764406e-02
 -4.98896576e-02  1.03716301e-02  7.82937855e-02  1.52016012e-02
  2.34473143e-02  2.71274652e-02 -1.19562047e-02 -5.31127155e-02
  2.58538555e-02 -1.04559183e-01 -1.63469333e-02  3.05774957e-02
  1.05268536e-02  6.81004077e-02  4.32667807e-02 -2.36933269e-02
  4.89221402e-02 -1.99437607e-02 -6.51230887e-02  7.91809708e-02
  7.22391251e-03  3.35722230e-02 -8.63705948e-02  1.35053277e-01
  3.61997485e-02 -8.89321715e-02 -6.42020255e-02  4.76485267e-02
  5.69777517e-03 -3.93761955e-02 -4.26181592e-02  4.08178419e-02
 -3.93211143e-03  1.15460670e-02 -2.19038688e-02 -4.50032540e-02
  2.97856312e-02  2.47805398e-02  5.63917533e-02 -3.52975866e-03
 -1.23554468e-03  3.13900784e-02 -5.01632690e-02 -8.86398405e-02
  1.52008682e-02 -4.18417938e-02 -1.03961956e-02  5.21899164e-02
  1.21143639e-01  8.00634176e-02 -1.31234929e-01 -1.96612682e-08
 -1.36951711e-02 -2.64070891e-02  6.54032975e-02 -7.64601678e-02
 -1.49700996e-02  9.22984555e-02 -2.56306231e-02  7.06283981e-03
 -6.01604208e-02  6.87206164e-02  1.07115861e-02 -1.53258638e-02
 -2.11847778e-02  9.87857282e-02  3.19834352e-02  1.13298930e-01
  4.36197594e-02 -3.36320326e-02 -2.49263011e-02  9.79353637e-02
  7.68016279e-02  8.86406284e-03 -3.90523858e-03  5.68657555e-03
 -2.08543129e-02 -2.62277406e-02 -3.19395140e-02  1.43050859e-02
  2.41444632e-02  2.59916726e-02  5.00014378e-03 -8.07576180e-02
  7.65641853e-02  3.84437181e-02  7.74244741e-02  6.59708902e-02
  5.34467883e-02 -1.42907137e-02 -5.29293604e-02 -4.89799269e-02
 -1.21562794e-01  3.94784994e-02 -1.55078736e-03  7.63981491e-02
  1.56554785e-02 -2.41052709e-04 -8.68636072e-02 -1.03422493e-01
 -5.08328490e-02  6.53656423e-02  7.61301592e-02  8.59087333e-02
 -1.93412602e-02 -6.11527637e-02  8.48016813e-02  1.02773510e-01
  2.16339715e-03 -1.49494642e-02 -1.49114393e-02 -1.80853605e-02
  2.90354788e-02  1.29992180e-02 -3.12690884e-02  3.69895063e-02].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.