In [1]:
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import spacy

# Load English tokenizer, tagger, parser, NER, and word vectors
spacy.cli.download('en_core_web_sm')
nlp = spacy.load("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
from pathlib import Path
import itertools

def read_problem_files(problem_folder, n=None):
    """
    reads ground truth files into dict
    :param truth_folder: path to folder holding ground truth files
    :return: dict of ground truth files with problem-id as key and file content as value
    """
    problems = {}
    files = itertools.islice(Path(problem_folder).glob('problem-*.txt'), n)
    for problem_file in files:
        number = problem_file.name[len("problem-") : -len(".txt")]
        with open(problem_file, 'r', encoding="utf8") as fh:
            problems[number] = fh.readlines()
    return problems

In [4]:
from evaluation.evaluator import read_ground_truth_files

ground_truth = read_ground_truth_files("pan21/train")

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from numpy import unravel_index

def most_similar(para1, para2):
    embeddings1 = sbert_model.encode(para1, convert_to_numpy=True)
    embeddings2 = sbert_model.encode(para2, convert_to_numpy=True)

    # Calculate cosine similarity between all sentence pairs
    similarity_matrix = cosine_similarity(embeddings1, embeddings2)

    # Find the indices of the most similar sentence pair
    
    max_similarity = similarity_matrix.argmax()
    idx_1, idx_2 = unravel_index(max_similarity, similarity_matrix.shape)

    # Get the most similar sentence pair
    most_similar_pair = (para1[idx_1], para2[idx_2])

    return most_similar_pair, max_similarity


In [6]:
from transformers import BertTokenizer, BertModel
import torch

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [7]:
def get_word_embeddings(sentence):
    # Tokenize the sentence
    tokens = bert_tokenizer.tokenize(sentence)
    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensor
    input_ids = torch.tensor([token_ids])

    # Get BERT model output
    with torch.no_grad():
        outputs = bert_model(input_ids)

    # Extract word embeddings from BERT model output
    return outputs[0].numpy()

In [8]:
import numpy as np


def compare_sentences(sent1, sent2):
    diff = len(sent1) - len(sent2)
    embed1 = get_word_embeddings(sent1)
    embed2 = get_word_embeddings(sent2)

    if diff > 0:
        excess = np.sum(embed1[:-diff])/len(sent2)
        embed1 = [word + excess for word in embed1[:len(sent2)]]
    elif diff < 0:
        excess = np.sum(embed2[:-diff], axis=0)/len(sent1)
        embed2 = [word + excess for word in embed2[:len(sent1)]]

    mean_cos_sim = np.mean([cosine_similarity(w1, w2) for w1, w2 in zip(embed1, embed2)])

    return mean_cos_sim



In [53]:
import numpy as np
from scipy.stats import entropy
from scipy.signal import coherence

test_sent_1 = "I am a yellow apple"
test_sent_2 = "I am a red apple"
test_sent_3 = "You are a yellow apple"
test_sent_4 = "Totally different structure and meaning with longer words and stuff"

sent_fft = []
for sent in [test_sent_1, test_sent_2, test_sent_3, test_sent_4]:
    sent_embedding = get_word_embeddings(sent)[0]
    print(sent_embedding.shape)

    sent_fft.append(np.real(np.fft.fft(sent_embedding))[:5])

print(sent_fft[0].shape)
print(sent_fft[0])

# print(np.fft.rfftn([1, 2, 1, 1, 0, 3, 1]))

for i, spectra in enumerate(sent_fft):
    for other_spectra in [x for x in sent_fft if x is not spectra]: 
        print(f"{i}: {np.average(spectra - other_spectra)} {coherence(spectra, other_spectra, axis=1)=}")

(5, 768)
(5, 768)
(5, 768)
(10, 768)
(5, 768)
[[ -6.94663851  18.64850896  -5.99128207 ... -12.67487251  -5.99128207
   18.64850896]
 [ -6.60646214  16.70945255  -7.95637537 ... -14.13357979  -7.95637537
   16.70945255]
 [ -6.47090254  14.26914222  -7.87351536 ... -21.17497934  -7.87351536
   14.26914222]
 [ -6.61749523  17.02825924  -9.89820147 ... -17.64721523  -9.89820147
   17.02825924]
 [ -6.69677528  17.70033625  -7.05161242 ... -15.86485992  -7.05161242
   17.70033625]]
0: 0.14204271584749223 (array([0.        , 0.00390625, 0.0078125 , 0.01171875, 0.015625  ,
       0.01953125, 0.0234375 , 0.02734375, 0.03125   , 0.03515625,
       0.0390625 , 0.04296875, 0.046875  , 0.05078125, 0.0546875 ,
       0.05859375, 0.0625    , 0.06640625, 0.0703125 , 0.07421875,
       0.078125  , 0.08203125, 0.0859375 , 0.08984375, 0.09375   ,
       0.09765625, 0.1015625 , 0.10546875, 0.109375  , 0.11328125,
       0.1171875 , 0.12109375, 0.125     , 0.12890625, 0.1328125 ,
       0.13671875, 0.1406

In [98]:
# Our sentences to encode
problems = read_problem_files("pan21/train", n=4)

for num, problem in problems.items():
    problem_num = f"problem-{num}"
    paragraphs = [[sent.text for sent in nlp(paragraph).sents] for paragraph in problem]
    for i in range(len(paragraphs) - 1):
        para1, para2 = (paragraphs[i], paragraphs[i + 1])

        most_similar_pair, similarity = most_similar(para1, para2)
        highest_similarity = compare_sentences(*most_similar_pair)
        print(f"{i}, {i+1}: {similarity:.2f} {highest_similarity:.2f} {ground_truth[problem_num]['changes'][i]}")
    print(f"{ground_truth[problem_num]=}")

0, 1: 2.00 0.39 1
1, 2: 3.00 0.37 0
2, 3: 3.00 0.37 0
3, 4: 1.00 0.38 0
4, 5: 7.00 0.51 1
5, 6: 6.00 0.44 1
6, 7: 1.00 0.33 0
ground_truth[problem_num]={'authors': 3, 'structure': [12611, 5862, 1424, 5862], 'site': 'gamedev.stackexchange.com.7z', 'multi-author': 1, 'changes': [1, 0, 0, 0, 1, 1, 0], 'paragraph-authors': [1, 2, 2, 2, 2, 3, 2, 2]}
0, 1: 0.00 0.40 0
1, 2: 0.00 0.35 0
2, 3: 2.00 0.38 1
3, 4: 4.00 0.38 1
ground_truth[problem_num]={'authors': 2, 'structure': [2062, 2310, 2062], 'site': 'networkengineering.stackexchange.com.7z', 'multi-author': 1, 'changes': [0, 0, 1, 1], 'paragraph-authors': [1, 1, 1, 2, 1]}
0, 1: 1.00 0.46 1
1, 2: 0.00 0.50 1
2, 3: 2.00 0.46 1
3, 4: 0.00 0.37 0
4, 5: 0.00 0.40 0
5, 6: 0.00 0.37 1
6, 7: 1.00 0.35 1
7, 8: 2.00 0.36 1
8, 9: 1.00 0.32 1
ground_truth[problem_num]={'authors': 4, 'structure': [13726, 19152, 13726, 6436, 19152, 2477, 6436, 2477], 'site': 'serverfault.com.7z', 'multi-author': 1, 'changes': [1, 1, 1, 0, 0, 1, 1, 1, 1], 'paragraph-auth