In [1]:
from joblib import Memory

memory = Memory(".cache", verbose=0)

In [2]:
from transformers import BertTokenizer, BertModel
import torch
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

@memory.cache
def get_tokenizer_model():
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    bert_model = BertModel.from_pretrained('bert-base-cased')
    sbert_model = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

    return bert_tokenizer, bert_model, sbert_model

bert_tokenizer, bert_model, sbert_model = get_tokenizer_model()

In [65]:
from pathlib import Path
import itertools
from natsort import natsorted

@memory.cache
def read_problem_files(problem_folder, n=None):
    """
    reads ground truth files into dict
    :param truth_folder: path to folder holding ground truth files
    :return: dict of ground truth files with problem-id as key and file content as value
    """
    problems = []
    files = itertools.islice(natsorted(Path(problem_folder).glob('problem-*.txt')), n)
    for problem_file in files:
        # number = problem_file.name[len("problem-") : -len(".txt")]
        with open(problem_file, 'r', encoding="utf8") as fh:
            problems.append(fh.readlines())
    return problems

In [62]:
from evaluation.evaluator import read_ground_truth_files

@memory.cache
def cached_read_ground_truth(x):
    return read_ground_truth_files(x)

ground_truth = cached_read_ground_truth("pan21/train")

In [63]:
def get_embeddings(paragraph):
    # Tokenize the sentence
    tokens = bert_tokenizer.tokenize(paragraph)
    token_ids = bert_tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensor
    token_tensor = torch.tensor([token_ids])
    segment_ids = [0] * len(token_ids)
    segment_tensor = torch.tensor([segment_ids])

    # Get BERT model output
    with torch.no_grad():
        outputs = bert_model(token_tensor, segment_tensor)

    # Extract word embeddings from BERT model output
    return outputs[0]

In [66]:
problems = read_problem_files("pan21/train", n=5)

In [68]:
problems[0]

["As stephelton said in the comments to your question, vector math is extremely important for pretty much any 2D or 3D game. However, physics knowledge isn't necessary for a lot of simple games. There are physics-like concepts you should understand a bit about, like collision, but you won't need calculus or physics classes for that as long as you keep it simple. A lot of things you may want to do can be simulated simply enough that players won't care much, like friction, or sliding, or gravity. A decent grasp of physics will likely help in many situations though.\n",
 'It is probably not required to know physics in details when you\'re doing a game, but it definitely helps, especially if there are some \'virtual reality\' features in your game. A game like "From Dust" (Eric Chahi) is essentially physics simulation gamified, while "Another World" only need high-precision capture of real-life motion (so and requires little to no actual understanding of what happens).\n',
 "It is very lik

In [74]:
def get_paragraph_pairs(problem_text):
    # print(problem_text)
    paragraph_embeddings = [get_embeddings(para) for para in problem_text]
    # print(f"{[x.shape for x in paragraph_embeddings]}")
    return list(itertools.combinations(paragraph_embeddings, 2))


In [75]:
from tqdm.notebook import tqdm

@memory.cache
def get_problem_embeddings(problems):
    return [get_paragraph_pairs(problem_text) for problem_text in tqdm(problems)]

In [76]:
problems_embed = get_problem_embeddings(problems)

In [82]:
def get_simple_ground_truth(ground_truth, problem_numbers):
    simple_ground_truth = []
    for num in problem_numbers:
        task_3_ground_truth = ground_truth[f"problem-{num}"]["paragraph-authors"]
        simple_ground_truth.append(task_3_ground_truth)
    return simple_ground_truth


simple_ground_truth = get_simple_ground_truth(ground_truth, range(1, len(problems_embed)))

In [83]:
simple_ground_truth

[[1, 2, 2, 2, 2, 3, 2, 2],
 [1, 2, 2, 2, 1, 2],
 [1, 2, 2, 3, 3, 2],
 [1, 1, 1, 1, 1]]

In [89]:
def get_task_3_ground_truth(simple_ground_truth):
    task_gt = []
    for problem in simple_ground_truth:
        problem_gt = []
        for author1, author2 in itertools.combinations(problem, 2):
            problem_gt.append(int(author1 != author2))
        task_gt.append(problem_gt)
    return task_gt

task_3_ground_truth = get_task_3_ground_truth(simple_ground_truth)

In [92]:
# TODO: Invert the function get_task_3_ground_truth. Our model will output a bunch of binary labels which need to be converted to the task 3 ground truth format
# Ground truth format (gtf): [1, 2, 2, 2, 2, 3, 2, 2]
# Binary labels for comparisons (bl): [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0]
# Each binary label is the result of comparing two paragraphs. 1 means there was an author change, 0 means there was no author change
# For example, bl[0], is the result of comparing gtf[0]=1 and gtf[1]=2. 1 != 2, therefore bl[0] = 1. bl[1]=1 is the result of gtf[0] == gtf[2] (1 == 2)
def get_simple_ground_truth_from_task_3(task_3_ground_truth):
    pass

simple_ground_truth == get_simple_ground_truth_from_task_3(task_3_ground_truth)

False

In [93]:
for s, t in zip(simple_ground_truth, task_3_ground_truth):
    print(f"{s}")
    print(f"{t}")

[1, 2, 2, 2, 2, 3, 2, 2]
[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0]
[1, 2, 2, 2, 1, 2]
[1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1]
[1, 2, 2, 3, 3, 2]
[1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1]
[1, 1, 1, 1, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
# Add code to write out the embeddings (X) and ground truths (y) so we can train without having to rerun the preprocessing step