In [None]:
from sampling import Sampler 

# Configure the project-local Sampler with the model directory, the two
# LLM-completed test pair files and the corpus file.
# NOTE(review): Sampler comes from the local `sampling` module, which is not
# visible here; what it loads at construction time should be confirmed there.
sampler = Sampler(
    model_path="../../models/SPhilBERTa",
    paraphrases_path="../../data/llm-completed-pairs/test_paraphrases.json",
    similar_sentences_path="../../data/llm-completed-pairs/test_similar_sentences.json",
    corpus_path="../../data/corpus/corpus/corpus.jsonl"
)

# Draw the "irrelevant" negatives, hand them to sampler.save with the
# "irrelevant" label and the negatives.json target, then print every pair for
# manual inspection (this prints the full result, however large it is).
negatives = sampler.sample_negatives_irrelevant()
sampler.save(negatives, label="irrelevant", output_file="negatives.json")
for pair in negatives:
    print(pair)

In [3]:
import json
import random
def sample_quotes(corpus_file, output_file, sample_size, seed=None):
    """Sample corpus sentences and write them as exact-quote training pairs.

    Reads a JSON Lines corpus (one JSON object per non-blank line, each with a
    "sentence" key), draws `sample_size` distinct lines at random without
    replacement, and writes a JSON list in which every sampled sentence is
    paired with itself under the label "quote".

    Args:
        corpus_file: Path to the UTF-8 JSONL corpus.
        output_file: Path of the JSON file to create or overwrite.
        sample_size: Number of sentences to draw.
        seed: Optional seed for a private random generator so the sample can
            be reproduced. When None, the module-level `random` state is used,
            exactly as before.

    Raises:
        ValueError: If `sample_size` is negative or larger than the number of
            sentences found in the corpus.
        KeyError: If a corpus record has no "sentence" key.
    """
    sentences = []
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) are not records.
            if line.strip():
                sentences.append(json.loads(line)["sentence"])

    # Fail with an actionable message before touching the output file.
    if sample_size > len(sentences):
        raise ValueError(
            f"sample_size ({sample_size}) exceeds the number of corpus "
            f"sentences ({len(sentences)}) in {corpus_file}"
        )

    rng = random if seed is None else random.Random(seed)
    sample = rng.sample(sentences, sample_size)

    output = [
        {"sentence1": sentence, "sentence2": sentence, "label": "quote"}
        for sentence in sample
    ]

    with open(output_file, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII text readable in the file,
        # matching how create_triplets writes its output; json.load yields the
        # same data either way.
        json.dump(output, f, indent=4, ensure_ascii=False)


In [8]:
# Write 5000 randomly sampled corpus sentences to quotes.json as self-paired
# "quote" examples. No seed is set here, so re-running this cell draws a
# different sample and overwrites the file.
sample_quotes("../../data/corpus/corpus/corpus.jsonl", "../../data/training/retriever/quotes.json", 5000)

In [18]:
# Sanity check: print the cosine similarity of every paraphrase pair whose
# score falls below 0.7. Only the score is printed, not the offending pair.
# NOTE(review): `paraphrases`, `model` and `util` are not defined in any cell
# visible here, so this cell fails on a fresh kernel unless they are created
# elsewhere. Presumably `model` is a SentenceTransformer and `util` is
# sentence_transformers.util -- confirm.
for pair in paraphrases:
    sent1, sent2 = pair
    emb1 = model.encode(sent1)
    emb2 = model.encode(sent2) 
    score = util.cos_sim(emb1, emb2).item() 
    if score < 0.7:
        print(score)

In [None]:
# Sanity check: print the cosine similarity of every similar-sentence pair
# whose score falls below 0.5. Only the score is printed, not the pair.
# NOTE(review): `similar_sentences`, `model` and `util` are not defined in any
# cell visible here, so this cell fails on a fresh kernel unless they are
# created elsewhere. Presumably `model` is a SentenceTransformer and `util` is
# sentence_transformers.util -- confirm.
for pair in similar_sentences:
    sent1, sent2 = pair
    emb1 = model.encode(sent1)
    emb2 = model.encode(sent2) 
    score = util.cos_sim(emb1, emb2).item() 
    if score < 0.5:
        print(score)

In [10]:
def create_triplets(file1_path, file2_path, output_path, negative_label="irrelevant"):
    """Join two labelled pair files into (anchor, positive, negative) triplets.

    Both input files are JSON lists of objects with "sentence1", "sentence2"
    and "label" keys. Every sentence that occurs in both files becomes an
    anchor. For each combination of one of its partners from file 1 and one
    from file 2, a triplet is emitted when exactly one of the two pairs is
    labelled `negative_label`: that pair's partner is the negative and the
    other pair's partner is the positive. Either file may be the one holding
    the negatives.

    Args:
        file1_path: Path to the first JSON pair file.
        file2_path: Path to the second JSON pair file.
        output_path: Path of the JSON triplet file to create or overwrite.
        negative_label: Label value that marks a pair as a negative example.

    Raises:
        KeyError: If an input record lacks one of the three expected keys.
    """

    def collect_pairs(data):
        """Index every sentence to the (partner, label) entries it occurs in."""
        pairs = {}
        for item in data:
            s1, s2, label = item["sentence1"], item["sentence2"], item["label"]
            pairs.setdefault(s1, []).append((s2, label))
            # Index the reverse direction as well, except for a self-pair:
            # there both directions are the same entry, and recording it twice
            # would duplicate every triplet built from it.
            if s2 != s1:
                pairs.setdefault(s2, []).append((s1, label))
        return pairs

    with open(file1_path, 'r', encoding='utf-8') as f:
        map1 = collect_pairs(json.load(f))
    with open(file2_path, 'r', encoding='utf-8') as f:
        map2 = collect_pairs(json.load(f))

    triplets = []
    for anchor, partners1 in map1.items():
        partners2 = map2.get(anchor)
        if not partners2:
            continue
        for other1, label1 in partners1:
            first_is_negative = label1 == negative_label
            for other2, label2 in partners2:
                # Exactly one side must be the negative. Two negatives give no
                # positive, and two differently labelled positives must not be
                # turned into a triplet with a bogus "negative".
                if first_is_negative == (label2 == negative_label):
                    continue
                if first_is_negative:
                    positive, negative = other2, other1
                else:
                    positive, negative = other1, other2
                triplets.append({
                    "anchor": anchor,
                    "positive": positive,
                    "negative": negative
                })

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(triplets, f, indent=2, ensure_ascii=False)


In [11]:
# Build triplets for the paraphrase set: the paraphrase pairs are joined with
# their matching irrelevant pairs and written to triplets_paraphrases.json.
create_triplets(
    "../../data/training/retriever/paraphrases_40k.json", 
    "../../data/training/retriever/irrelevant_for_paraphrases_40k.json", 
    "../../data/training/retriever/triplets_paraphrases.json"
)

In [12]:
# Build triplets for the similar-sentence set: the similar-sentence pairs are
# joined with their matching irrelevant pairs and written to
# triplets_similar_sentences.json.
create_triplets(
    "../../data/training/retriever/similar_sentences_40k.json", 
    "../../data/training/retriever/irrelevant_for_similar_sentences_40k.json", 
    "../../data/training/retriever/triplets_similar_sentences.json"
)

In [13]:
# Build triplets for the fuzzy-quote set: the fuzzy-quote pairs are joined
# with their matching irrelevant pairs and written to
# triplets_fuzzy_quotes.json.
create_triplets(
    "../../data/training/retriever/fuzzy_quotes_20k.json", 
    "../../data/training/retriever/irrelevant_for_fuzzy_quotes_20k.json", 
    "../../data/training/retriever/triplets_fuzzy_quotes.json"
)