In [10]:
import json
import os
import random

In [None]:
from sampling import Sampler 

# Configure the project-local Sampler with the SPhilBERTa model directory, the
# two LLM-completed pair files (test split) and the JSONL corpus.
sampler = Sampler(
    model_path="../../models/SPhilBERTa",
    paraphrases_path="../../data/llm-completed-pairs/test_paraphrases.json",
    similar_sentences_path="../../data/llm-completed-pairs/test_similar_sentences.json",
    corpus_path="../../data/corpus/corpus/corpus.jsonl"
)

# Presumably draws "irrelevant" negative pairs (inferred from the method name;
# Sampler is defined outside this notebook -- TODO confirm).
negatives = sampler.sample_negatives_irrelevant()
# "negatives.json" is a relative path, so it resolves against the kernel's
# current working directory.
sampler.save(negatives, label="irrelevant", output_file="negatives.json")
# Prints every sampled pair, one per line.
# NOTE(review): this cell has no execution count (In [None]), so it was not run
# in the saved notebook state.
for pair in negatives:
    print(pair)

In [3]:
def sample_quotes(corpus_file, output_file, sample_size):
    """Write randomly chosen corpus sentences as self-pairs labelled "quote".

    Args:
        corpus_file: Path to a JSONL file; every non-blank line must be a JSON
            object with a "sentence" key.
        output_file: Path of the JSON file to (over)write.
        sample_size: Number of sentences to draw without replacement.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``sample_size`` is
            larger than the number of sentences read (or negative).
    """
    # Collect the sentence text of every non-blank corpus line.
    with open(corpus_file, 'r', encoding='utf-8') as corpus_handle:
        sentences = [
            json.loads(raw_line)["sentence"]
            for raw_line in corpus_handle
            if raw_line.strip()
        ]

    chosen = random.sample(sentences, sample_size)

    # A "quote" pairs a sentence with an exact copy of itself.
    quote_pairs = [
        {"sentence1": text, "sentence2": text, "label": "quote"}
        for text in chosen
    ]

    with open(output_file, 'w', encoding='utf-8') as out_handle:
        json.dump(quote_pairs, out_handle, indent=4)


In [8]:
# Draw 5000 corpus sentences and write them as "quote" self-pairs for retriever training.
sample_quotes("../../data/corpus/corpus/corpus.jsonl", "../../data/training/retriever/quotes.json", 5000)

In [35]:
def load(file_path):
    """Read sentences or sentence pairs from disk and return them shuffled.

    The file type is taken from the (case-insensitive) extension:

    * ``.jsonl`` -- one JSON object per non-blank line; the result is a list
      of each object's "sentence" value.
    * ``.json`` -- a JSON array of objects; the result is a list of
      ``(sentence1, sentence2)`` tuples. Records where either key is missing
      or null are skipped.

    Args:
        file_path: Path to the ``.jsonl`` or ``.json`` file.

    Returns:
        A list in random order (shuffled with ``random.shuffle``).

    Raises:
        ValueError: If the extension is neither ``.json`` nor ``.jsonl``.
    """
    suffix = os.path.splitext(file_path)[1].lower()
    # Reject unknown formats before touching the file system.
    if suffix not in ('.jsonl', '.json'):
        raise ValueError("Unsupported file extension. Only .json and .jsonl files are supported.")

    items = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        if suffix == '.jsonl':
            for raw_line in handle:
                if not raw_line.strip():
                    continue
                items.append(json.loads(raw_line)["sentence"])
        else:
            for entry in json.load(handle):
                first, second = entry.get("sentence1"), entry.get("sentence2")
                if first is None or second is None:
                    continue
                items.append((first, second))

    random.shuffle(items)
    return items

def save(pairs, output_file, label="irrelevant"):
    """Write sentence pairs to a JSON file, randomly flipping each pair's order.

    Each pair is stored as ``{"sentence1": ..., "sentence2": ..., "label": label}``.
    With probability 0.5 a pair is written as given, otherwise reversed, so
    neither slot is systematically the "original" sentence. The order of the
    pairs themselves is not changed.

    Args:
        pairs: Iterable of 2-item sequences (tuples or lists) of sentences.
        output_file: Path of the JSON file to (over)write.
        label: Value stored under "label" for every pair. Defaults to
            "irrelevant", which was previously the only possible value.
    """
    output = []
    for pair in pairs:
        # One random draw per pair decides its orientation.
        sentence1, sentence2 = pair if random.random() < 0.5 else pair[::-1]
        output.append({
            "sentence1": sentence1,
            "sentence2": sentence2,
            "label": label
        })
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(output, f, indent=4)

def sample_irrelevant_for_pairs(corpus_file, pairs_file):
    """Build one "irrelevant" negative for every pair that has a corpus sentence.

    For each ``(s1, s2)`` in ``pairs_file`` the member found verbatim in the
    corpus (``s1`` is checked first, then ``s2``) is used as the anchor and is
    paired with a randomly drawn corpus sentence. Pairs with no member in the
    corpus are skipped.

    The random sentence is never the anchor or its partner: drawing either
    would yield a pair that is identical or actually related rather than
    irrelevant. If the corpus holds no other sentence, the pair is skipped.

    Args:
        corpus_file: Path to the ``.jsonl`` corpus, read via ``load``.
        pairs_file: Path to the ``.json`` pairs file, read via ``load``.

    Returns:
        A list of ``(anchor, random_corpus_sentence)`` tuples.
    """
    corpus = load(corpus_file)
    corpus_set = set(corpus)
    pairs = load(pairs_file)
    negatives = []

    for s1, s2 in pairs:
        if s1 in corpus_set:
            original = s1
        elif s2 in corpus_set:
            original = s2
        else:
            continue

        excluded = {s1, s2}
        # Without this guard the rejection loop below could never terminate.
        if corpus_set.issubset(excluded):
            continue

        # Rejection sampling: redraw until the candidate is a different sentence.
        candidate = random.choice(corpus)
        while candidate in excluded:
            candidate = random.choice(corpus)
        negatives.append((original, candidate))

    return negatives

def sample_irrelevant(corpus_file, number):
    """Draw ``number`` random pairs of distinct corpus sentences.

    Both members of a pair are chosen independently with ``random.choice``;
    draws where the two sentences are equal are rejected and redrawn, so the
    result always contains exactly ``number`` pairs (previously such draws
    were silently dropped and fewer pairs were returned).

    Args:
        corpus_file: Path to the ``.jsonl`` corpus, read via ``load``.
        number: How many pairs to return; values <= 0 yield an empty list.

    Returns:
        A list of ``(sentence1, sentence2)`` tuples with ``sentence1 != sentence2``.

    Raises:
        ValueError: If pairs are requested but every corpus sentence is the
            same, so no distinct pair can ever be drawn.
        IndexError: Propagated from ``random.choice`` when pairs are requested
            from an empty corpus.
    """
    corpus = load(corpus_file)

    # Guard against an endless rejection loop on a single-sentence corpus.
    if number > 0 and corpus and all(sentence == corpus[0] for sentence in corpus):
        raise ValueError(
            "Cannot sample irrelevant pairs: the corpus contains fewer than two distinct sentences."
        )

    pairs = []
    while len(pairs) < number:
        s1 = random.choice(corpus)
        s2 = random.choice(corpus)
        if s1 != s2:
            pairs.append((s1, s2))

    return pairs

In [36]:
# Request 1000 random pairs of distinct corpus sentences and store them,
# labelled "irrelevant", as an evaluation set.
negatives = sample_irrelevant("../../data/corpus/corpus/corpus.jsonl", 1000)
save(negatives, "../../data/evaluation/I1000.json")    

In [21]:
# Build one "irrelevant" negative per paraphrase pair whose sentence occurs in
# the corpus. This must call sample_irrelevant_for_pairs: sample_irrelevant
# expects a pair count as its second argument, not a pairs-file path.
negatives = sample_irrelevant_for_pairs("../../data/corpus/corpus/corpus.jsonl", "../../data/training/v2/retriever/paraphrases.json")
save(negatives, "../../data/training/v2/retriever/irrelevant_for_paraphrases.json")

In [22]:
# Build one "irrelevant" negative per similar-sentence pair whose sentence
# occurs in the corpus. This must call sample_irrelevant_for_pairs:
# sample_irrelevant expects a pair count as its second argument, not a
# pairs-file path.
negatives = sample_irrelevant_for_pairs("../../data/corpus/corpus/corpus.jsonl", "../../data/training/v2/retriever/similar_sentences.json")
save(negatives, "../../data/training/v2/retriever/irrelevant_for_similar_sentences.json")

In [24]:
def create_triplets(file1_path, file2_path, output_path):
    """Join two labelled pair files on shared sentences and write triplets.

    Both inputs are JSON arrays of objects with "sentence1", "sentence2" and
    "label". Every sentence is treated as a potential anchor (a pair is
    indexed in both directions). For each anchor present in both files, every
    combination of one partner from file 1 and one partner from file 2 whose
    labels differ yields ``{"anchor", "positive", "negative"}``.

    The partner from file 1 is the negative when its label is "irrelevant";
    otherwise the partner from file 2 is the negative.
    NOTE(review): the second case is not checked for the "irrelevant" label,
    so this presumably expects exactly one of the files to hold "irrelevant"
    pairs -- confirm against the callers.

    Args:
        file1_path: Path to the first JSON pairs file.
        file2_path: Path to the second JSON pairs file.
        output_path: Path of the JSON file to (over)write with the triplets.
    """

    def index_by_sentence(records):
        # Map each sentence to the (partner, label) entries it appears with,
        # registering every record under both of its sentences.
        index = {}
        for record in records:
            left, right = record["sentence1"], record["sentence2"]
            tag = record["label"]
            for key, partner in ((left, right), (right, left)):
                if key not in index:
                    index[key] = []
                index[key].append((partner, tag))
        return index

    # Read both inputs before doing any work
    with open(file1_path, 'r', encoding='utf-8') as first_handle:
        first_records = json.load(first_handle)
    with open(file2_path, 'r', encoding='utf-8') as second_handle:
        second_records = json.load(second_handle)

    first_index = index_by_sentence(first_records)
    second_index = index_by_sentence(second_records)

    triplets = []
    for anchor, first_partners in first_index.items():
        second_partners = second_index.get(anchor)
        if second_partners is None:
            continue
        for first_other, first_label in first_partners:
            for second_other, second_label in second_partners:
                if first_label == second_label:
                    continue
                if first_label == "irrelevant":
                    positive, negative = second_other, first_other
                else:
                    positive, negative = first_other, second_other
                triplets.append({
                    "anchor": anchor,
                    "positive": positive,
                    "negative": negative
                })

    with open(output_path, 'w', encoding='utf-8') as out_handle:
        json.dump(triplets, out_handle, indent=2, ensure_ascii=False)


In [25]:
# Combine fuzzy-quote pairs with their "irrelevant" negatives into triplets.
# NOTE(review): no cell in this notebook writes irrelevant_for_fuzzy_quotes.json;
# presumably it was produced elsewhere -- confirm.
create_triplets(
    "../../data/training/v2/fuzzy_quotes.json", 
    "../../data/training/v2/irrelevant_for_fuzzy_quotes.json", 
    "../../data/training/v2/triplets_fuzzy_quotes.json"
)

In [26]:
# Combine paraphrase pairs with their "irrelevant" negatives into triplets.
# NOTE(review): the negatives are written to ../../data/training/v2/retriever/
# by the earlier cell, but read from ../../data/training/v2/ here -- confirm the
# files were moved or adjust the paths.
create_triplets(
    "../../data/training/v2/paraphrases.json", 
    "../../data/training/v2/irrelevant_for_paraphrases.json", 
    "../../data/training/v2/triplets_paraphrases.json"
)

In [27]:
# Combine similar-sentence pairs with their "irrelevant" negatives into triplets.
# NOTE(review): the negatives are written to ../../data/training/v2/retriever/
# by the earlier cell, but read from ../../data/training/v2/ here -- confirm the
# files were moved or adjust the paths.
create_triplets(
    "../../data/training/v2/similar_sentences.json", 
    "../../data/training/v2/irrelevant_for_similar_sentences.json", 
    "../../data/training/v2/triplets_similar_sentences.json"
)

In [32]:
from transformers import AutoTokenizer
import json
import numpy as np

# Load the tokenizer used to measure sentence lengths.
# NOTE(review): the checkpoint loaded here is "bowphs/LaBERTa", while the
# sampler cell points at a local SPhilBERTa model; confirm that both share this
# tokenizer, otherwise the statistics below describe a different tokenization.
tokenizer = AutoTokenizer.from_pretrained("bowphs/LaBERTa")

# Path to the JSONL corpus
file_path = "../../data/corpus/corpus/corpus.jsonl"

# Load sentences. Blank lines are skipped (as in load() and sample_quotes());
# json.loads would otherwise raise on them.
sentences = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        data = json.loads(line)
        sentences.append(data["sentence"])

# Tokenize without truncation so over-long sentences are counted at full length
lengths = [len(tokenizer.encode(s, truncation=False)) for s in sentences]

# Show stats
print("Total sentences:", len(lengths))
print("Min length:", min(lengths))
print("Max length:", max(lengths))
print("Mean length:", np.mean(lengths))
print("Median length:", np.median(lengths))
print("90th percentile:", np.percentile(lengths, 90))
print("95th percentile:", np.percentile(lengths, 95))
print("99th percentile:", np.percentile(lengths, 99))


Total sentences: 371745
Min length: 11
Max length: 763
Mean length: 29.733177312405008
Median length: 25.0
90th percentile: 49.0
95th percentile: 60.0
99th percentile: 87.0
