In [None]:

import json
from transformers import AutoTokenizer
from tqdm import tqdm
from pathlib import Path
from tqdm import tqdm
import regex as re
# --- Configuration: which prepared split to read and where generated files go ---
data_dir = "../../data"
dataset = "mix_128"
set_name = "test"
# Optional suffix selecting a post-processed variant of the prepared file, e.g.
# "_filtered_with_chatgpt_with_reranker_ranks". Empty string selects the base file.
post_fix = ""
data_file = f"{data_dir}/{dataset}/{set_name}_data_prepared{post_fix}.json"

# Output directory for the generated fuser inputs; created on demand.
fuse_gen_data_dir = Path(f"{data_dir}/{dataset}/fuse_gen/{set_name}/")
fuse_gen_data_dir.mkdir(parents=True, exist_ok=True)

# Decode explicitly as UTF-8 so loading does not depend on the platform's
# locale-derived default encoding.
with open(data_file, 'r', encoding="utf-8") as f:
    data = json.load(f)

# Tokenizer used below to truncate candidates and to encode/decode fuser inputs.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")

def deduplicate_string(string, min_ngram=2, max_ngram=8, repeat=4):
    """Cut a text at the first place where an n-gram repeats back-to-back.

    The text is split on single spaces. Scanning left to right, for every start
    position and every n-gram size in ``range(min_ngram, max_ngram)`` (so
    ``max_ngram`` itself is exclusive), the function checks whether the n-gram
    starting there is immediately followed by ``repeat - 1`` identical copies.
    At the first such position the n-gram is kept once and everything after it
    is dropped. N-grams containing no ASCII letter are never treated as
    repetitions.

    Args:
        string: Text to clean.
        min_ngram: Smallest n-gram size (in space-separated tokens) to test.
        max_ngram: Exclusive upper bound on the n-gram size to test.
        repeat: Number of consecutive identical n-grams that counts as a
            degenerate repetition; must be at least 2.

    Returns:
        The (possibly truncated) text with surrounding whitespace stripped.

    Raises:
        AssertionError: If ``repeat`` is smaller than 2.
    """
    # Validate up front, before doing any work on the text.
    assert repeat >= 2, "repeat should be at least 2"

    words = string.split(" ")
    kept = []
    for start in range(len(words)):
        for size in range(min_ngram, max_ngram):
            window = words[start:start + size]
            # Only n-grams with at least one ASCII letter are candidates.
            if not any(re.search(r"[a-zA-Z]", word) for word in window):
                continue
            # Slices running past the end come out shorter or empty, so they
            # cannot equal the first n-gram and the check fails naturally.
            consecutive = {
                " ".join(words[start + k * size:start + (k + 1) * size])
                for k in range(repeat)
            }
            if len(consecutive) == 1:
                # Keep the first occurrence and discard the rest of the text.
                kept.extend(window)
                return " ".join(kept).strip()
        kept.append(words[start])
    return " ".join(kept).strip()

# Clean every candidate text in place before any fuser input is built.
for item in tqdm(data):
    for cand in item['candidates']:
        cleaned_text = deduplicate_string(cand['text'])
        cand['text'] = cleaned_text



In [None]:
import torch
import random
# Seed Python's global `random` module: GenFuserDataset draws from it for
# noise injection and for shuffling the selected candidates.
random.seed(42)
class GenFuserDataset(torch.utils.data.Dataset):
    """Turns ranked candidate outputs into text-to-text examples for a generative fuser.

    Each example concatenates the instruction, the input and the selected
    candidates (joined with the literal string ``"</s>"``) followed by a fixed
    prompt, and pairs it with the reference output as labels.

    Randomness (noise injection, candidate shuffling) comes from Python's global
    ``random`` module, so seed it for reproducible examples.
    """

    def __init__(self, data, tokenizer, max_length, top_k, select_key, candidate_max_length=None, add_noise=False, noise_prob=0.10):
        """
            data: list of dict with keys "instruction", "input", "output" and
                "candidates"; each candidate has "text", "model" and "scores"
            tokenizer: tokenizer used to encode the input/labels and to
                truncate candidates
            max_length: max length of the input sequence (in tokens)
            top_k: number of top k candidate to select; a value <= 0 selects
                all candidates
            select_key: selection metric for the top k candidates (a key of
                each candidate's "scores" dict; higher is better)
            candidate_max_length: if not None, each candidate is truncated to
                this many tokens and "..." is appended
            add_noise: if True, occasionally replace one selected candidate
                with one that was not selected
            noise_prob: per-example probability of that replacement
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.top_k = top_k
        self.select_key = select_key
        self.candidate_max_length = candidate_max_length
        self.add_noise = add_noise
        self.noise_prob = noise_prob

    def __len__(self):
        """Number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """Build one example.

        Returns a dict holding the tokenizer outputs for the fuser input (each
        tensor with the batch dimension squeezed away), "labels" (token ids of
        the reference output, not truncated) and "source_models" (model names
        of the candidates, in the order they appear in the input).
        """
        item = self.data[index]
        instruction_text = item["instruction"]
        input_text = item["input"]
        output_text = item["output"]

        ranked = sorted(
            item["candidates"],
            key=lambda cand: cand['scores'][self.select_key], reverse=True)
        if self.top_k > 0:
            selected = ranked[:self.top_k]
            leftovers = ranked[self.top_k:]
        else:
            # Everything is selected, so there is nothing left to inject as
            # noise. (Slicing with a non-positive top_k would wrongly yield
            # already-selected candidates and create duplicates.)
            selected = ranked
            leftovers = []

        # The order of the RNG calls (random -> randint -> choice -> shuffle)
        # is kept stable so seeded runs stay reproducible.
        if self.add_noise and random.random() < self.noise_prob and len(leftovers) > 0:
            swap_idx = random.randint(0, len(selected) - 1)
            selected[swap_idx] = random.choice(leftovers)
        # Shuffle so the position in the prompt does not reveal the rank.
        random.shuffle(selected)

        texts = [cand["text"] for cand in selected]
        if self.candidate_max_length is not None:
            for i, text in enumerate(texts):
                ids = self.tokenizer.encode(text, add_special_tokens=False)
                if len(ids) > self.candidate_max_length:
                    # Mark the cut with an ellipsis.
                    texts[i] = self.tokenizer.decode(ids[:self.candidate_max_length]) + "..."

        # concatenate input and candidates
        instruction_part = "Instruction: " + instruction_text
        input_part = "Input: " + input_text
        # Each candidate is tagged with its position and an <extra_id_i> marker.
        candidates_part = "</s>".join(
            [f"Candidate {i}: <extra_id_{i}>:" + text for i, text in enumerate(texts)])
        fuse_input = "</s>".join([instruction_part, input_part, candidates_part])
        fuse_input += "</s>Summarize candidates into a better one for the given instruction:"

        # tokenize
        encoded = self.tokenizer(
            fuse_input,
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
            add_special_tokens=False,)
        labels_ids = self.tokenizer.encode(output_text, return_tensors="pt", add_special_tokens=False,).squeeze(0)
        encoded = {key: value.squeeze(0) for key, value in encoded.items()}

        return {
            **encoded,
            "labels": labels_ids,
            "source_models": [cand['model'] for cand in selected],
        }

In [None]:
import torch
import random
from torch.utils.data import DataLoader

# Re-seed so the noise injection and shuffling inside GenFuserDataset are reproducible.
random.seed(42)
metric = "bartscore"
add_noise = True
noise_prob = 0.10
save_postfix = "_noise" if add_noise else ""
for top_k in [3, 5]:
    # NOTE(review): this rebinds the module-level name `dataset`, which the
    # config cell uses for the dataset-name string.
    dataset = GenFuserDataset(
        data, tokenizer,
        max_length=1024, top_k=top_k, select_key=metric,
        candidate_max_length=128, add_noise=add_noise, noise_prob=noise_prob)

    # batch_size=1: every batch holds exactly one example, read below via index 0.
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
    gen_data = []
    for item in tqdm(data_loader, total=len(data_loader)):
        gen_data.append(
            {
                "input": tokenizer.decode(item['input_ids'][0]),
                "output": tokenizer.decode(item['labels'][0]),
                # Each collated entry is a length-1 sequence (batch of one).
                "source_models": [x[0] for x in item['source_models']],
            }
        )
    save_file = fuse_gen_data_dir / f"top{top_k}_{metric}{save_postfix}.jsonl"
    with open(save_file, 'w', encoding="utf-8") as f:
        for item in gen_data:
            f.write(json.dumps(item) + "\n")
    print(f"Saved to {save_file}")

In [None]:
import torch
import random
from torch.utils.data import DataLoader

random.seed(42)
# Random baseline: give every candidate a uniform random score under the
# "random" key (written into `data` in place) and select by that score.
for item in data:
    for cand in item['candidates']:
        cand['scores']['random'] = random.random()
for top_k in [3, 5]:
    # NOTE(review): this rebinds the module-level name `dataset`, which the
    # config cell uses for the dataset-name string.
    dataset = GenFuserDataset(
        data, tokenizer,
        max_length=1024, top_k=top_k, select_key="random", candidate_max_length=128)

    # batch_size=1: every batch holds exactly one example, read below via index 0.
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
    gen_data = []
    for item in data_loader:
        gen_data.append(
            {
                "input": tokenizer.decode(item['input_ids'][0]),
                "output": tokenizer.decode(item['labels'][0]),
                # Each collated entry is a length-1 sequence (batch of one).
                "source_models": [x[0] for x in item['source_models']],
            }
        )
    save_file = fuse_gen_data_dir / f"random{top_k}.jsonl"
    with open(save_file, 'w', encoding="utf-8") as f:
        for item in gen_data:
            f.write(json.dumps(item) + "\n")
    print(f"Saved to {save_file}")

In [None]:
from torch.utils.data import DataLoader

random.seed(42)
# Random scores again, but candidates whose model name contains "oasst" are
# forced to the top. random.random() is still drawn for every candidate so the
# RNG stream does not depend on which models are present.
for item in data:
    for cand in item['candidates']:
        cand['scores']['random'] = random.random()
        if "oasst" in cand['model']:
            # random.random() is always below 1.0, so 2.0 ranks first.
            cand['scores']['random'] = 2.0 # make sure it is the best

for top_k in [3]:
    # NOTE(review): this rebinds the module-level name `dataset`, which the
    # config cell uses for the dataset-name string.
    dataset = GenFuserDataset(
        data, tokenizer,
        max_length=1024, top_k=top_k, select_key="random", candidate_max_length=128)

    # batch_size=1: every batch holds exactly one example, read below via index 0.
    data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
    gen_data = []
    for item in data_loader:
        gen_data.append(
            {
                "input": tokenizer.decode(item['input_ids'][0]),
                "output": tokenizer.decode(item['labels'][0]),
                # Each collated entry is a length-1 sequence (batch of one).
                "source_models": [x[0] for x in item['source_models']],
            }
        )
    save_file = fuse_gen_data_dir / f"oasst-random{top_k}.jsonl"
    with open(save_file, 'w', encoding="utf-8") as f:
        for item in gen_data:
            f.write(json.dumps(item) + "\n")
    print(f"Saved to {save_file}")

In [None]:
from torch.utils.data import DataLoader

random.seed(42)

# One output file per (score key, top_k) pair. Every key listed here must
# already be present in each candidate's "scores" dict.
for select_key in ["SummaReranker", "deberta-random", "deberta-random-with-tgt", "deberta-top-bottom", "roberta-random"]:
    for top_k in [3, 5]:
        # NOTE(review): this rebinds the module-level name `dataset`, which the
        # config cell uses for the dataset-name string.
        dataset = GenFuserDataset(
            data, tokenizer,
            max_length=1024, top_k=top_k, select_key=select_key, candidate_max_length=128)

        # batch_size=1: every batch holds exactly one example, read below via index 0.
        data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
        gen_data = []
        for item in tqdm(data_loader, total=len(data_loader)):
            gen_data.append(
                {
                    "input": tokenizer.decode(item['input_ids'][0]),
                    "output": tokenizer.decode(item['labels'][0]),
                    # Each collated entry is a length-1 sequence (batch of one).
                    "source_models": [x[0] for x in item['source_models']],
                }
            )
        save_file = fuse_gen_data_dir / f"top{top_k}_{select_key}.jsonl"
        with open(save_file, 'w', encoding="utf-8") as f:
            for item in gen_data:
                f.write(json.dumps(item) + "\n")
        print(f"Saved to {save_file}")

In [None]:
from src.gpt_eval.utils import get_scores_from_cmps
import random
import torch
from torch.utils.data import DataLoader

random.seed(42)
# Maps a score key to the file holding saved pairwise-comparison predictions.
train_predictions_path_map = {
    "deberta-bartscore": "./outputs/crosscompare/microsoft/deberta-v3-large/test_mix_128_train_PairReranker_full_comparison_bartscore/predictions_full.pt"
}
# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code; only load prediction files from a trusted source.
# Only the first element of each loaded object is used.
train_predictions = {k: torch.load(v)[0] for k, v in train_predictions_path_map.items()}
train_scores = {k: get_scores_from_cmps(v) for k, v in train_predictions.items()}
# Attach the scores to the candidates in place; indexed as [example][candidate],
# so they must line up with the order of `data` and of each candidate list.
for i, item in enumerate(data):
    for j, cand in enumerate(item['candidates']):
        for k, v in train_scores.items():
            cand['scores'][k] = v[i][j]

for select_key in train_scores.keys():
    for top_k in [3, 5]:
        # NOTE(review): this rebinds the module-level name `dataset`, which the
        # config cell uses for the dataset-name string.
        dataset = GenFuserDataset(
            data, tokenizer,
            max_length=1024, top_k=top_k, select_key=select_key, candidate_max_length=128)

        # batch_size=1: every batch holds exactly one example, read below via index 0.
        data_loader = DataLoader(dataset, batch_size=1, shuffle=False, num_workers=0)
        gen_data = []
        for item in tqdm(data_loader, total=len(data_loader)):
            gen_data.append(
                {
                    "input": tokenizer.decode(item['input_ids'][0]),
                    "output": tokenizer.decode(item['labels'][0]),
                    # Each collated entry is a length-1 sequence (batch of one).
                    "source_models": [x[0] for x in item['source_models']],
                }
            )
        save_file = fuse_gen_data_dir / f"top{top_k}_{select_key}.jsonl"
        with open(save_file, 'w', encoding="utf-8") as f:
            for item in gen_data:
                f.write(json.dumps(item) + "\n")
        print(f"Saved to {save_file}")