In [1]:
import pandas as pd
import json
from types import SimpleNamespace
from collections import defaultdict
import regex as re
import os 


In [2]:
# Load the evaluation records that the cells below iterate over.
# NOTE(review): the path is machine-specific; consider moving it to a config constant.
try:
    # JSON text is UTF-8 by specification, so do not depend on the locale default.
    with open("/home/yosef/ws/BiPO/data/steerbench/in_context_eval.json", "r", encoding="utf-8") as file:
        data = json.load(file)
except json.JSONDecodeError as e:
    print(f"Failed to decode JSON: {e}")
    # Re-raise instead of continuing: otherwise `data` is either undefined or
    # still holds the value from a previous run, and later cells would fail
    # confusingly or silently process stale records.
    raise

In [3]:
def read_row(data: dict):
    """Convert one raw evaluation record into the fields used by the export cells.

    Args:
        data: Mapping read with the keys ``closed_ended_question``,
            ``open_ended``, ``domain``, ``subreddit_pair``, ``community`` and
            ``answer``. A missing or ``None`` ``closed_ended_question`` is
            treated as an empty string and a missing or ``None``
            ``open_ended`` as an empty list.

    Returns:
        SimpleNamespace with the attributes:
            question: evaluation prompt that wraps the closed-ended question.
            prompts: list of option strings (a capital letter followed by
                ``.`` or ``)`` and the rest of the line) found after the
                first line of the closed-ended question.
            pair: value of ``subreddit_pair``.
            open_ended: one ``"<key> <value>"`` string for every item of
                every mapping in ``open_ended``.
            community: value of ``community``.
            label: value of ``answer``.
    """
    question = data.get('closed_ended_question') or ""
    open_ended = data.get('open_ended') or []

    # Only the text after the first line is searched for options. partition()
    # gives an empty tail when there is no newline, instead of relying on the
    # find() == -1 slice that would keep the last character of the question.
    _, _, answers = question.partition("\n")

    P = (
            f"You are being evaluated on your ability to provide appropriate responses to {data.get('domain')}-related questions.\n"
            f"Answer the multiple-choice question by selecting the best option (A, B, C, or D).\n\n"
            f"{question}\n\n"
            f"Answer:"
        )

    return SimpleNamespace(
        question = P,
        prompts = re.findall(r'[A-Z][\.\)]\s*[^\n]+', answers),
        pair = data.get('subreddit_pair'),
        open_ended = [k+" "+v for item in open_ended for k, v in item.items()],
        community = data.get('community'),
        label = data.get('answer')
    )

# Accumulators keyed by the record's subreddit pair.
#   open_qa[pair][community] -> unique "<key> <value>" open-ended strings
#   closed_qa[pair]          -> parallel lists with one entry per record
open_qa = defaultdict(lambda: {'r/A': [], 'r/B': []})
closed_qa = defaultdict(lambda: {'questions': [], 'communities': [], 'prompts': [], 'labels': []})

for item in data:
    row = read_row(item)

    # Append one entry at a time so that duplicates inside a single record are
    # dropped as well; filtering the whole batch before extending would let
    # repeated entries of the same record through.
    collected = open_qa[row.pair][row.community]
    for entry in row.open_ended:
        if entry not in collected:
            collected.append(entry)

    closed = closed_qa[row.pair]
    closed['questions'].append(row.question)
    closed['prompts'].append(row.prompts)
    closed['labels'].append(row.label)
    closed['communities'].append(row.community)

In [4]:
# Write one train.csv per subreddit pair from the open-ended "r/A" entries.
# The loop variable is deliberately not named `data`: reusing that name would
# overwrite the loaded JSON records, so re-running the cell that builds
# open_qa/closed_qa afterwards would iterate over a dict's keys instead.
for domain, entries in open_qa.items():
    folder_path = f"/home/yosef/ws/BiPO/data/{domain}"

    # Split every entry right after its first "?". When there is no "?" the
    # question part is empty and the whole entry ends up in the answer part.
    questions, answers = [], []
    for sen in entries["r/A"]:
        cut = sen.find("?") + 1
        questions.append(sen[:cut].strip())
        answers.append(sen[cut:].strip())

    df = {
        "question": questions,
        "matching": answers,
        # NOTE(review): this column is built from the "r/A" entries exactly
        # like "matching", so both columns are identical. If "not_matching"
        # is meant to hold the "r/B" answers, they must be aligned with the
        # "r/A" questions first - confirm the intent before changing it.
        "not_matching": list(answers),
    }

    pd_df = pd.DataFrame(df)
    os.makedirs(folder_path, exist_ok=True)
    pd_df.to_csv(f"{folder_path}/train.csv", index= False)

In [5]:
def merge_values(series):
    """Return the first non-missing value of ``series``, or ``None`` if there is none.

    Intended as an aggregation function for grouped columns. ``None`` and
    NaN/``pd.NA`` are all treated as missing, so the result does not depend on
    whether pandas stored the gaps as ``None`` or converted them to NaN.

    Args:
        series: Iterable of scalar values (for example a ``pd.Series``).

    Returns:
        The first value for which ``pd.isna`` is false, else ``None``.
    """
    for value in series:
        if not pd.isna(value):
            return value
    return None


# Export the closed-ended data of every subreddit pair as two files:
#   test.csv       - one row per question with its matching / not_matching option
#   test_infer.csv - one row per record with the A-D options, community and label
# The loop variable is deliberately not named `data`, so the loaded JSON
# records are not overwritten and the earlier cells stay re-runnable.
option_letters = ("A", "B", "C", "D")

for domain, closed in closed_qa.items():
    folder_path = f"/home/yosef/ws/BiPO/data/{domain}"
    # Do not rely on the train.csv cell having created the folder already.
    os.makedirs(folder_path, exist_ok=True)

    # The question column is the prompt up to and including its first ":".
    questions = [sen[:sen.find(":")+1].strip() for sen in closed["questions"]]
    records = list(zip(closed["prompts"], closed["labels"], closed["communities"]))

    df = {
        "question": questions,
        # For "r/A" records: the first option whose letter occurs in the label.
        "matching": [
            next((op for op in ans if op[0] in labels), None) if com == "r/A" else None
            for ans, labels, com in records
        ],
        # For "r/B" records: the first option whose letter does NOT occur in
        # the label.
        # NOTE(review): this picks an option the "r/B" label did not select,
        # which can coincide with the "matching" option - confirm whether
        # `in labels` was intended here.
        "not_matching": [
            next((op for op in ans if op[0] not in labels), None) if com == "r/B" else None
            for ans, labels, com in records
        ],
    }

    # Records that share a question are collapsed into one row holding the
    # first available value of each column.
    merged_df = pd.DataFrame(df)
    merged_df = merged_df.groupby("question", as_index=False).agg({
        "matching": merge_values,
        "not_matching": merge_values
    })
    merged_df = merged_df.fillna("")
    merged_df.to_csv(f"{folder_path}/test.csv", index= False)

    # Build the per-record table. Exactly one value is appended per record and
    # letter (None when the option is absent, the first one when a letter is
    # repeated), which keeps the option columns aligned with the question,
    # community and label columns. Letters outside A-D are ignored.
    infer = {"question": questions}
    for letter in option_letters:
        infer[letter] = []

    for prompt in closed["prompts"]:
        first_by_letter = {}
        for op in prompt:
            first_by_letter.setdefault(op[0], op)
        for letter in option_letters:
            infer[letter].append(first_by_letter.get(letter))

    infer["communities"] = closed["communities"]
    # Not used in this cell; kept in case other cells reference it.
    first_or_none = lambda series: series.dropna().iloc[0] if len(series.dropna()) else None
    infer["label"] = closed["labels"]

    pdf = pd.DataFrame(infer)
    pdf = pdf.fillna("")
    pdf.to_csv(f"{folder_path}/test_infer.csv", index= False)
    