In [1]:
import json
import os
import traceback

import pandas as pd

# Common Functions

In [None]:
def create_incremental_data(method, end, start=0, base_path='/content/drive/MyDrive/UVA/Thesis/synthetic_data'):
    """Concatenate the per-round synthetic CSVs of one method into a single seed file.

    Reads ``{base_path}/{method}/train-{i}.csv`` for every ``i`` in
    ``start..end`` (both inclusive), stacks them in that order, and writes the
    ``synthetic_question`` and ``article_ids`` columns to
    ``{base_path}/{method}/train-incremental.csv``.

    Args:
        method: Name of the sub-folder under ``base_path`` holding the CSVs.
        end: Index of the last ``train-{i}.csv`` file to include (inclusive).
        start: Index of the first file to include. Defaults to 0.
        base_path: Root folder of the synthetic data. It is used for both the
            inputs and the output, so a non-default root no longer writes to
            the hard-coded Drive location.

    Raises:
        ValueError: If ``start > end``, i.e. there is no file to combine.
        FileNotFoundError: If one of the expected CSV files is missing
            (raised by ``pd.read_csv``).
        KeyError: If the combined data lacks one of the two expected columns.
    """
    method_dir = f"{base_path}/{method}"
    file_paths = [f"{method_dir}/train-{i}.csv" for i in range(start, end + 1)]
    if not file_paths:
        raise ValueError(f"No files to combine: start={start} is greater than end={end}")

    # Read every file, then stack them with a fresh 0..n-1 index.
    combined_df = pd.concat([pd.read_csv(file_path) for file_path in file_paths], ignore_index=True)

    # The index is written as the first column, as before, so the on-disk
    # layout that downstream readers see does not change.
    combined_df[["synthetic_question", "article_ids"]].to_csv(f"{method_dir}/train-incremental.csv")


def train(current_round, epochs, target_round, end=10):
    print(f"### Starting training. Current Progress: {current_round} rounds / {target_round} rounds")
    # Define your variables
    model = "camembert-base"  # reinit parameters every round
    method = 'describe_then_ask_qcc_fewshots'
    output_path = f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}"
    queries_filepath = f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}/train.csv"

    if current_round == 0:
        # first iteration
        create_incremental_data(method=method, end=end)
        !cp/content/drive/MyDrive/UVA/Thesis/synthetic_data/ {method} /train-incremental.csv/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round} /train.csv

    df = pd.read_csv(queries_filepath)
    df["synthetic_question"] = df["synthetic_question"].str.replace("[context] ", '', regex=False).str.replace(
        "[question] ", '', regex=False)
    df["synthetic_question"] = df["synthetic_question"].str.replace("[Context] ", '', regex=False).str.replace(
        "[Question] ", '', regex=False)
    df["synthetic_question"] = df["synthetic_question"].str.replace("[contexte] ", '', regex=False)
    df["synthetic_question"] = df["synthetic_question"].str.replace("[Contexte] ", '', regex=False)
    df["synthetic_question"] = df["synthetic_question"].str.replace("Contexte : ", '', regex=False)
    df["synthetic_question"] = df["synthetic_question"].str.replace("Situation :", '', regex=False)
    df["synthetic_question"] = df["synthetic_question"].str.replace("Question :", '', regex=False)
    df[["synthetic_question", "article_ids"]].to_csv(queries_filepath)

    !python scripts/baseline/bsard/experiments/train_biencoder_syn_step_by_step.py \
            --model {model} \
            --output_path {output_path} \
            --queries_filepath {queries_filepath} \
            --epochs {epochs}


def evaluate(current_round, epochs, target_round):
    print(f"### Starting evaluation. Current Progress: {current_round} rounds / {target_round} rounds")
    # Define your variables
    model = "camembert-base"  # reinit parameters every round
    method = 'describe_then_ask_qcc_fewshots'
    output_path = f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}"
    queries_filepath = f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}/train.csv"

    # evaluation on validation data
    !python scripts/baseline/bsard/experiments/test_biencoder_step_by_step.py \
            --checkpoint_path/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round} \
            --test_queries_df_path {test_queries_df_path}
    # evaluation on test data
    !python scripts/baseline/bsard/experiments/test_biencoder.py --checkpoint_path/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round}


def prepare_for_extrapolation(current_round, target_round, method, wrong_top_k):
    """Turn the previous round's wrong-pairs JSON into a CSV of (question, article) rows.

    Loads ``wrong_pairs_to_extrapolate.json`` from the previous round's folder
    (a mapping of query id to a list of article ids), keeps for each query the
    first article ids whose text has more than 20 space-separated tokens,
    stopping once ``wrong_top_k`` have been kept, and writes the flattened
    pairs with the article text to ``wrong_pairs_to_extrapolate.csv`` in the
    same folder.

    Relies on the notebook-level globals ``df_articles`` (columns ``id`` and
    ``article``), ``df_validation`` (columns ``id`` and ``question``) and
    ``variation``.

    Args:
        current_round: Round being prepared; files come from ``current_round - 1``.
        target_round: Total number of rounds; only used in the progress message.
        method: Name of the output sub-folder.
        wrong_top_k: Number of kept article ids per query at which scanning stops.
    """
    print(f"### Preparing for extrapolation. Current Progress: {current_round} rounds / {target_round} rounds")
    previous_round = current_round - 1

    wrong_pairs_to_extrapolate_file = f'/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{previous_round}/wrong_pairs_to_extrapolate.json'

    def article_text(doc_id):
        # Text of the first article whose id matches; IndexError if there is none.
        matches = df_articles[df_articles['id'] == int(doc_id)]
        return matches['article'].values[0]

    # Load the wrong pairs produced for the previous round.
    with open(wrong_pairs_to_extrapolate_file) as handle:
        wrong_pairs = json.load(handle)

    # Keyed by question text, so two query ids with the same text share one entry.
    selected_by_question = {}
    for query_id, doc_ids in wrong_pairs.items():
        question_rows = df_validation[df_validation['id'] == int(query_id)]
        question = question_rows['question'].values[0]
        kept_ids = []
        for doc_id in doc_ids:
            word_count = len(article_text(doc_id).split(" "))
            if word_count > 20:
                kept_ids.append(doc_id)
            # The limit is checked after the append, on every visited article.
            if len(kept_ids) >= wrong_top_k:
                break
        selected_by_question[question] = kept_ids

    # Flatten to one (question, article id) tuple per kept article.
    pair_rows = []
    for question, kept_ids in selected_by_question.items():
        pair_rows.extend((question, doc_id) for doc_id in kept_ids)
    df_wrong_pairs = pd.DataFrame(pair_rows, columns=['Question', 'Article_Id'])

    def attach_article(row):
        # Add the article text next to its id.
        row["Article"] = article_text(row["Article_Id"])
        return row

    df_wrong_pairs = df_wrong_pairs.apply(attach_article, axis=1)
    df_wrong_pairs.to_csv(
        f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{previous_round}/wrong_pairs_to_extrapolate.csv")


def extrapolate(current_round, target_round, method):
    print(
        f"### Extrapolate wrong pairs for current round. Current Progress: {current_round} rounds / {target_round} rounds")
    previous_round = current_round - 1
    !mkdir -p/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round}
    !python scripts/prompts/gpt_generate_extrapolate.py \
            --prompt scripts/prompts/bsard/extrapolate.txt \
            --corpus/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {previous_round} /wrong_pairs_to_extrapolate.csv \
            --save_folder/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round} \
            --key "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" \
            --org_key "org-ViWmGBWyZw44MQvxg2djVAff"


def merge_data(current_round, target_round):
    """Build the current round's ``train.csv`` from new and previous queries.

    Reads the current round's ``extrapolated_queries_filtered.csv`` and the
    previous round's ``train.csv``, stacks them in that order with a fresh
    index, and writes the ``synthetic_question`` and ``article_ids`` columns
    to the current round's ``train.csv``.

    Relies on the notebook-level global ``variation``; the method name is fixed
    inside the function.

    Args:
        current_round: Round whose training file is produced.
        target_round: Total number of rounds; only used in the progress message.
    """
    print(
        f"### Merging newly generated data to previous round data. Current Progress: {current_round} rounds / {target_round} rounds")
    method = 'describe_then_ask_qcc_fewshots'
    previous_round = current_round - 1

    # Newly generated queries first, then everything trained on last round.
    sources = (
        f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}/extrapolated_queries_filtered.csv",
        f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{previous_round}/train.csv",
    )
    frames = [pd.read_csv(source) for source in sources]
    merged = pd.concat(frames, ignore_index=True)

    training_columns = merged[["synthetic_question", "article_ids"]]
    training_columns.to_csv(
        f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}/train.csv")
    print(
        f"### Merged into /content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/step_by_step{variation}/{current_round}/train.csv")


def syntactic_filter(current_round, method, syntactic_topk, semantic_threshold, random=False):
    print(
        f"### Filtering questions for current round. Current Progress: {current_round} rounds / {target_round} rounds")
    previous_round = current_round - 1
    !python scripts/prompts/syntactic_filter.py \
            --extrapolated_queries/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round} /extrapolated_queries.csv \
            --wrong_pairs_to_extrapolate/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {previous_round} /wrong_pairs_to_extrapolate.csv \
            --save_folder/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/ {method} /step_by_step{variation} / {current_round} \
            --syntactic_topk {syntactic_topk} \
            --semantic_threshold {semantic_threshold} \
            --random {random}

# Training and Evaluation (Extrapolation without Syntactic filter)

In [None]:
# Run configuration. The functions above read several of these names as
# globals: variation, df_articles, df_validation, test_queries_df_path.
epochs = 90
current_round = 0
target_round = 2
end = 10  # start with 50% seed data
wrong_top_k = 10  # only take top 10 wrongly retrieved pairs (can be ablated)
method = 'describe_then_ask_qcc_fewshots'
df_articles = pd.read_csv('scripts/baseline/bsard/data/articles_fr.csv')
test_queries_df_path = 'scripts/baseline/bsard/data/questions_fr_validation_step_by_step_frac_4_6.csv'
df_validation = pd.read_csv(test_queries_df_path)
variation = 5

# Each pass trains and evaluates one round, then prepares the data for the
# next one. The loop ends once round `target_round` has been evaluated.
while True:
    try:
        train(current_round=current_round, target_round=target_round, epochs=epochs, end=end)
        evaluate(current_round=current_round, target_round=target_round, epochs=epochs)
        current_round += 1
        if current_round > target_round:
            break
        prepare_for_extrapolation(current_round=current_round, target_round=target_round, method=method, wrong_top_k=wrong_top_k)
        extrapolate(current_round=current_round, target_round=target_round, method=method)
        merge_data(current_round=current_round, target_round=target_round)
    except Exception:
        # Still stop at the first failure, but show the exception type and
        # traceback; printing only the message hides where it came from.
        print(f"### Pipeline stopped by an error (current_round={current_round})")
        traceback.print_exc()
        break

# Training and Evaluation (Extrapolation with Syntactic filter)

In [None]:
# Settings for a single filtered round. `variation` and `test_queries_df_path`
# are not set here: train() and evaluate() read them from the previous
# configuration cell, which must have been run first.
epochs = 90
current_round = 1
target_round = 1
method = 'describe_then_ask_qcc_fewshots'
syntactic_topk = 1000
semantic_threshold = 0.8
random = True

# Filter the extrapolated queries, rebuild train.csv, then train and evaluate.
syntactic_filter(
    current_round=current_round,
    method=method,
    syntactic_topk=syntactic_topk,
    semantic_threshold=semantic_threshold,
    random=random,
)

round_kwargs = {"current_round": current_round, "target_round": target_round}
merge_data(**round_kwargs)
train(epochs=epochs, **round_kwargs)
evaluate(epochs=epochs, **round_kwargs)