In [2]:
import pandas as pd

The first stage is to find the best prompting strategy. For each statutory article, we generate one pseudo-query. We then train our CamemBERT model on the synthesized data and evaluate it against the test set.


# Common Functions

In [None]:
def create_incremental_data(method, end, start=0, base_path='/content/drive/MyDrive/UVA/Thesis/synthetic_data'):
    """Merge the per-iteration synthetic-query CSVs of one method into a single file.

    Reads ``{base_path}/{method}/train-{i}.csv`` for every ``i`` in
    ``start..end`` (both inclusive), concatenates them in that order, and
    writes the ``synthetic_question`` and ``article_ids`` columns to
    ``{base_path}/{method}/train-incremental.csv``.

    Args:
        method: Name of the sub-folder of ``base_path`` holding the CSV files.
        end: Index of the last ``train-{i}.csv`` file to include (inclusive).
        start: Index of the first ``train-{i}.csv`` file to include (inclusive).
        base_path: Root folder that contains one sub-folder per method.

    Returns:
        None. The result is written to disk only.

    Raises:
        ValueError: If ``end < start``, i.e. there is no file to combine.
        FileNotFoundError: If one of the expected input files is missing
            (raised by ``pd.read_csv``).
        KeyError: If the combined data lacks one of the two expected columns.
    """
    method_dir = f"{base_path}/{method}"
    file_paths = [f"{method_dir}/train-{i}.csv" for i in range(start, end + 1)]

    if not file_paths:
        # pd.concat would raise a less helpful "No objects to concatenate".
        raise ValueError(
            f"No files to combine: start={start} is greater than end={end}"
        )

    # Read every file first, then concatenate once with a fresh 0..n-1 index.
    frames = [pd.read_csv(file_path) for file_path in file_paths]
    combined_df = pd.concat(frames, ignore_index=True)

    # Write next to the inputs so a custom base_path is honoured for the output
    # as well. to_csv is left at its default, so the row index is also written
    # as the first (unnamed) column.
    combined_df[["synthetic_question", "article_ids"]].to_csv(
        f"{method_dir}/train-incremental.csv"
    )

# Experiments

## Synthesizing
The `--frac` value of 0.05 means that each iteration generates 5% of the queries, so the 20 iterations together cover 100%.

In [None]:
# Prompt template file names. Each one is looked up under
# scripts/prompts/bsard/ by the generation command below.
promptings = [
    "generate_only.txt",
    "generate_control_question_type.txt",
    "generate_simple_ask_qcc_fewshots.txt",
    "generate_describe_then_ask.txt",
    "generate_describe_then_ask_qcc.txt",
    "generate_describe_then_ask_fewshots.txt",
    "generate_describe_then_ask_qcc_fewshots.txt",
]

# Output sub-folder name for each prompt template. The two lists are paired
# positionally by zip() below, so they must keep the same length and order
# (zip() silently stops at the shorter list).
# NOTE(review): the first three pairs do not share a name stem
# (e.g. "generate_control_question_type.txt" -> "simple_ask_tc");
# confirm that this pairing is intended.
methods = [
    "simple_ask",
    "simple_ask_tc",
    "simple_ask_tc_fewshots",
    "describe_then_ask",
    "describe_then_ask_qcc",
    "describe_then_ask_fewshots",
    "describe_then_ask_qcc_fewshots",
]

# Run the generation script once per (prompt template, method) pair; IPython
# substitutes {prompting} and {method} into the shell command.
# NOTE(review): --key and --org_key carry placeholder values here. Supply real
# credentials at run time (e.g. from an environment variable) instead of
# saving them in the notebook.
for prompting, method in zip(promptings, methods):
    !python scripts/prompts/gpt_generate.py \
    --prompt scripts/prompts/bsard/{prompting} \
    --save_folder /content/drive/MyDrive/UVA/Thesis/synthetic_data/{method} \
    --corpus scripts/prompts/data/articles_fr.csv \
    --key "xxxxxxxxx" \
    --org_key "xxxxxxx" \
    --frac "0.05" \
    --iterations "20" \
    --exclude_index ""

# Training

In [None]:
import pandas as pd

# Index of the last per-iteration file (train-{end}.csv) merged into the
# training set; create_incremental_data starts at index 0 by default.
end = 19
# True division: with end = 19 this is the float 1.9, which becomes the last
# path component of output_path below.
output_suffix = end/10
# Value passed to the training script's --model option.
model = "camembert-base"

# Value passed to the training script's --epochs option.
epochs = 90

# For every prompting method: rebuild its combined CSV, then launch one
# training run on it. Relies on `methods` from the Synthesizing cell and on
# `create_incremental_data` from the Common Functions cell having been run.
for method in methods:
    output_path = f"/content/drive/MyDrive/UVA/Thesis/training/synthesizing/output/{method}/{output_suffix}"
    queries_filepath = f"/content/drive/MyDrive/UVA/Thesis/synthetic_data/{method}/train-incremental.csv"
    # Writes train-incremental.csv for this method; queries_filepath above
    # names that same file.
    create_incremental_data(method=method,end=end)
    !python scripts/baseline/bsard/experiments/train_biencoder_syn.py \
      --model {model} \
      --output_path {output_path} \
      --queries_filepath {queries_filepath} \
      --epochs {epochs}

# Evaluation

In [None]:
# Placeholder value: replace it with the path of the checkpoint to pass to the
# test script.
your_checkpoint_path = "your_checkpoint_path"
# IPython substitutes {your_checkpoint_path} into the shell command.
!python scripts/baseline/bsard/experiments/test_biencoder.py --checkpoint_path {your_checkpoint_path}