# Download the datasets

First, download the CANARD dataset and place it in a folder called `CANARD`: https://sites.google.com/view/qanta/projects/canard .<br>
The download contains three JSON files (`train.json`, `dev.json` and `test.json`), which are preprocessed below as in the CoSPLADE paper (formulas 3 and 4).

The dataset looks like this:

```
[
    {
        "History": [
            "Frank Zappa",
            "Disbandment"
        ],
        "QuAC_dialog_id": "C_2d211835213b45588ad5ca868ce7fabd_0",
        "Question": "What group disbanded?",
        "Question_no": 1,
        "Rewrite": "What group disbanded?"
    },
    {
        "History": [
            "Frank Zappa",
            "Disbandment",
            "What group disbanded?",
            "Zappa and the Mothers of Invention"
        ],
        "QuAC_dialog_id": "C_2d211835213b45588ad5ca868ce7fabd_0",
        "Question": "When did they disband?",
        "Question_no": 2,
        "Rewrite": "When did Zappa and the Mothers of Invention disband?"
    },
    {
```

More information is available on the dataset page linked above.

In [42]:
import pandas as pd
import json

def process_canard_json(json_file, dataset_name):
    """Build per-turn query and answer text representations from a CANARD file.

    Each element of the JSON array is one conversation turn and must provide
    the keys ``History`` (list of strings), ``QuAC_dialog_id``, ``Question``
    and ``Question_no``.

    Args:
        json_file: Path of the CANARD JSON file to read.
        dataset_name: Prefix used for the two representation column names
            (e.g. ``"dev"`` yields ``dev_queries_representation``).

    Returns:
        A ``pandas.DataFrame`` with one row per turn and the columns

        * ``id`` -- ``"<QuAC_dialog_id>_<Question_no>"``;
        * ``<dataset_name>_queries_representation`` -- the selected history
          entries joined with ``" [SEP] "``, then a tab, then the current
          question;
        * ``<dataset_name>_answers_representations`` -- a list holding one
          ``"<question> [SEP] <history entry>"`` string for every
          odd-indexed history entry.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, "r", encoding="utf-8") as canard_file:
        turns = json.load(canard_file)

    rows = []
    for turn in turns:
        history = turn['History']
        question = turn['Question']
        turn_id = turn['QuAC_dialog_id'] + "_" + str(turn['Question_no'])

        if turn['Question_no'] == 1:
            # First question: keep the existing history untouched.
            context = history
        else:
            # Keep the first two entries plus every even-indexed entry after
            # them (the earlier questions), i.e. drop the earlier answers.
            context = history[:2] + history[2::2]
        queries_representation = " [SEP] ".join(context) + "\t" + question

        # One "<question> [SEP] <entry>" string per odd-indexed history entry.
        # NOTE(review): this also pairs the question with history[1], which in
        # the CANARD sample data looks like a section title rather than an
        # answer. Kept to preserve the existing output -- confirm whether the
        # slice should start at index 3 instead.
        answers_representations = [
            f"{question} [SEP] {entry}" for entry in history[1::2]
        ]

        rows.append({
            'id': turn_id,
            f'{dataset_name}_queries_representation': queries_representation,
            f'{dataset_name}_answers_representations': answers_representations,
        })

    return pd.DataFrame(rows)



# Example usage: preprocess each CANARD split into its own DataFrame.

# Dev split, plus one sample of each representation as a sanity check.
json_file_name, dataset_name = "CANARD/dev.json", "dev"
dev_df = process_canard_json(json_file=json_file_name, dataset_name=dataset_name)
print("just an example from the devset:", "answers representations", sep="\n")
print(dev_df[f'{dataset_name}_answers_representations'][2])
print("quesries representations")
print(dev_df[f'{dataset_name}_queries_representation'][5])

# Train split.
json_file_name, dataset_name = "CANARD/train.json", "train"
train_df = process_canard_json(json_file=json_file_name, dataset_name=dataset_name)

# Test split.
json_file_name, dataset_name = "CANARD/test.json", "test"
test_df = process_canard_json(json_file=json_file_name, dataset_name=dataset_name)


just an example from the devset:
answers representations
['What kind of music did they play? [SEP] Disbandment', 'What kind of music did they play? [SEP] Zappa and the Mothers of Invention', 'What kind of music did they play? [SEP] In late 1969, Zappa broke up the band.']
quesries representations
Frank Zappa [SEP] Disbandment [SEP] What group disbanded? [SEP] When did they disband? [SEP] What kind of music did they play? [SEP] Why did they break up? [SEP] Why were there financial problems?	why did he think the band lacked effort?
