In [None]:
# Load IPython's autoreload extension and select mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
import os 
import sys 
sys.path.append('../')
from datasets import Dataset, load_dataset, concatenate_datasets
import requests
from dotenv import find_dotenv, load_dotenv
import json 

import pandas as pd 
from pipeline.helpers import list_of_dicts_to_dict_of_lists, upload_to_hf 
load_dotenv()

## Clean data generated from NOV2 and upload cleaned set to HF

In [None]:
# Hugging Face Hub dataset repository that the pipeline-run configurations are loaded from.
REPO_ID = "CPSC532/arxiv_qa_data"

In [None]:
# First pipeline run: a single PDF.
dataset_1 = load_dataset(
    path=REPO_ID,
    name="2024NOV2_1file_full",
    token=os.getenv("HUGGINGFACE_API_KEY"),
)


In [None]:
# Second pipeline run: eight PDFs.
dataset_2 = load_dataset(
    path=REPO_ID,
    name="2024NOV2_8file_full",
    token=os.getenv("HUGGINGFACE_API_KEY"),
)


In [None]:
# Display both loaded datasets for a quick visual check.
dataset_1, dataset_2

In [None]:
# Stack the 'train' split of each run into a single dataset.
merged_dataset = concatenate_datasets(
    [run['train'] for run in (dataset_1, dataset_2)]
)

In [None]:
# Display the concatenated dataset.
merged_dataset

### Load the cached JSON files from the 13-file run, which was not uploaded to HF at the end of the pipeline run
* this data did not include the answers generated from RAG

In [None]:
# Load the three per-stage cache files of the 13-file pipeline run.
# The files are decoded explicitly as UTF-8: without `encoding`, open() falls
# back to the platform locale, which can mis-decode or reject non-ASCII text.
with open('../pipeline/cache/2024NOV2_13files_full_get_answer_cache.json', 'r', encoding='utf-8') as f:
    answer_cache = json.load(f)

with open('../pipeline/cache/2024NOV2_13files_full_question_generator_cache.json', 'r', encoding='utf-8') as f:
    question_cache = json.load(f)

with open('../pipeline/cache/2024NOV2_13files_full_refine_question_cache.json', 'r', encoding='utf-8') as f:
    refine_question_cache = json.load(f)

It seems each cache key is a literal JSON string (a serialized dict), so it has to be parsed with `json.loads` to recover its fields.

In [None]:
# Peek at the first cache key to see how entries are keyed.
next(iter(answer_cache))

In [None]:
def clean_get_answer_cache(cache: dict):
    """Flatten the get-answer cache into question/chunk/answer records.

    Each cache key is expected to be a JSON-encoded object carrying at least
    'question' and 'chunk'; the cache value is stored as the answer.

    Args:
        cache: Mapping of JSON-string key -> cached answer.

    Returns:
        A list of dicts with keys 'question', 'chunk' and 'answer', in cache
        iteration order. Entries whose key cannot be used are reported with
        print() and skipped.
    """
    qa_list = []
    for key, value in cache.items():
        try:
            key_dict = json.loads(key)
            record = {
                'question': key_dict['question'],
                'chunk': key_dict['chunk'],
                'answer': value,
            }
        except (json.JSONDecodeError, KeyError, TypeError):
            # Malformed key: not valid JSON, not a JSON object, or missing a
            # required field. Skip it instead of aborting the whole clean-up.
            print(f"Skipping key: {key}")
            continue
        qa_list.append(record)

    return qa_list

In [None]:
# Flatten the raw answer cache into a list of question/chunk/answer dicts
# (keys that fail JSON parsing are printed and skipped).
cleaned_answer_cache = clean_get_answer_cache(answer_cache)

In [None]:
# Number of answer records produced by the clean-up.
len(cleaned_answer_cache)

`question_cache` appears to map each key to a list of records (one-to-many). The parsed key contains everything the value has, plus the extracted entities. The entities are not needed for the final HF data, so we use only the cached values.

In [None]:
def clean_question_generator_cache(cache: dict):
    """Concatenate every value of the question-generator cache into one flat list.

    Keys are ignored. Items appear in cache iteration order and are not
    de-duplicated.
    """
    return [record for records in cache.values() for record in records]

In [None]:
# Flatten the question-generator cache values into a single list of records.
cleaned_question_cache = clean_question_generator_cache(question_cache)

In [None]:
# Total number of question records; duplicates are possible because the
# per-key lists are concatenated without de-duplication.
len(cleaned_question_cache)

In [None]:
# Look at the first question record to see which fields it carries.
cleaned_question_cache[0]

In [None]:
# Look at the first answer record (keys: question, chunk, answer).
cleaned_answer_cache[0]

We can merge `cleaned_question_cache` and `cleaned_answer_cache` to get the same fields as the datasets loaded from HF, and then upload the cleaned data to HF.

In [None]:
# Inner-join the generated questions with their cached answers on the
# (question, chunk) pair, keeping only pairs present in both caches.
df_merged = pd.DataFrame(cleaned_question_cache).merge(
    pd.DataFrame(cleaned_answer_cache),
    on=['question', 'chunk'],
    how='inner',
)

In [None]:
# Preview the first merged row.
df_merged.head(1)

In [None]:
# Build a Dataset from the merged frame, passed as a column-name -> list-of-values dict.
cleaned_data = Dataset.from_dict(df_merged.to_dict(orient='list'))

In [None]:
# Display the dataset built from the cache files.
cleaned_data

In [None]:
# Display the Hub-loaded dataset again, right after cleaned_data, before the two are concatenated.
merged_dataset

In [None]:
# Append the cache-derived records to the records loaded from the Hub.
# NOTE(review): presumably both datasets share the same columns — confirm before relying on this.
combined_dataset = concatenate_datasets([merged_dataset, cleaned_data])

In [None]:
# Display the combined dataset.
combined_dataset

In [None]:
# Convert through the public Dataset.to_pandas() API rather than the backing
# `.data` Arrow table, so the frame reflects the dataset's logical rows even
# if an indices mapping is ever present.
df = combined_dataset.to_pandas()

In [None]:
# Count rows per value of the `source` column.
df.source.value_counts()

In [None]:
# Flag answers containing the 'NO ANSWER FOUND' marker. The mask is built once
# and reused. regex=False matches the marker literally, and na=False treats a
# missing or non-string answer as "no marker" instead of yielding NaN, which
# would break the `~` negation and the boolean .loc selection below.
no_answer_mask = df.answer.str.contains('NO ANSWER FOUND', regex=False, na=False)

# Report how many rows are about to be dropped, then keep the rest.
print(int(no_answer_mask.sum()))
df = df.loc[~no_answer_mask].reset_index(drop=True)

In [None]:
# Keep a single row for each (question, chunk) pair.
df = df.drop_duplicates(subset=['question', 'chunk'])

In [None]:
# Write the cleaned frame to CSV; index=False omits the DataFrame index.
df.to_csv('../pipeline/outputs/2024NOV2_combined_cleaned.csv', index=False)

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

In [None]:
# Convert the cleaned frame back into a Dataset via a column-name -> list-of-values dict.
dataset=Dataset.from_dict(df.to_dict(orient='list'))

In [None]:
# Publish the cleaned dataset to the Hub. The row count in the description is
# taken from the dataset itself so it cannot drift from what is uploaded.
# NOTE(review): the "19" file count is still hard-coded — confirm it matches the data.
dataset.push_to_hub(
    repo_id="CPSC532/2024NOV2_arxiv_qa_data_cleaned",
    token=os.getenv("HUGGINGFACE_API_KEY"),
    commit_message="cleaned data",
    commit_description=(
        f"Dataset contains {len(dataset)} questions and answers "
        "from 19 different arxiv pdf files"
    ),
)