In [None]:
%load_ext autoreload
%autoreload 2


In [116]:
import os 
import sys 
sys.path.append('../')
from datasets import Dataset, load_dataset, concatenate_datasets
import requests
from dotenv import find_dotenv, load_dotenv
import json 

import pandas as pd 
from pipeline.helpers import list_of_dicts_to_dict_of_lists, upload_to_hf 
load_dotenv()

True

## Clean data generated from NOV2 and upload cleaned set to HF

In [6]:
# Hugging Face dataset repository that the raw pipeline runs are loaded from.
REPO_ID = "CPSC532/arxiv_qa_data"

In [None]:
# First pipeline run: a single PDF. The config name selects that run's data
# inside the repository; the token is read from the environment.
dataset_1 = load_dataset(
    REPO_ID,
    "2024NOV2_1file_full",
    token=os.getenv("HUGGINGFACE_API_KEY"),
)


Generating train split: 100%|██████████| 113/113 [00:00<00:00, 11645.98 examples/s]


In [7]:
# Second pipeline run: eight PDFs. The config name selects that run's data
# inside the repository; the token is read from the environment.
dataset_2 = load_dataset(
    REPO_ID,
    "2024NOV2_8file_full",
    token=os.getenv("HUGGINGFACE_API_KEY"),
)


Generating train split: 100%|██████████| 388/388 [00:00<00:00, 14705.95 examples/s]


In [10]:
dataset_1, dataset_2

(DatasetDict({
     train: Dataset({
         features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer'],
         num_rows: 113
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer'],
         num_rows: 388
     })
 }))

In [13]:
# Stack the 'train' splits of both runs into one Dataset.
merged_dataset = concatenate_datasets(
    [
        dataset_1['train'],
        dataset_2['train'],
    ]
)

In [14]:
merged_dataset

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer'],
    num_rows: 501
})

### Load the cached JSON files from the 13-file run that was not uploaded to HF at the end of the pipeline run
* this data did not include the answers generated from RAG

In [26]:
# Read the three per-stage pipeline caches (answers, generated questions,
# refined questions) from disk. The files are opened explicitly as UTF-8 so
# that parsing does not depend on the platform's default text encoding.
with open('../pipeline/cache/2024NOV2_13files_full_get_answer_cache.json', 'r', encoding='utf-8') as f:
    answer_cache = json.load(f)

with open('../pipeline/cache/2024NOV2_13files_full_question_generator_cache.json', 'r', encoding='utf-8') as f:
    question_cache = json.load(f)

with open('../pipeline/cache/2024NOV2_13files_full_refine_question_cache.json', 'r', encoding='utf-8') as f:
    refine_question_cache = json.load(f)

It seems each cache key is a literal JSON string (a serialized dict) rather than a plain question:

In [51]:
# Peek at the first key of the answer cache (iterating a dict yields its keys).
next(iter(answer_cache))

'{"question": "Why does the use of full datasets in training models introduce inconsistencies and potential unfair comparisons, according to the paper \'Making Text Embedders Few-Shot Learners\'?", "chunk": "-Clustering-{S2S/P2P},_\\nTwentyNewsgroups-Clustering (Lang, 1995).\\n\\n    - STS: STS12 (Agirre et al., 2012), STS22 (Chen et al., 2022), STS-Benchmark (Cer et al.,\\n2017).\\n\\n**Training Detail. We fine-tune the Mistral-7B model using a contrastive loss and conduct the pro-**\\ncess over a single epoch. For efficient fine-tuning, we employ Low-Rank Adaptation (LoRA) (Hu\\net al., 2021), setting the LoRA rank to 64 and the LoRA alpha to 32, with a learning rate of 1e-4.\\nFor retrieval tasks, we use in-batch negatives, a strategy not adopted for other tasks. Each dataset\\nincorporates 7 hard negatives. The batch size is set to 512 for retrieval tasks and 256 for other types\\nof tasks. We maintain consistency by using the same dataset throughout one training step, and the\\nma

In [60]:
def clean_get_answer_cache(cache: dict) -> list:
    """Flatten the get-answer cache into question/chunk/answer records.

    Each cache key is expected to be a JSON-encoded object carrying at least
    the fields ``question`` and ``chunk``; the cache value is stored as the
    record's ``answer``.

    Args:
        cache: Mapping of JSON-string key to the cached answer.

    Returns:
        A list of dicts with the keys ``'question'``, ``'chunk'`` and
        ``'answer'``, in the cache's iteration order. Keys that cannot be
        decoded, or that decode to something without the two required fields,
        are reported with ``print`` and skipped.
    """
    qa_list = []
    for key, value in cache.items():
        try:
            # The key itself is a JSON document; decode it to reach its fields.
            key_dict = json.loads(key)
            record = {
                'question': key_dict['question'],
                'chunk': key_dict['chunk'],
                'answer': value,
            }
        except (json.JSONDecodeError, KeyError, TypeError):
            # JSONDecodeError: key is not valid JSON.
            # KeyError: decoded object lacks 'question' or 'chunk'.
            # TypeError: key is not a string, or decodes to a non-object
            # (list, number, string, ...).
            print(f"Skipping key: {key}")
            continue
        qa_list.append(record)

    return qa_list

In [61]:
# Flatten the answer cache into a list of question/chunk/answer records.
cleaned_answer_cache = clean_get_answer_cache(answer_cache)

In [64]:
len(cleaned_answer_cache)

430

`question_cache` appears to be a one-to-many mapping: each key maps to a list of question records. The parsed key (`key_dict`) contains everything the value has, plus the extracted entities. The entities aren't needed for the final HF data, so we will just use the values in the cache.

In [104]:
def clean_question_generator_cache(cache: dict):
    """Flatten the question-generator cache into a single list of records.

    The cache keys are ignored; every cache value is iterated and its items
    are collected, in cache order, into one flat list. No de-duplication is
    performed.
    """
    return [record for records in cache.values() for record in records]

In [105]:
# Flatten the question-generator cache values into one list of records.
cleaned_question_cache = clean_question_generator_cache(question_cache)

In [108]:
len(cleaned_question_cache) # record count after flattening; nothing is de-duplicated yet, so repeats may be present

704

In [110]:
cleaned_question_cache[0]

{'filename': '../data/Making_Text_Embedders_Few-Shot_Learners_2409.15700v1.pdf',
 'source': 'Making_Text_Embedders_Few-Shot_Learners_2409.15700v1',
 'source_type': 'paper',
 'chunk': '-Clustering-{S2S/P2P},_\nTwentyNewsgroups-Clustering (Lang, 1995).\n\n    - STS: STS12 (Agirre et al., 2012), STS22 (Chen et al., 2022), STS-Benchmark (Cer et al.,\n2017).\n\n**Training Detail. We fine-tune the Mistral-7B model using a contrastive loss and conduct the pro-**\ncess over a single epoch. For efficient fine-tuning, we employ Low-Rank Adaptation (LoRA) (Hu\net al., 2021), setting the LoRA rank to 64 and the LoRA alpha to 32, with a learning rate of 1e-4.\nFor retrieval tasks, we use in-batch negatives, a strategy not adopted for other tasks. Each dataset\nincorporates 7 hard negatives. The batch size is set to 512 for retrieval tasks and 256 for other types\nof tasks. We maintain consistency by using the same dataset throughout one training step, and the\nmaximum sequence length is set at 512 

In [112]:
cleaned_answer_cache[0]

{'question': "Why does the use of full datasets in training models introduce inconsistencies and potential unfair comparisons, according to the paper 'Making Text Embedders Few-Shot Learners'?",
 'chunk': '-Clustering-{S2S/P2P},_\nTwentyNewsgroups-Clustering (Lang, 1995).\n\n    - STS: STS12 (Agirre et al., 2012), STS22 (Chen et al., 2022), STS-Benchmark (Cer et al.,\n2017).\n\n**Training Detail. We fine-tune the Mistral-7B model using a contrastive loss and conduct the pro-**\ncess over a single epoch. For efficient fine-tuning, we employ Low-Rank Adaptation (LoRA) (Hu\net al., 2021), setting the LoRA rank to 64 and the LoRA alpha to 32, with a learning rate of 1e-4.\nFor retrieval tasks, we use in-batch negatives, a strategy not adopted for other tasks. Each dataset\nincorporates 7 hard negatives. The batch size is set to 512 for retrieval tasks and 256 for other types\nof tasks. We maintain consistency by using the same dataset throughout one training step, and the\nmaximum sequence

We can merge `cleaned_question_cache` and `cleaned_answer_cache` (on `question` and `chunk`) to get the same fields as the HF dataset, and then upload the cleaned data to HF.

In [118]:
# Attach answers to the question records: inner-join the two record lists on
# the (question, chunk) pair, keeping only pairs present on both sides.
df_merged = pd.DataFrame(cleaned_question_cache).merge(
    pd.DataFrame(cleaned_answer_cache),
    how='inner',
    on=['question', 'chunk'],
)

In [120]:
df_merged.head(1)

Unnamed: 0,filename,source,source_type,chunk,question,answer
0,../data/Making_Text_Embedders_Few-Shot_Learner...,Making_Text_Embedders_Few-Shot_Learners_2409.1...,paper,"-Clustering-{S2S/P2P},_\nTwentyNewsgroups-Clus...",What is the purpose of using Low-Rank Adaptati...,"According to the provided text, the purpose of..."


In [126]:
# Convert the merged frame to a column -> list-of-values dict and build a
# Hugging Face Dataset from it.
cleaned_data = Dataset.from_dict(df_merged.to_dict(orient='list'))

In [127]:
cleaned_data

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer'],
    num_rows: 440
})

In [109]:
merged_dataset

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer'],
    num_rows: 501
})

In [128]:
# Append the records recovered from the local caches to the records loaded
# from the Hub.
combined_dataset = concatenate_datasets([merged_dataset, cleaned_data])

In [129]:
combined_dataset

Dataset({
    features: ['filename', 'source', 'source_type', 'chunk', 'question', 'answer'],
    num_rows: 941
})

In [131]:
# Convert through the public Dataset API. Reading `.data` directly returns the
# backing Arrow table and would ignore any indices mapping on the dataset.
df = combined_dataset.to_pandas()

In [146]:
df.source.nunique()

19

In [153]:
df.source.value_counts()

source
Making_Text_Embedders_Few-Shot_Learners_2409.15700v1                                                                                   96
Time-MoE_Billion-Scale_Time_Series_Foundation_Models_with_Mixture_of\n__Experts_2409.16040v1                                           77
GraphLoRA: Structure-Aware Contrastive Low-Rank Adaptation for Cross-Graph Transfer Learning                                           68
Taming Transformers for High Resolution Image Synthesis                                                                                59
Using_LLM_for_Real-Time_Transcription_and_Summarization_of\n__Doctor-Patient_Interactions_into_ePuskesmas_in_Indonesia_2409.17054v1    57
Pruning_Multilingual_Large_Language_Models_for_Multilingual_Inference_2409.16911v1                                                     46
Unsupervised Text Representation Learning via Instruction-Tuning for Zero-Shot Dense Retrieval                                         40
AgentInstruct Toward Genera

In [137]:
# Flag rows whose answer contains the 'NO ANSWER FOUND' marker. The marker is
# matched as a plain substring (regex=False), and missing answers count as
# "not matching" (na=False) so the mask stays strictly boolean.
no_answer_mask = df.answer.str.contains('NO ANSWER FOUND', regex=False, na=False)

# Report how many rows are being removed, then drop them and renumber the index.
print(no_answer_mask.sum())
df = df.loc[~no_answer_mask].reset_index(drop=True)

5


In [148]:
# Keep only the first occurrence of every (question, chunk) pair.
df = df.drop_duplicates(subset=['question', 'chunk'])

In [150]:
# Save the cleaned frame locally as CSV, without writing the pandas index.
df.to_csv('../pipeline/outputs/2024NOV2_combined_cleaned.csv', index=False)

In [151]:
df.shape

(681, 6)

In [152]:
# Upload the *cleaned* frame (`df`: NO-ANSWER rows and duplicate
# question/chunk pairs removed), not the raw `combined_dataset`, so that what
# lands on the Hub matches the commit description. preserve_index=False keeps
# the leftover pandas index out of the uploaded columns.
cleaned_dataset = Dataset.from_pandas(df, preserve_index=False)

# The counts in the description are derived from the data being pushed so
# they cannot drift from it.
cleaned_dataset.push_to_hub(
    repo_id="CPSC532/cleaned_arxiv_qa_data",
    config_name="2024NOV3",
    token=os.getenv("HUGGINGFACE_API_KEY"),
    commit_message="cleaned data",
    commit_description=(
        f"Dataset contains {cleaned_dataset.num_rows} questions and answers "
        f"from {df.source.nunique()} different arxiv pdf files"
    ),
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 89.76ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/CPSC532/cleaned_arxiv_qa_data/commit/d1066063c22b82c4aebfd077718613a9548f4dc3', commit_message='cleaned data', commit_description='Dataset contains 681 questons and answers from 19 different arxiv pdf files', oid='d1066063c22b82c4aebfd077718613a9548f4dc3', pr_url=None, pr_revision=None, pr_num=None)