In [1]:
import pandas as pd

In [24]:
import json

def split_finetuning_data(filtered_data_path, finetuning_data_path, output_matching_path, output_non_matching_path):
    """
    Splits the finetuning data into two JSON files:
    1. Items whose "User_ID" equals a "User_asin" found in the pipeline data.
    2. Items whose "User_ID" is not present in the pipeline data.

    The pipeline data is iterated entry by entry; every entry that has a
    "users" key contributes the "User_asin" of each of its users. Entries
    without "users", and users without "User_asin", are skipped.

    All four files are read/written as UTF-8 so that non-ASCII text survives
    the round trip regardless of the platform's default encoding.

    Parameters:
        filtered_data_path (str): Path to the filtered pipeline data JSON file.
        finetuning_data_path (str): Path to the finetuning dataset JSON file.
        output_matching_path (str): Path to save the JSON file with matching User_IDs.
        output_non_matching_path (str): Path to save the JSON file with non-matching User_IDs.

    Returns:
        None. The results are written to the two output paths and a
        confirmation line is printed for each.

    Raises:
        KeyError: If a finetuning item has no "User_ID" key.
        OSError: If an input file cannot be read or an output file cannot be written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    # Read the inputs with an explicit encoding: the outputs below are UTF-8
    # with ensure_ascii=False, so decoding the inputs with the platform
    # default (e.g. cp1252 on Windows) could corrupt or reject non-ASCII text.
    with open(filtered_data_path, 'r', encoding='utf-8') as filtered_file:
        filtered_data = json.load(filtered_file)

    with open(finetuning_data_path, 'r', encoding='utf-8') as finetuning_file:
        finetuning_data = json.load(finetuning_file)

    # Collect every User_asin from the pipeline data; a set gives O(1)
    # membership checks when partitioning the finetuning items below.
    pipeline_user_ids = {
        user['User_asin']
        for item in filtered_data
        if 'users' in item
        for user in item['users']
        if 'User_asin' in user
    }

    # Partition the finetuning items, preserving their original order.
    matching_items = []
    non_matching_items = []

    for item in finetuning_data:
        # A missing "User_ID" is left to raise KeyError so malformed records
        # are surfaced instead of being silently filed as non-matching.
        if item["User_ID"] in pipeline_user_ids:
            matching_items.append(item)
        else:
            non_matching_items.append(item)

    # Write matching and non-matching items to separate JSON files
    with open(output_matching_path, 'w', encoding='utf-8') as match_file:
        json.dump(matching_items, match_file, ensure_ascii=False, indent=4)

    with open(output_non_matching_path, 'w', encoding='utf-8') as non_match_file:
        json.dump(non_matching_items, non_match_file, ensure_ascii=False, indent=4)

    print(f"Matching data written to {output_matching_path}")
    print(f"Non-matching data written to {output_non_matching_path}")

# Inputs: the pipeline dataset (source of User_asin values) and the
# finetuning candidates that will be partitioned against it.
filtered_data_path = 'qlora_finetuning/qlora_finetuning_dataset_19_11_with_id.json'
finetuning_data_path = 'qlora_finetuning/new_candidate_items_with_profile.json'

# Outputs: one JSON file per partition.
output_matching_path = 'qlora_finetuning/matching_ids.json'
output_non_matching_path = 'qlora_finetuning/non_matching_ids.json'

# Perform the split; keyword arguments make the role of each path explicit.
split_finetuning_data(
    filtered_data_path=filtered_data_path,
    finetuning_data_path=finetuning_data_path,
    output_matching_path=output_matching_path,
    output_non_matching_path=output_non_matching_path,
)


Matching data written to qlora_finetuning/matching_ids.json
Non-matching data written to qlora_finetuning/non_matching_ids.json


None
