In [None]:
import json

def combine_with_target(train_file, val_file, output_file):
    """Attach a ``target`` product name to training users and save the result.

    Both input files are JSON documents holding a list of user records. For
    every validation user whose ``reviews`` list is non-empty, the
    ``product_name`` of the first review becomes that user's target. Training
    users whose ``user_id`` maps to a truthy target receive a ``target``
    field; every other training user is written out unchanged.

    Parameters:
        train_file (str): Path to the training JSON file.
        val_file (str): Path to the validation JSON file.
        output_file (str): Path the combined JSON is written to.
    """
    def read_json(path):
        # Both inputs are read the same way: UTF-8 text parsed as JSON.
        with open(path, 'r', encoding='utf-8') as handle:
            return json.load(handle)

    train_users = read_json(train_file)
    val_users = read_json(val_file)

    # user_id -> product_name of the first validation review
    targets = {}
    for record in val_users:
        if record['reviews']:
            uid = record['user_id']
            targets[uid] = record['reviews'][0]['product_name']

    for record in train_users:
        product = targets.get(record['user_id'])
        if not product:
            # No validation entry (or an empty product name): leave the user as-is.
            continue
        record['target'] = product

    # ensure_ascii=False keeps non-ASCII product names readable in the output file.
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(train_users, handle, ensure_ascii=False, indent=4)

# Merge the sorted training data with the validation targets.
# Arguments, in order: training input, validation input, combined output.
combine_with_target(
    'new_data/sorted_train_output.json',
    'new_data/new_val_output.json',
    'new_data/combined_output_for_training_chatGPT.json',
)

print("Combined JSON file with target field saved as 'new_data/combined_output_for_training_chatGPT.json'")


Combined JSON file with target field saved as 'new_data/combined_output.json'


## Extract the data from the ChatGPT output into Alpaca format

In [2]:
import re
import json
import pandas as pd

def _parse_user_profiles(content):
    """Parse raw profile text into a list of per-user dictionaries.

    The text is expected to consist of blocks that each start with a header of
    the form ``User ID: <ID>`` where ``<ID>`` matches ``[A-Z0-9]+``. Anything
    before the first header is discarded.

    Parameters:
        content (str): Full text of the profile file.

    Returns:
        list[dict]: One dict per user with the keys ``User_ID`` (the ID from
        the header), ``User_Profile`` (the block flattened to a single line
        with the target-product markers removed) and ``Candidate_Items`` (a
        dict mapping ``"1"``, ``"2"``, ... to the candidate item lines, or an
        empty dict when no candidate list is found).
    """
    # The capturing group makes re.split keep the header lines, so after
    # dropping the leading preamble the list alternates header, body, header, ...
    pieces = re.split(r"(User ID: [A-Z0-9]+)", content)[1:]

    user_profiles = []
    for header, body in zip(pieces[0::2], pieces[1::2]):
        profile = body.strip()

        # Flatten the block: every run of newlines (plus following whitespace)
        # becomes a single space.
        profile_text = re.sub(r"(\n\s*)+", " ", profile).strip()

        # Remove unwanted marker strings from the profile text
        profile_text = re.sub(r"\(similar to target product\)", "", profile_text, flags=re.IGNORECASE).strip()
        profile_text = re.sub(r"\(target product\)", "", profile_text, flags=re.IGNORECASE).strip()

        # Candidate items are read from the un-flattened block, where each item
        # sits on its own line after a newline and four whitespace characters.
        candidate_items_match = re.search(r"Candidate Items:\s*((?:\n\s{4}.*)+)", profile)
        if candidate_items_match:
            items = [
                re.sub(r"\(target product\)|\(similar to target product\)", "", item, flags=re.IGNORECASE).strip()
                for item in candidate_items_match.group(1).split('\n') if item.strip()
            ]
            numbered_items = {str(position): item for position, item in enumerate(items, start=1)}
        else:
            numbered_items = {}

        # NOTE: User_Profile deliberately still contains the flattened
        # "Candidate Items: ..." section. The original code tried to strip it
        # with a newline-based pattern, but profile_text has no newlines left at
        # that point, so the substitution could never match. The no-op has been
        # dropped rather than "fixed" because the validation step later in this
        # notebook requires the "Candidate Items" heading in User_Profile.

        user_profiles.append({
            "User_ID": header.split(": ")[1].strip(),
            "User_Profile": profile_text,
            "Candidate_Items": numbered_items
        })

    return user_profiles


def transform_user_profiles(input_file_path, output_file_path, output_format='json'):
    """Convert a text file of user profiles into structured JSON or CSV.

    Parameters:
        input_file_path (str): Path to the text file; read as UTF-8.
        output_file_path (str): Path of the file to write.
        output_format (str): ``'json'`` (default) writes a list of user dicts
            with 4-space indentation; ``'csv'`` writes the same records
            flattened by ``pandas.json_normalize``. Any other value prints a
            message and writes nothing.

    Returns:
        None
    """
    # Pin the encoding so non-ASCII text is decoded the same way on every
    # platform instead of depending on the locale default.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    user_profiles = _parse_user_profiles(content)

    # Output based on the selected format
    if output_format == 'json':
        with open(output_file_path, 'w', encoding='utf-8') as json_file:
            json.dump(user_profiles, json_file, indent=4)
    elif output_format == 'csv':
        # Convert to DataFrame and expand Candidate Items into dotted columns
        df = pd.json_normalize(user_profiles)
        df.to_csv(output_file_path, index=False)
    else:
        print("Unsupported output format. Choose either 'json' or 'csv'.")

# Example usage: convert the raw ChatGPT profile text into structured JSON.
# For CSV output, point output_file_path at e.g. 'output_file.csv' and pass output_format='csv'.
input_file_path = 'chatGPT_output_UP.txt'
output_file_path = 'chatGPT_UP_output.json'
transform_user_profiles(
    input_file_path=input_file_path,
    output_file_path=output_file_path,
    output_format='json',
)


In [10]:
import json

def validate_user_profiles_from_json(file_path, required_keywords):
    """
    Loads user profiles from a JSON file and validates that each profile contains all required keywords.

    A keyword counts as present when it occurs as a substring of the profile's
    "User_Profile" value; the comparison is case-sensitive.

    Parameters:
        file_path (str): Path to the JSON file containing user profiles. The file
            is read as UTF-8 and must hold a list of objects with "User_ID" and
            "User_Profile" keys.
        required_keywords (list): List of keywords that each profile must contain.

    Returns:
        dict: Dictionary with user IDs as keys and a list of missing keywords as values.
            Users with nothing missing are omitted; if a user ID appears more
            than once, the last profile with missing keywords wins.

    Raises:
        KeyError: If a profile lacks the "User_ID" or "User_Profile" key.
    """
    # Pin the encoding so profiles containing non-ASCII text load the same way
    # on every platform instead of depending on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        profiles = json.load(file)

    missing_keywords_report = {}

    # Validate each profile
    for profile in profiles:
        user_id = profile["User_ID"]
        profile_text = profile["User_Profile"]

        # Keep the caller's keyword order in the report
        missing_keywords = [keyword for keyword in required_keywords if keyword not in profile_text]

        # Only record if there are missing keywords
        if missing_keywords:
            missing_keywords_report[user_id] = missing_keywords

    return missing_keywords_report

# Example usage:
# Section headings that every generated profile is expected to contain.
required_keywords = [
    "Short-term Intentions", "Long-term Preferences", "Item Descriptions",
    "Preferences and Dislikes", "Explicit and Implicit Signals", "Contextual Features", "Candidate Items"
]

# Path to the JSON file containing user profiles: the file written by
# transform_user_profiles in the previous cell. Binding it to file_path makes
# the variable that is documented here the one that is actually validated.
file_path = output_file_path
missing_report = validate_user_profiles_from_json(file_path, required_keywords)

# Output the results
if missing_report:
    for user_id, missing in missing_report.items():
        print(f"User ID {user_id} is missing the following sections: {', '.join(missing)}")
else:
    print("All user profiles contain the required sections.")


All user profiles contain the required sections.
