In [1]:
import pandas as pd
import os
import json

# Define the function to extract conversations from the ChatGPT sharing data
def extract_conversations(chatgpt_sharing):
    """
    Extract detailed ChatGPT conversations from the given sharing data.

    Every item found under an entry's 'Conversations' field becomes one flat
    dictionary that combines the item's own fields (Prompt, Answer, ListOfCode)
    with the metadata of the entry it belongs to (URL, date, model, counts).

    Malformed records are skipped instead of raising: entries that are not
    dictionaries, entries whose 'Conversations' value is missing or is not a
    list/tuple (for example JSON null), and conversation items that are not
    dictionaries.

    Parameters:
    chatgpt_sharing (list): A list containing ChatGPT sharing entries.

    Returns:
    list or None: A list of dictionaries, each representing a conversation with
    detailed information. None is returned when chatgpt_sharing is not a list
    (for example a missing value), so callers can tell "no data" apart from
    "data with zero conversations".
    """
    if not isinstance(chatgpt_sharing, list):
        return None

    conversations = []
    for entry in chatgpt_sharing:
        # A null or otherwise non-dict entry has no fields to read.
        if not isinstance(entry, dict):
            continue

        turns = entry.get('Conversations')
        # Covers both an absent key and an explicit null value.
        if not isinstance(turns, (list, tuple)):
            continue

        for convo in turns:
            if not isinstance(convo, dict):
                continue
            conversations.append({
                "Prompt": convo.get("Prompt"),
                "Answer": convo.get("Answer"),
                "ListOfCode": convo.get("ListOfCode"),
                "ChatgptURL": entry.get("URL"),
                "DateOfConversation": entry.get("DateOfConversation"),
                "Model": entry.get("Model"),
                "NumberOfPrompts": entry.get("NumberOfPrompts"),
                "TokensOfPrompts": entry.get("TokensOfPrompts"),
                "TokensOfAnswers": entry.get("TokensOfAnswers"),
            })
    return conversations

# Define the function to process a single JSON file
def process_json_file(file_path):
    """
    Process a single JSON file to extract and flatten ChatGPT conversation data.

    The file is expected to have a top-level 'Sources' field; each source is
    flattened into a row, and every conversation found in a row's
    'ChatgptSharing' value is emitted as one output row together with the
    source's type, URL, author and title.

    Parameters:
    file_path (str): The path to the JSON file.

    Returns:
    pd.DataFrame: A DataFrame containing the processed data, one row per
    conversation. An empty DataFrame is returned when the file cannot be read
    or parsed, when it has no 'Sources' field, or when no conversations are
    found; load failures are reported with a printed message rather than raised.
    """
    try:
        # Load the JSON file
        df = pd.read_json(file_path)
        # Flatten nested data in the 'Sources' column
        df = pd.json_normalize(df['Sources'])
    except (ValueError, KeyError, OSError) as e:
        # OSError (which includes FileNotFoundError and PermissionError) is
        # handled as well, so one unreadable file cannot abort a batch run.
        print(f"Failed to load file {file_path}: {e}")
        return pd.DataFrame()  # Return an empty DataFrame to avoid interruption

    # Select relevant columns for processing; not every file has all of them
    basic_columns = ['Type', 'URL', 'Author', 'Title', 'Body', 'CreatedAt', 'State', 'ChatgptSharing']
    basic_columns_in_df = [col for col in basic_columns if col in df.columns]
    basic_df = df[basic_columns_in_df]

    # Extract and flatten conversation data
    conversation_data = []
    for _, row in basic_df.iterrows():
        # Rows without sharing data yield None from extract_conversations and are skipped
        conversations = extract_conversations(row.get('ChatgptSharing', None))
        if not conversations:
            continue

        # Source-level fields are the same for every conversation of this row.
        # The "Unknown" title default applies only when the 'Title' column is
        # absent from the file altogether.
        source_fields = {
            "SourceType": row.get('Type', None),         # Source type
            "SourceURL": row.get('URL', None),           # Original source URL
            "SourceAuthor": row.get('Author', None),     # Original author
            "SourceTitle": row.get('Title', "Unknown"),  # Default title if column is missing
        }
        for convo in conversations:
            conversation_data.append({**source_fields, **convo})
    return pd.DataFrame(conversation_data)

# Process all JSON files in the specified folder
folder_path = "./"  # Path to the folder containing JSON files
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the merged output reproducible from run to run.
all_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

# Collect the per-file DataFrames and concatenate them once at the end.
# Concatenating inside the loop would recopy all accumulated rows on every
# iteration and would start from an empty DataFrame.
file_frames = []
for file in all_files:
    file_path = os.path.join(folder_path, file)
    print(f"Processing file: {file_path}")
    file_data = process_json_file(file_path)
    if not file_data.empty:
        file_frames.append(file_data)

# pd.concat raises ValueError on an empty list, so fall back to an empty DataFrame
if file_frames:
    all_conversations = pd.concat(file_frames, ignore_index=True)
else:
    all_conversations = pd.DataFrame()

# Check if there is any data to save
if not all_conversations.empty:
    # Save the merged data to a CSV file
    output_csv_path = "merged_conversations.csv"
    # utf-8-sig writes a UTF-8 byte-order mark at the start of the file
    all_conversations.to_csv(output_csv_path, index=False, encoding='utf-8-sig')
    print(f"Data successfully saved to {output_csv_path}")
else:
    print("No valid data was processed. No output file was generated.")


Processing file: ./20230831_060603_pr_sharings.json
Processing file: ./20230831_061759_issue_sharings.json
Processing file: ./20230831_061926_discussion_sharings.json
Processing file: ./20230831_063412_commit_sharings.json
Processing file: ./20230831_072722_file_sharings.json
Processing file: ./20230831_073827_hn_sharings.json
Data successfully saved to merged_conversations.csv
