This file: Extracts the standardized GPT classification results from the raw JSONL output file.

Input:
- 2_batch_api_output.jsonl: GPT raw output JSONL file
- 0_investment_sentences.csv: original sentence level dataset

Output:
- 3_gpt_result_1.csv: standardized GPT classification results
- 3_gpt_result_2.csv: standardized GPT classification results, mapped back to the original sentence text

In [1]:
import json
import csv
import re
import pandas as pd
import os

Note: If you're using Google Colab, run the following two cells; otherwise, skip them.

In [2]:
# Colab only: mount Google Drive at /content/drive so the project folder is
# reachable from this runtime. The google.colab package is not importable
# outside Colab, so skip this cell when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Colab only: switch to the project folder on the mounted Drive so the relative
# file names configured below resolve there.
# NOTE(review): this path is hard-coded to one Drive layout - adjust it (or skip
# this cell) when running from a different location.
os.chdir('/content/drive/MyDrive/Colab Notebooks/acct4_ta_s1')

# Set Up File Path

In [7]:
# All paths are relative to the current working directory.
# Raw GPT batch output, one JSON record per line.
input_file = "2_batch_api_output.jsonl"
# Original sentence-level dataset (read for its sentence_id / sentence columns).
raw_file = "0_investment_sentences.csv"
# Standardized classification results extracted from input_file.
output_file_1 = "3_gpt_result_1.csv"
# Same results with the original sentence text attached.
output_file_2 = "3_gpt_result_2.csv"

# Functions to extract the standardized answer from the JSON text

In [8]:
def extract_json_from_text(text):
    """Return the payload of the first Markdown code fence found in ``text``.

    A fence tagged exactly ```` ```json ```` is preferred. If none is present,
    the first fence with no language tag, or with a differently cased ``json``
    tag (e.g. ```` ```JSON ````), is used instead.

    Args:
        text: Message content to search. Anything that is not a ``str``
            (for example ``None``) is treated as containing no fence.

    Returns:
        The text between the opening and closing fence with surrounding
        whitespace removed, or ``None`` when no complete fence is found.
        The result may be an empty string if the fence is empty.
    """
    if not isinstance(text, str):
        return None

    # Keep the strict pattern first so a properly tagged block still wins over
    # any untagged fence that happens to appear earlier in the text.
    match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if match is None:
        # Fallback: accept an untagged fence or a case variant of the tag.
        match = re.search(
            r'```(?:json)?\s*(.*?)\s*```', text, re.DOTALL | re.IGNORECASE
        )
    if match:
        return match.group(1)
    return None

In [None]:
def _parse_classification_json(content):
    """Parse one model reply into the Python object it encodes.

    Strategies are tried from most to least reliable:

    1. JSON inside a Markdown code fence (via ``extract_json_from_text``).
    2. The stripped content parsed as-is, including the case where the whole
       JSON document was itself encoded as a JSON string.
    3. Legacy clean-up: drop one pair of wrapping quotes, then un-escape
       ``\\"`` and collapse doubled quotes before parsing.

    Step 3 runs only when step 2 does not yield a dict, because its
    replacements corrupt valid JSON that contains empty strings (``""``) or
    escaped quotes.

    Args:
        content: The ``content`` value of the first choice's message.

    Returns:
        The decoded JSON value (expected to be a dict by the caller).

    Raises:
        ValueError: If ``content`` is not a string, or if no strategy yields
            parseable JSON (``json.JSONDecodeError`` is a ``ValueError``).
    """
    if not isinstance(content, str):
        raise ValueError("message content is missing or is not text")

    fenced = extract_json_from_text(content)
    if fenced:
        return json.loads(fenced)

    raw = content.strip()
    try:
        parsed = json.loads(raw)
        if isinstance(parsed, str):
            # The reply was a JSON string wrapping the real JSON document.
            parsed = json.loads(parsed)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Last resort for CSV-style or over-escaped output.
    if raw.startswith('"') and raw.endswith('"'):
        raw = raw[1:-1]
    raw = raw.replace('\\"', '"').replace('""', '"')
    return json.loads(raw)


def process_jsonl_file(input_file, output_file):
    """Extract the classification fields from a JSONL file into a CSV.

    Each non-blank input line is decoded as JSON, and the reply text is read
    from ``response -> body -> choices[0] -> message -> content``. The JSON
    found in that text supplies the ``id``, ``invest_plan``, ``invest_target``
    and ``invest_amount`` columns; keys absent from the reply are left empty.

    Lines that cannot be processed do not stop the run. The error is printed
    and, after all successful rows, one row per failure is appended with an
    ``ERROR_<n>`` id and the raw input line stored in ``invest_plan``.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the CSV file to write (UTF-8, overwritten).

    Returns:
        None. The result is written to ``output_file``.
    """
    fieldnames = [
        'id',
        'invest_plan',
        'invest_target',
        'invest_amount'
    ]
    error_lines = []

    with open(input_file, 'r', encoding='utf-8') as f_in, \
         open(output_file, 'w', newline='', encoding='utf-8') as f_out:

        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()

        for line in f_in:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and must not be reported as failures.
            if not line.strip():
                continue

            try:
                line_data = json.loads(line)
                summary_text = (
                    line_data.get('response', {})
                             .get('body', {})
                             .get('choices', [])[0]
                             .get('message', {})
                             .get('content', '')
                )
                data = _parse_classification_json(summary_text)
                writer.writerow({name: data.get(name) for name in fieldnames})

            except Exception as e:
                # Deliberately broad: one malformed record must not abort the
                # whole extraction; the raw line is kept for manual review.
                print(f"Error processing line: {e}")
                error_lines.append({
                    'id': f"ERROR_{len(error_lines)}",
                    'invest_plan': line.strip(),
                    'invest_target': "",
                    'invest_amount': ""
                })

        # Failed lines go after all successful rows.
        for err in error_lines:
            writer.writerow(err)

# Function to map the GPT classification results back to the original sentence dataset

In [None]:
def combine_with_raw_og_data(output_file_1, raw_file, output_file):
    """Attach the original sentence text to each classification row.

    Rows of ``output_file_1`` are matched to rows of ``raw_file`` by comparing
    the ``id`` column with the ``sentence_id`` column as text. The matching
    ``sentence`` value is stored in a new ``Original_Text`` column, which is
    left empty when no match exists. If a ``sentence_id`` occurs more than
    once in ``raw_file``, the last occurrence is used.

    Args:
        output_file_1: Path of the CSV holding the classification results;
            must contain an ``id`` column.
        raw_file: Path of the CSV holding the original sentences; must
            contain ``sentence_id`` and ``sentence`` columns.
        output_file: Path of the combined CSV to write (overwritten).

    Returns:
        None. The combined table is written to ``output_file``.
    """
    # Read both key columns as text. With dtype inference a single missing id
    # turns the column into floats, so "1" becomes "1.0" after a string cast
    # and no row matches.
    output_df = pd.read_csv(output_file_1, dtype={'id': str})
    raw_sentence = pd.read_csv(raw_file, dtype={'sentence_id': str})

    # Rows without a sentence_id cannot be looked up; dropping them also stops
    # a missing result id from matching a missing sentence id.
    keyed_sentences = raw_sentence.dropna(subset=['sentence_id'])
    guid_to_text = dict(
        zip(keyed_sentences['sentence_id'], keyed_sentences['sentence'])
    )

    # Add a new column for the original text
    output_df['Original_Text'] = output_df['id'].map(guid_to_text)

    # Save to csv file
    output_df.to_csv(output_file, index=False)
    print(f"Combined data saved to {output_file}")

In [9]:
# Step 1: parse the raw batch output and write the standardized results CSV.
process_jsonl_file(input_file, output_file_1)
print(f"Data extracted and saved to {output_file_1}")

# Step 2: re-read that CSV and add the original sentence text for each id.
# Must run after step 1 because it reads the file step 1 writes.
combine_with_raw_og_data(output_file_1, raw_file, output_file_2)

Data extracted and saved to 3_gpt_result_1.csv
Combined data saved to 3_gpt_result_2.csv
