In [1]:
import json
import re
import pandas as pd

def parse_perturbed_output(content):
    """
    Parse a perturbed-output message into its six fields.

    The fields are "Question", "Option A", "Option B", "Option C", "Answer"
    and "Category".

    Parsing happens in two passes:

    1. Every line after the first line containing "Output Format:" that looks
       like "- <Field>: <value>" is read into the matching field.
    2. If any field is still empty, every line of the text is scanned for a
       "<Field>:" marker (an optional leading "1. "-style number is ignored).
       This pass is less reliable, so it only fills fields that pass 1 left
       empty and never overwrites a value taken from the Output Format block.
       When a marker occurs on several lines, the last occurrence wins.

    Markdown bold markers ("**") are stripped before parsing. Single asterisks
    are kept on purpose because they may be part of the text itself (for
    example a multiplication sign).

    Args:
        content: The message text to parse. ``None`` is treated as empty text.

    Returns:
        dict: The six field names mapped to their extracted string values;
        a field that could not be found maps to "".
    """
    fields = ("Question", "Option A", "Option B", "Option C", "Answer", "Category")
    extracted = {field: "" for field in fields}

    # A missing message body has nothing to parse; report every field as empty.
    if content is None:
        return extracted

    # Strip Markdown bold markers only; a lone "*" may be meaningful text.
    content = content.replace("**", "")
    lines = content.splitlines()

    # Pass 1: structured "- <Field>: <value>" lines inside the Output Format block.
    in_output_format = False
    for line in lines:
        if "Output Format:" in line:
            in_output_format = True
            continue
        if in_output_format:
            match = re.match(r"^- (.*?):\s*(.*)", line.strip())
            if match:
                key = match.group(1).strip()
                if key in extracted:
                    extracted[key] = match.group(2).strip()

    # If the block supplied every field there is nothing left to recover.
    if all(extracted.values()):
        return extracted

    # Pass 2 (fallback): scan the whole text for "<Field>:" markers.
    # Fields already taken from the block are trusted and must not be clobbered
    # by incidental mentions elsewhere in the text.
    from_block = {field for field, value in extracted.items() if value}
    for line in lines:
        # Drop a leading list number such as "3. " before looking for markers.
        line_clean = re.sub(r"^\d+\.\s*", "", line.strip())
        for field in fields:
            marker = field + ":"
            if marker in line_clean:
                if field not in from_block:
                    extracted[field] = line_clean.split(marker, 1)[1].strip()
                # Only the first matching marker (in field order) claims a line.
                break

    return extracted

def build_dataframe_from_batch_output(jsonl_filepath):
    """
    Build a DataFrame of parsed perturbed outputs from a batch-output JSONL file.

    Each non-blank line is decoded as JSON, the message text is read from
    record["response"]["body"]["choices"][0]["message"]["content"], and that
    text is passed to ``parse_perturbed_output``. Each successfully parsed line
    contributes one row.

    Processing is best-effort: a line that cannot be decoded or navigated is
    reported on stdout together with its 1-based line number and then skipped,
    so one bad record does not abort the whole file.

    Args:
        jsonl_filepath: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        pandas.DataFrame: Columns "Question", "Option A", "Option B",
        "Option C", "Answer", "Category"; one row per parsed line.
    """
    records = []

    # Explicit encoding so decoding does not depend on the platform default.
    with open(jsonl_filepath, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines carry no record; skip them instead of reporting an error.
            if not line.strip():
                continue
            try:
                record = json.loads(line)
                # Navigate to the content field inside the batch response
                content = record.get("response", {}) \
                                .get("body", {}) \
                                .get("choices", [{}])[0] \
                                .get("message", {}) \
                                .get("content", "")

                records.append(parse_perturbed_output(content))
            except Exception as e:
                # Deliberately broad: report the offending line and keep going.
                print(f"Error parsing line {line_no}:", e)

    df = pd.DataFrame(records, columns=["Question", "Option A", "Option B", "Option C", "Answer", "Category"])
    return df

# Batch output JSONL file to parse
batch_output_file = "batch_maths_gpt4.jsonl"

# Turn every parsed response in that file into one DataFrame row
df_output = build_dataframe_from_batch_output(batch_output_file)

# Optional export, currently disabled (CSV shown; pickle is an alternative):
# df_output.to_csv("perturbed_outputs.csv", index=False)
print(f"DataFrame created with columns: {df_output.columns.tolist()}")
print(df_output.head())

DataFrame created with columns: ['Question', 'Option A', 'Option B', 'Option C', 'Answer', 'Category']
                                            Question               Option A  \
0  In planning an event, the combined durations f...  66 months of duration   
1  In the context of an examination period, if th...   25.7 months in total   
2  At a camp, the registration period is 6 months...    One year and a half   
3  For a retreat, the group projects require 11 m...  16 months and 8 weeks   
4  A theatrical play runs for a period four times...   26.8 months duration   

                    Option B               Option C Answer     Category  
0      71 months of duration  69 months of duration      A  Computation  
1       28.7 months in total   24.7 months in total      C  Computation  
2  One year and eight months   A year and a quarter      C  Computation  
3      14 months and 8 weeks  18 months and 8 weeks      B  Computation  
4       20.8 months duration   21.8 months duration 

In [10]:
# Preview the first row of df_output, one column at a time
for col in df_output.columns:
    print(col)                     # column name
    print(df_output[col].iloc[0])  # that column's value in the first row (by position)


Question
In planning an event, the combined durations for sound check, cleanup, and the actual event equal to 99 months. Given that the sound check lasts 11 months and the cleanup spans twice that duration, what is the duration of the actual event?
Option A
66 months of duration
Option B
71 months of duration
Option C
69 months of duration
Answer
A
Category
Computation


In [2]:
# Persist the parsed DataFrame as a pickle file (relative path, so it is
# written to the current working directory).
df_output.to_pickle('dataframe_perturbated_maths.pkl')