# MEDQA

In [4]:
import json

# Load your JSON data
with open('medqa_data.json') as f:
    qa_data = json.load(f)

# Create a list to store the responses and track success
responses = []

In [8]:
from tqdm import tqdm 

# Explanation template with split question logic and updated format
def generate_explanation_v4(case):
    template = """{case_summary}\n{diagnosis}"""
    
    # Split the question to extract the case summary and the last part (the actual question)
    question_parts = case['question'].rsplit('.', 1)
    
    # If there is a clear split between case summary and question
    if len(question_parts) == 2:
        case_summary = question_parts[0].strip() + "."
        actual_question = question_parts[1].strip()
    else:
        # If there's no clear split, use the whole question as the summary
        case_summary = case['question']
        actual_question = ""
    
    # Create the diagnosis by combining the actual question and the correct answer
    diagnosis = f"Based on the patient's condition, the most likely diagnosis or action for the question: '{actual_question}' is: {case['answer']}."
    
    # Filling in the template with the generated sections
    explanation = template.format(
        case_summary=case_summary,
        diagnosis=diagnosis
    )
    
    return explanation

# Iterate over the dataset, generate explanations and append them to the JSON
for case in qa_data:
    explanation = generate_explanation_v4(case)
    case['generated_explanation'] = explanation

# Convert the modified data back to JSON format
output_json = json.dumps(qa_data, indent=4)

# Output the result to verify (you can save it to a file instead)
print(output_json)

# Optionally, write the result to a file
with open('qa_data_with_explanations_compact.json', 'w') as f:
    f.write(output_json)


[
    {
        "question": "A junior orthopaedic surgery resident is completing a carpal tunnel repair with the department chairman as the attending physician. During the case, the resident inadvertently cuts a flexor tendon. The tendon is repaired without complication. The attending tells the resident that the patient will do fine, and there is no need to report this minor complication that will not harm the patient, as he does not want to make the patient worry unnecessarily. He tells the resident to leave this complication out of the operative report. Which of the following is the correct next action for the resident to take?",
        "answer": "Tell the attending that he cannot fail to disclose this mistake",
        "answer_idx": "opb",
        "opa": "Disclose the error to the patient and put it in the operative report",
        "opb": "Tell the attending that he cannot fail to disclose this mistake",
        "opc": "Report the physician to the ethics committee",
        "opd":

# MEDMQA

In [38]:
# Read the original JSON file
with open("raw_medmqa.json", 'r') as f:
    # Read lines and strip whitespace
    raw_medmqa = [line.strip() for line in f if line.strip()]

In [50]:
# Function to convert second format to the first format
def convert_to_first_format(data):
    converted_data = []
    not_true_questions = 0

    for entry in data:
        entry = json.loads(entry)
        exp = f'Here is an additional explanation: {entry['exp']}' if entry['exp'] != None else ''

        # Skip entries that contain "not true" in the question
        if "not true" in entry["question"].lower() and entry['exp'] == None:
            not_true_questions += 1
            continue

        # Find the correct answer based on the cop field
        correct_answer_key = f"op{chr(96 + entry['cop'])}"  # 'a' is chr(97), 'b' is chr(98), etc.
        correct_answer = entry[correct_answer_key]
        
        # Structure the new entry to match the first format
        new_entry = {
            "question": entry["question"],
            "answer": correct_answer,
            "opa": entry["opa"],
            "opb": entry["opb"],
            "opc": entry["opc"],
            "opd": entry["opd"],
            "generated_explanation": f"""{entry["question"]} {correct_answer}. {exp}"""
        }

        # Append the new entry to the list
        converted_data.append(new_entry)
    
    return converted_data, not_true_questions

# Convert the data
converted_data, not_true_questions = convert_to_first_format(raw_medmqa)
print(f"Skipped {not_true_questions} unexplained 'not true' questions.")

# Convert the modified data back to JSON format
output_json = json.dumps(converted_data, indent=4)

# Output the result to verify (you can save it to a file instead)
print(output_json)

# Optionally, write the result to a file
with open('medmqa_data.json', 'w') as f:
    f.write(output_json)

Skipped 18 unexplained 'not true' questions.
[
    {
        "question": "Which of the following is not true about glomerular capillaries')",
        "answer": "The oncotic pressure of the fluid leaving the capillaries is less than that of fluid entering it",
        "opa": "The oncotic pressure of the fluid leaving the capillaries is less than that of fluid entering it",
        "opb": "Glucose concentration in the capillaries is the same as that in glomerular filtrate",
        "opc": "Constriction of afferent aeriole decreases the blood flow to the glomerulas",
        "opd": "Hematocrit of the fluid leaving the capillaries is less than that of the fluid entering it",
        "generated_explanation": "Which of the following is not true about glomerular capillaries') The oncotic pressure of the fluid leaving the capillaries is less than that of fluid entering it. Here is an additional explanation: Ans-a. The oncotic pressure of the fluid leaving the capillaries is less than that of f

In [53]:
import pandas as pd

print(output_json[0])

[
