In [1]:
import openai
import pandas as pd
from typing import List, Dict
from set_env import set_env


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Register both service credentials before any client is created.
# set_env is a project-local helper; presumably it makes the named key
# available via the process environment — confirm against set_env.py.
for _credential_name in ("OPENAI_API_KEY", "WANDB_API_KEY"):
    set_env(_credential_name)

In [None]:
import weave

# Start Weave under the "medical_data_results" project so calls to
# @weave.op-decorated functions defined later are associated with it.
weave.init(project_name="medical_data_results")

In [None]:
# CSV that the records are loaded from.
medical_dataset_url = "https://raw.githubusercontent.com/wyim/aci-bench/main/data/challenge_data/train.csv"

# Prompt template for the extraction task. `{transcript}` is its only
# placeholder and is filled in with str.format for each record.
medical_task = """
You are extracting insights from some medical records.
The records contain a medical note and a
dialogue between a doctor and a patient. You need
to extract values for the following: Chief
complaint, History of present illness, Physical
examination, symptoms experienced by the patient,
New medications prescribed or changed, including
dosages (N/A if not provided), and Follow-up
instructions (N/A if not provided). Your answer
should not include any personal identifiable
information (PII) such as name, age, gender, or
ID. Use "the patient" instead of their name, for
example. Return your answer as a bullet list,
where each bullet is formatted like •chief
complaint: xx. If there is no value for the key,
the value should be N/A. Keep your response
around 150 words (you may have to summarize some
extracted values to stay within the word limit).
{transcript}
"""

# OpenAI client built with default arguments; presumably it picks up the
# API key from the environment — confirm.
client = openai.OpenAI()


In [None]:
def load_medical_data(url: str, num_samples: int = 100) -> List[Dict]:
    """Read the CSV at ``url`` and return a reproducible random sample of its rows.

    Args:
        url: Location of the CSV; passed straight to ``pd.read_csv``.
        num_samples: Maximum number of rows to draw. If the CSV has fewer
            rows than this, all of its rows are returned instead of raising.

    Returns:
        A list with one dict per sampled row, keyed by column name.
    """
    df = pd.read_csv(url)
    # DataFrame.sample draws without replacement by default and raises
    # ValueError when n exceeds the number of rows, so cap the request at
    # what the file actually contains.
    sample_size = min(num_samples, len(df))
    # Fixed random_state keeps the selected rows identical across runs.
    samples = df.sample(n=sample_size, random_state=42)
    return samples.to_dict('records')

@weave.op()
def process_medical_record(record: Dict) -> Dict:
    """Send one record's conversation text to the chat model for extraction.

    Args:
        record: One dataset row as a dict. The conversation text is read
            from ``record['transcript']``; if that key is absent,
            ``record['dialogue']`` is used instead.

    Returns:
        A dict with ``"input"`` (the conversation text that was sent) and
        ``"output"`` (the message content of the model's first choice).

    Raises:
        KeyError: If the record has neither a ``'transcript'`` nor a
            ``'dialogue'`` key.
    """
    try:
        transcript = record['transcript']
    except KeyError:
        # NOTE(review): the aci-bench challenge CSVs appear to name the
        # conversation column 'dialogue' rather than 'transcript' — confirm
        # against the file. Fall back to it so such rows are still usable.
        if 'dialogue' not in record:
            raise
        transcript = record['dialogue']

    prompt = medical_task.format(transcript=transcript)

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a medical data extractor."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200
    )

    # Only the first choice is used.
    extracted_info = response.choices[0].message.content

    return {
        "input": transcript,
        "output": extracted_info,
    }

def generate_medical_data(num_samples: int = 100) -> List[Dict]:
    """Sample records from the medical dataset and run each through the extractor.

    Args:
        num_samples: Number of rows requested from ``load_medical_data``.

    Returns:
        The ``process_medical_record`` result for every sampled record, in
        the order the records were loaded.
    """
    sampled_records = load_medical_data(medical_dataset_url, num_samples)
    # Records are processed one at a time, in order.
    return [process_medical_record(rec) for rec in sampled_records]

In [None]:
# Run the pipeline end to end: sample records, then one model call per record.
results = generate_medical_data(num_samples=100)