In [None]:
import os
import openai
from dotenv import load_dotenv

# Load environment variables from the local 'setting.env' file
# (presumably where OPENAI_API_KEY is defined — confirm).
load_dotenv('setting.env')

# OPENAI_API_KEY is read here but is not passed to the client below.
# NOTE(review): openai.OpenAI() is constructed without arguments, so it
# presumably picks the key up from the environment itself — confirm.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
client = openai.OpenAI()

In [None]:
# Which drug group this run processes; must be a key of DRUGS_BY_TARGET.
target = 'glp1'

# Brand names to look up in openFDA, grouped by target.
DRUGS_BY_TARGET = {
    'glp1': ['Byetta', 'Victoza', 'Saxenda', 'Ozempic', 'Wegovy', 'Mounjaro', 'Zepbound', 'Januvia'],
    'metformin': ['Metformin hydrochloride', 'Janumet', 'Synjardy', 'Actoplus Met', 'Glucovance', 'Jentadueto', 'Prandimet'],
}

# Fail here with a clear message instead of leaving DRUGS undefined and
# hitting a NameError in a later cell.
if target not in DRUGS_BY_TARGET:
    raise ValueError(f'Unknown target {target!r}; expected one of {sorted(DRUGS_BY_TARGET)}')

DRUGS = DRUGS_BY_TARGET[target]

In [None]:
# Get openFDA cfm URLs 
import requests


def get_review_cfm(drugs: list[str]) -> dict:
    """Collect the 'Review' application documents (.cfm pages) for each drug.

    For every brand name, queries the openFDA drugsfda endpoint, takes the
    first search result and scans all of its submissions for application
    documents whose type is 'Review' and whose URL ends with '.cfm'.

    Args:
        drugs: Brand names to search for.

    Returns:
        Mapping of drug name -> list of matching application-doc dicts.
        Drugs whose lookup failed or that have no matching document are
        absent from the mapping.
    """
    URL_FORMAT = 'https://api.fda.gov/drug/drugsfda.json?search=openfda.brand_name:"{drug}"'

    review_cfm_dict = dict()
    for drug in drugs:
        try:
            # A timeout keeps one unresponsive request from hanging the whole cell.
            response = requests.get(URL_FORMAT.format(drug=drug), timeout=30)
            response.raise_for_status()  # handle HTTP errors
            results = response.json().get('results') or []
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON error is not a RequestException subclass.
            print(f'{drug} - HTTP Request failed: {e}')
            continue

        if not results:
            print(f'{drug} - no results returned')
            continue

        data = results[0]

        for submission in data.get('submissions') or []:
            for doc in submission.get('application_docs') or []:
                if doc.get('type', '') != 'Review':
                    continue

                if doc.get('url', '').endswith('.cfm'):     # TODO: include additional submitted files (PDFs)
                    review_cfm_dict.setdefault(drug, []).append(doc)

        # TODO: handle marketing_status: Discontinued
    return review_cfm_dict


review_cfm_dict = get_review_cfm(DRUGS)

# Show the first review page found for each drug (tab-separated).
for drug_name, review_docs in review_cfm_dict.items():
    first_doc = review_docs[0]
    print(drug_name, first_doc['url'], sep='\t')

In [None]:
# Parse review URLs from HTML
import re
import time
from bs4 import BeautifulSoup
from urllib.parse import urljoin


def normalize_title(raw_text: str, href: str) -> str:
    """Return a cleaned-up title for a review link.

    Whitespace runs in ``raw_text`` are collapsed to single spaces and the
    ends are trimmed. If the result is just "Part N" (case-insensitive), the
    review type is taken from ``href`` so the title stays unambiguous.

    Args:
        raw_text: Anchor text of the link.
        href: Link target; may be None or empty.

    Returns:
        The cleaned title, or "<review type> - Part N" for bare part labels.
    """
    collapsed = re.sub(r"\s+", " ", raw_text).strip()
    lowered_href = (href or "").lower()

    part_match = re.match(r"^part\s*(\d+)$", collapsed, flags=re.I)
    if part_match is None:
        return collapsed

    # Ordered rules: the first one with a keyword found in the href wins.
    href_rules = (
        (("clinpharm",), "Clinical Pharmacology Biopharmaceutics Review(s) - Part {part}"),
        (("pharmr",), "Pharmacology Review(s) - Part {part}"),
        (("medr", "clinical"), "Clinical Review(s) - Part {part}"),
    )
    template = "Review - Part {part}"
    for keywords, candidate in href_rules:
        if any(keyword in lowered_href for keyword in keywords):
            template = candidate
            break

    return template.format(part=part_match.group(1))


def get_review_url(review_cfm_dict: dict) -> dict:
    """Extract review document links from each drug's review landing pages.

    Each page is downloaded and its ``ul li a[href]`` links are scanned. Links
    whose href contains 'pharmr' or 'medr' (case-insensitive) are kept and
    resolved against the directory of the landing page.

    Args:
        review_cfm_dict: Mapping of drug name -> list of dicts, each holding
            a 'url' key for a review landing page.

    Returns:
        Mapping of drug name -> {normalized title: absolute document URL}.
        Pages that cannot be downloaded are reported and skipped.
    """
    review_url_dict = dict()

    for drug, docs in review_cfm_dict.items():
        for doc in docs:
            review_url = doc['url']
            base_url = review_url.rsplit('/', 1)[0] + '/'

            try:
                resp = requests.get(review_url, timeout=30)
                if resp.status_code == 503:
                    # Service temporarily unavailable: wait once, then retry.
                    time.sleep(60)
                    resp = requests.get(review_url, timeout=30)
                resp.raise_for_status()
            except requests.exceptions.RequestException as e:
                # Covers HTTP errors, timeouts and connection failures; skip
                # the page rather than parsing an error body as if it were
                # the table of contents.
                print(f'{drug} - failed to fetch {review_url}: {e}')
                continue

            soup = BeautifulSoup(resp.text, 'html.parser')

            for a in soup.select('ul li a[href]'):
                href = a['href']
                href_l = href.lower()
                if 'pharmr' in href_l or 'medr' in href_l:
                    key = normalize_title(a.get_text(" ", strip=True), href)
                    review_url_dict.setdefault(drug, dict())[key] = urljoin(base_url, href)

    return review_url_dict


review_url_dict = get_review_url(review_cfm_dict)

# Show the title -> URL mapping collected for each drug (tab-separated).
for drug_name, title_to_url in review_url_dict.items():
    print(drug_name, title_to_url, sep='\t')

In [None]:
# Upload files
import os
import io
import json


def load_file_index(file_index_filename: str) -> dict:
    """Read the upload index from a JSON file.

    Args:
        file_index_filename: Path of the JSON index file.

    Returns:
        The parsed index when the file exists and holds a JSON object;
        otherwise an empty dict (missing file, invalid JSON, or a top-level
        value that is not an object).
    """
    if not os.path.exists(file_index_filename):
        return {}

    with open(file_index_filename, 'r', encoding='utf-8') as index_file:
        try:
            loaded = json.load(index_file)
        except json.JSONDecodeError:
            return {}

    if isinstance(loaded, dict):
        return loaded
    return {}


def save_file_index(file_index: dict, file_index_filename: str) -> None:
    """Write the upload index to a JSON file (UTF-8, 2-space indent).

    The index is serialized before the file is opened, so a value that
    cannot be serialized raises without truncating an existing index file.

    Args:
        file_index: Index to persist.
        file_index_filename: Destination path; overwritten on success.

    Raises:
        TypeError: If ``file_index`` contains a non-JSON-serializable value.
    """
    serialized = json.dumps(file_index, ensure_ascii=False, indent=2)
    with open(file_index_filename, 'w', encoding='utf-8') as f:
        f.write(serialized)


def upload_pdf(url: str, filename: str) -> str | None:
    """Download a document and upload it to OpenAI with purpose 'assistants'.

    Args:
        url: Location of the document to download.
        filename: Name given to the uploaded file.

    Returns:
        The id of the created file, or None when the download failed
        (HTTP error status, timeout or connection error).
    """
    try:
        # The whole body is read right below, so streaming is not needed.
        r = requests.get(url, timeout=60)
        r.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f'{filename} - download failed ({url}): {e}')
        return None

    content = io.BytesIO(r.content)
    f = client.files.create(file=(filename, content), purpose='assistants')
    return f.id


def upload_missing_pdfs(review_url_dict: dict, existing_index: dict) -> tuple[list, list]:
    """Upload every review document that is not yet recorded in the index.

    Index entries are stored under the generated filename
    (``'{drug}_{title}.pdf'``), so the "already uploaded" check uses that
    same key. The bare title is also accepted, matching the lookup this
    function performed previously.

    Args:
        review_url_dict: Mapping of drug -> {title: document URL}.
        existing_index: Mapping of drug -> {filename: file id}. Updated in
            place with each new upload.

    Returns:
        ``(file_ids, file_meta)`` for the files uploaded by this call, where
        ``file_meta`` holds ``(file_id, {'drug': ..., 'review': filename})``.
    """
    file_ids = []
    file_meta = []

    for drug, docs in review_url_dict.items():
        for title, pdf_url in docs.items():
            filename = f'{drug}_{title}.pdf'

            known = existing_index.get(drug, {})
            if known.get(filename) or known.get(title):
                continue

            fid = upload_pdf(pdf_url, filename)

            if fid:
                file_ids.append(fid)
                file_meta.append((fid, {'drug': drug, 'review': filename}))
                existing_index.setdefault(drug, {})[filename] = fid

    return file_ids, file_meta


file_index_filename = 'file_index.json'
existing = load_file_index(file_index_filename)

# upload_missing_pdfs updates `existing` in place as each upload succeeds.
# Saving in `finally` keeps the ids of files already uploaded on record even
# if a later upload raises.
try:
    file_ids, file_meta = upload_missing_pdfs(review_url_dict, existing)
finally:
    save_file_index(existing, file_index_filename)

print(f'new uploads: {len(file_ids)}')

In [None]:
# System prompt: fixes the output structure and the evidence rules for the
# TPP draft. The section count and the number of per-section components
# stated here must match the section list and format in the user prompt.
SYSTEM_PROMPT = """You are an expert in drafting Target Product Profiles (TPPs) based strictly on regulatory and drug development source documents.
Prepare the TPP solely based on the provided documents.

Output Requirements (Strict):
- You must write a total of 16 sections, in the exact order specified.
- Each section must include both components below:
  1) [Conclusion]
    - Content intended for inclusion in the corresponding TPP section
  2) [Evidence]
    - Up to five (5) bullet points citing document-based evidence
    - Page numbers are mandatory
    - Include referenced sentences, sections, figures, tables, or table numbers where available
- Maintain the same structure and formatting for every section.
- Construct sentences primarily using terminology and phrasing found in the source documents.

Behavioral Rules (Critical):
- All claims must be supported by evidence from the provided documents obtained via file_search.
- Do not infer, assume, or generalize beyond what is explicitly stated in the documents.
- If no supporting evidence exists, explicitly state “No supporting evidence found in the documents.”
- The [Evidence] section must be written only after performing file_search.
- If file_search yields no relevant results, state “No supporting evidence found in the documents.”
- Before drafting each section, internally construct relevant search queries and perform file_search.
- Write from a preclinical-stage perspective prior to IND submission.
- Do not make clinical assumptions or claims.
"""

# User prompt: states the drafting objective, lists the 16 TPP section
# headings to produce (in order), and shows the per-section output format
# ([Conclusion] followed by [Evidence] bullets).
USER_PROMPT = """Writing Objective: Draft a Target Product Profile(TPP) – Preclinical Stage (Prior to IND Submission)

Before writing each section, internally construct appropriate search queries and perform file_search against the provided documents to locate relevant evidence.
(Example queries: “indication”, “dose”, “toxicology”, “NOAEL”, “safety pharmacology”, “PK”, “heart rate”, “monkey study”, etc.)

=== TPP Section ===
1. Indication and Usage
2. Dosage and Administration
3. Dosage Forms and Strengths
4. Contraindications
5. Warnings and Precautions
6. Adverse Reactions
7. Drug Interactions
8. Use in Specific Populations
9. Drug Abuse and Dependence
10. Overdosage
11. Description
12. Clinical Pharmacology
13. Nonclinical Pharmacology
14. Clinical Studies
15. How Supplied/Storage and Handling
16. Patient Counseling Information

Required Format(Apply to Every Section)
## 1. Indication and Usage
[Conclusion] ...
[Evidence]
- ...
- ...
"""

In [None]:
from typing import Any

def create_client_response_w_rag(vector_store_id: str) -> Any:
    """Request a TPP draft using only file_search over a vector store.

    Sends SYSTEM_PROMPT and USER_PROMPT to the model with the file_search
    tool bound to ``vector_store_id`` and asks for the file_search results
    to be included in the response.

    Args:
        vector_store_id: Id of the vector store to search.

    Returns:
        The response object returned by ``client.responses.create``.
    """
    system_message = {"role": "system", "content": [{"type": "input_text", "text": SYSTEM_PROMPT}]}
    user_message = {"role": "user", "content": [{"type": "input_text", "text": USER_PROMPT}]}
    file_search_tool = {
        "type": "file_search",
        "vector_store_ids": [vector_store_id],
    }

    return client.responses.create(
        model="gpt-5.1",
        input=[system_message, user_message],
        tools=[file_search_tool],
        include=["file_search_call.results"],
        reasoning={"effort": "high"},
    )


def create_client_response_wo_rag(file_ids: str|list) -> Any:
    """Request a TPP draft with the files attached directly (no file_search).

    Each file id becomes an ``input_file`` part of the user message, followed
    by USER_PROMPT as the final text part.

    Args:
        file_ids: One file id, or a list of file ids.

    Returns:
        The response object returned by ``client.responses.create``.
    """
    id_list = [file_ids] if isinstance(file_ids, str) else file_ids

    attachments = [{"type": "input_file", "file_id": fid} for fid in id_list]
    prompt_part = {"type": "input_text", "text": USER_PROMPT}

    system_message = {"role": "system", "content": [{"type": "input_text", "text": SYSTEM_PROMPT}]}
    user_message = {"role": "user", "content": [*attachments, prompt_part]}

    return client.responses.create(
        model="gpt-5.1",
        input=[system_message, user_message],
        reasoning={"effort": "high"},
    )


def create_client_response_hybrid(file_ids: str|list, vector_store_id: str) -> Any:
    """Request a TPP draft with files attached AND file_search enabled.

    Each file id becomes an ``input_file`` part of the user message, followed
    by USER_PROMPT; the file_search tool is bound to ``vector_store_id``.
    No ``include`` option is passed, so file_search results are not requested
    in the response.

    Args:
        file_ids: One file id, or a list of file ids.
        vector_store_id: Id of the vector store to search.

    Returns:
        The response object returned by ``client.responses.create``.
    """
    id_list = [file_ids] if isinstance(file_ids, str) else file_ids

    attachments = [{"type": "input_file", "file_id": fid} for fid in id_list]
    prompt_part = {"type": "input_text", "text": USER_PROMPT}

    system_message = {"role": "system", "content": [{"type": "input_text", "text": SYSTEM_PROMPT}]}
    user_message = {"role": "user", "content": [*attachments, prompt_part]}
    file_search_tool = {
        "type": "file_search",
        "vector_store_ids": [vector_store_id],
    }

    return client.responses.create(
        model="gpt-5.1",
        input=[system_message, user_message],
        tools=[file_search_tool],
        reasoning={"effort": "high"},
    )

In [None]:
def check_file_status(vs_id: str) -> list:
    """List the files of a vector store whose status is 'failed'.

    Args:
        vs_id: Id of the vector store to inspect.

    Returns:
        List of ``(file_id, last_error)`` tuples, one per failed file.
    """
    failed = []
    # Iterate the list result itself rather than its `.data`: `.data` holds
    # only the first page, while iteration lets the SDK fetch later pages,
    # so failures are not missed in larger stores.
    for file in client.vector_stores.files.list(vector_store_id=vs_id):
        if file.status == 'failed':
            failed.append((file.id, file.last_error))
    return failed


def check_vector_store_status(vs_id: str, timeout: float = 900, poll_interval: float = 5) -> None:
    """Block until a vector store has finished indexing its files.

    Polls the store and returns once it is 'completed' with every file
    indexed. Files that failed with code 'server_error' are tolerated: when
    the store has nothing left in progress, the function reports them and
    returns instead of polling forever. Any other failure code aborts.

    Args:
        vs_id: Id of the vector store to wait for.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between polls.

    Raises:
        RuntimeError: If a file failed with a code other than 'server_error'.
        TimeoutError: If the store is not ready within ``timeout`` seconds.
    """
    deadline = time.monotonic() + timeout

    while True:
        vs = client.vector_stores.retrieve(vs_id)
        counts = vs.file_counts
        print(vs.id, vs.status, counts)

        if vs.status == "completed" and counts.total == counts.completed and counts.total != 0:
            return

        tolerated_failures = 0
        for file_id, err in check_file_status(vs_id):
            code = getattr(err, "code", None)
            print(f"FAILED: {file_id} - {err} (code={code})")

            if code != "server_error":
                raise RuntimeError(f"Vector store has failed files. file_id={file_id}, last_error={err}")
            tolerated_failures += 1

        # Processing is over and only tolerated failures remain: the
        # completed count can never reach the total, so stop waiting.
        if vs.status == "completed" and counts.in_progress == 0 and tolerated_failures:
            print(f"{vs.id}: indexing finished with {tolerated_failures} server_error file(s); continuing without them")
            return

        if time.monotonic() >= deadline:
            raise TimeoutError(f"Vector store {vs_id} not ready after {timeout} seconds (status={vs.status})")

        time.sleep(poll_interval)

In [None]:
# 'preclinical' leaves out reviews whose title starts with 'clinical';
# any other value keeps every indexed file.
process = 'clinical'

# utf-8 matches the other files written by this notebook and avoids
# platform-default encodings that may not be able to encode the model output.
with open(f'{target}.txt', 'a', encoding='utf-8') as file:
    # Walk DRUGS in its declared order so the first three picked are the
    # same on every run (a set intersection has no stable order).
    for drug in [d for d in DRUGS if d in existing][:3]:
        print(f'========== {drug} ==========')
        file_ids = []
        for file_name, file_id in existing[drug].items():
            # Index keys look like '<drug>_<title>.pdf'; keep the part after
            # the first underscore to test the review title.
            review_title = file_name.split('_', 1)[-1]
            if process == 'preclinical' and review_title.lower().startswith('clinical'):
                continue
            file_ids.append(file_id)

        if file_ids:
            # NOTE: the vector store is not deleted after the request.
            vs = client.vector_stores.create(
                name=drug,
                file_ids=file_ids
            )
            check_vector_store_status(vs.id)

            file.write(f'===== {drug} =====\n')
            try:
                resp = create_client_response_hybrid(file_ids=file_ids, vector_store_id=vs.id)
                file.write(resp.output_text)
            except Exception as e:
                # Keep going with the next drug, but make the failure
                # visible in the notebook as well as in the output file.
                print(f'{drug} - response generation failed: {e}')
                file.write(str(e))  # TODO: handle errors
            file.write('\n\n')
