How to use/adapt:

Change entities, filler_phrases, or sections to mimic different clinical scenarios.

Adjust n_dates, min_distance, or noise parameters for more/less complexity.

Loop as needed to generate as many records as you want.

In [3]:
# 1) Imports
import pandas as pd

# EDA stats (already implemented)
from eda_analysis import (
    get_doc_length_stats,
    get_entity_count_stats,
    get_entity_frequency,
    get_date_count_stats,
)

# Reusable filler phrases and curated entities
from constants import FILLER_TEXT, TOP_DIAGNOSIS_ENTITIES

# Stepwise generation utilities
from create_synthetic_data import (
    seed_all,
    # sampling
    sample_note_length, sample_count, sample_entities,
    # base note
    generate_base_note,
    # placement (boundary-aware recommended)
    insert_entities_at_boundaries, insert_dates_at_boundaries,
    # relationship logic
    link_relationships_by_proximity, update_relationship_positions,
    # distance/capacity controls
    ensure_min_distance_for_all_pairs, match_date_density,
    # finishing touches
    add_filler_text, inject_noise,
    # cleanup & correction
    deduplicate_entities, deduplicate_diagnoses, correct_entity_positions,
)

Real Data

In [4]:
seed_all(42)
df = pd.read_csv("../data/synthetic.csv")

In [None]:
# doc length
length_stats = get_doc_length_stats(df)
length_stats

{'mean': 1366.4752475247524,
 'std': 64.96007913471257,
 'min': 1261,
 'max': 1568,
 'median': 1359.0}

In [None]:
# entity counts
entity_stats = get_entity_count_stats(df)
entity_stats

{'mean': 6.178217821782178,
 'std': 0.5549061110487141,
 'min': 6,
 'max': 9,
 'median': 6.0}

In [13]:
# entity frequency
entity_freqs = get_entity_frequency(df)
len(entity_freqs), entity_freqs

(61,
 {'asthma': 19,
  'pituitary_adenoma': 15,
  'rheumatoid_arthritis': 9,
  'pneumonia': 11,
  'gerd': 11,
  'meningitis': 7,
  'multiple_sclerosis': 14,
  'bronchitis': 21,
  'tension_headache': 21,
  'congenital malformation': 33,
  'osteoarthritis': 16,
  'diabetes_mellitus': 11,
  'anemia': 12,
  'depression': 18,
  'copd': 21,
  'hyperlipidemia': 19,
  'ibs': 16,
  'schizophrenia': 16,
  'stroke': 7,
  'microadenoma': 12,
  'disorder of optic nerve': 10,
  'pituitary macroadenoma': 15,
  'cerebral hemorrhage': 18,
  'neoplasm of pituitary gland': 6,
  'bipolar_disorder': 11,
  'hypothyroidism': 17,
  'anxiety': 12,
  'pituitary adenoma': 5,
  'disorder of pituitary gland': 3,
  'ulcerative_colitis': 12,
  'leukemia': 10,
  'soft tissue lesion': 6,
  'macroadenoma': 11,
  'epilepsy': 15,
  'prolactinoma': 16,
  'cluster_headache': 12,
  'atrial fibrillation': 2,
  'gout': 15,
  'type 2 diabetes': 1,
  'adrenal_insufficiency': 11,
  'fibromyalgia': 8,
  'migraine': 12,
  'obesity

In [None]:
# date counts
date_stats = get_date_count_stats(df)
date_stats

{'mean': 1.99009900990099,
 'std': 0.09950371902099892,
 'min': 1,
 'max': 2,
 'median': 2.0}

In [9]:
TOP_DIAGNOSIS_ENTITIES

['congenital malformation',
 'cerebral hemorrhage',
 'pituitary macroadenoma',
 'disorder of optic nerve',
 'hydrocephalus',
 'disorder of pituitary gland',
 'pituitary adenoma',
 'cyst',
 'pneumocephalus',
 'neoplasm of pituitary gland',
 'disorder of liver',
 'disorder of pancreas',
 'intracranial meningioma',
 'soft tissue lesion',
 'internal carotid artery stenosis',
 'meningitis',
 'radiologic infiltrate of lung',
 'hypertension',
 'type 2 diabetes',
 'migraine',
 'asthma',
 'chronic kidney disease',
 'hypothyroidism',
 'hyperlipidemia',
 'osteoarthritis',
 'depression',
 'anxiety disorder',
 'gastroesophageal reflux disease',
 'atrial fibrillation',
 'chronic obstructive pulmonary disease',
 'obesity',
 'coronary artery disease',
 'heart failure',
 'osteoporosis',
 'anemia',
 'vitamin D deficiency',
 'sleep apnea']

In [12]:
# Optional: backfill curated entities to ensure variety even if missing in this sample
for e in TOP_DIAGNOSIS_ENTITIES:
    entity_freqs.setdefault(e, 1)

len(entity_freqs)

67

Synthetic Generation

In [15]:
# Sample targets
target_length = sample_note_length(length_stats)
n_entities    = sample_count(entity_stats)
n_dates       = sample_count(date_stats)

print(f"Targets -> length: {target_length}, entities: {n_entities}, dates: {n_dates}")

Targets -> length: 1465, entities: 6, dates: 1


In [17]:
# 6) Generate a base note (variety comes from FILLER_TEXT)
note = generate_base_note(target_length=target_length, filler_phrases=FILLER_TEXT)
note

"Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress noted on examination. Neurological exam within normal limits. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Labs from this morning were unremarkable. The plan was discussed with the patient, who understands and agrees. Skin: No rashes, lesions, or abnormal pigmentation. Good turgor and hydration. Labs from this morning were unremarkable. Respiratory: Clear to auscultation bilaterally. No wheezes, rales, or rhonchi. Respiratory: Clear to auscultatio

In [21]:
# Sample entities (weighted by real frequencies) and place non-overlapping at boundaries
entities = sample_entities(entity_freqs, n_entities)
entity_spans = insert_entities_at_boundaries(note, entities, min_distance=30)
print("Entities placed:", [e['label'] for e in entity_spans])
note

Entities placed: ['rheumatoid_arthritis', 'pituitary macroadenoma', 'disorder of optic nerve', 'intracranial meningioma', 'cerebral hemorrhage', 'hyperlipidemia']


"Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress noted on examination. Neurological exam within normal limits. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Labs from this morning were unremarkable. The plan was discussed with the patient, who understands and agrees. Skin: No rashes, lesions, or abnormal pigmentation. Good turgor and hydration. Labs from this morning were unremarkable. Respiratory: Clear to auscultation bilaterally. No wheezes, rales, or rhonchi. Respiratory: Clear to auscultatio

In [20]:
# Insert dates at boundaries (with simple format variety internally)
date_spans = insert_dates_at_boundaries(note, n_dates=n_dates, min_distance=40)
print("Dates placed:", [d['original'] for d in date_spans])
note

Dates placed: ['(01 Jan 2020)']


"Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress noted on examination. Neurological exam within normal limits. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Labs from this morning were unremarkable. The plan was discussed with the patient, who understands and agrees. Skin: No rashes, lesions, or abnormal pigmentation. Good turgor and hydration. Labs from this morning were unremarkable. Respiratory: Clear to auscultation bilaterally. No wheezes, rales, or rhonchi. Respiratory: Clear to auscultatio

In [22]:
# Create positive relationships by proximity (more realistic than round-robin)
relationships = link_relationships_by_proximity(entity_spans, date_spans, max_distance=500)
relationships

[{'date': '2020-01-01',
  'date_position': 550,
  'diagnoses': [{'diagnosis': 'pituitary macroadenoma', 'position': 987}]},
 {'date': '2020-01-01',
  'date_position': 550,
  'diagnoses': [{'diagnosis': 'disorder of optic nerve', 'position': 669}]},
 {'date': '2020-01-01',
  'date_position': 550,
  'diagnoses': [{'diagnosis': 'intracranial meningioma', 'position': 484}]},
 {'date': '2020-01-01',
  'date_position': 550,
  'diagnoses': [{'diagnosis': 'cerebral hemorrhage', 'position': 282}]}]

In [24]:
# 10) Enforce a minimum distance for all positive pairs (inserts filler between close pairs)
#     This may shift positions, so we will refresh relationship positions after
note, entity_spans, date_spans = ensure_min_distance_for_all_pairs(
    note=note,
    entities=entity_spans,
    dates=date_spans,
    relationships=relationships,
    min_distance=180,
    filler_phrases=FILLER_TEXT
)

note

"Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress not Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. ed on examination. Neurological exam within Lymphatic: No cervical, axillary, or inguinal lymphadenopathy.  Urinalysis negative for blood, protein, and leukocyte esterase.  normal limits. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Labs from this morning were unremarkable. The plan was discussed with the patient, who understands and agrees. Skin: No rashes, lesions, or abnormal pigmentati

In [25]:
# Refresh relationship positions after span shifts
relationships = update_relationship_positions(relationships, entity_spans, date_spans)
relationships

[{'date': '2020-01-01',
  'date_position': 735,
  'diagnoses': [{'diagnosis': 'pituitary macroadenoma', 'position': 1172}]},
 {'date': '2020-01-01',
  'date_position': 735,
  'diagnoses': [{'diagnosis': 'disorder of optic nerve', 'position': 854}]},
 {'date': '2020-01-01',
  'date_position': 735,
  'diagnoses': [{'diagnosis': 'intracranial meningioma', 'position': 484}]},
 {'date': '2020-01-01',
  'date_position': 735,
  'diagnoses': [{'diagnosis': 'cerebral hemorrhage', 'position': 282}]}]

In [26]:
# 11) Match date density if needed (e.g., reduce to target average)
date_spans, relationships = match_date_density(date_spans, relationships, target_avg=date_stats['mean'])
date_spans

[{'original': '(01 Jan 2020)',
  'parsed': '2020-01-01',
  'start': 735,
  'end': 748}]

In [27]:
# 12) Top up to target length if enforcing distances shortened the note
note = add_filler_text(note, target_length=target_length, filler_phrases=FILLER_TEXT)
note

"Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress not Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. ed on examination. Neurological exam within Lymphatic: No cervical, axillary, or inguinal lymphadenopathy.  Urinalysis negative for blood, protein, and leukocyte esterase.  normal limits. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Labs from this morning were unremarkable. The plan was discussed with the patient, who understands and agrees. Skin: No rashes, lesions, or abnormal pigmentati

In [28]:
# 13) Optional: inject mild noise (typos) for realism
note_noisy = inject_noise(note, typo_prob=0.002, ambiguous_prob=0.0)
note_noisy

"Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress not Complete blood couwt shows WBC 7.2, Hgb 13.5, mlt 250. ed on examination. Neurological exam within Lymphatic: No cervical, axillary, or inguinal lymphadenopathy.  Urinalysis negative for blood, protein, and leukocyte esterase.  normal limits. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Labs from this morning were unremarkable. The plan was discussed with the patient, who understands and agrees. Skin: No rashes, lesions, or abnormal pigmentati

In [29]:
# 14) Cleanup: deduplicate and correct positions against the final note text
entity_spans = deduplicate_entities(entity_spans)
for r in relationships:
    r['diagnoses'] = deduplicate_diagnoses(r.get('diagnoses', [])

SyntaxError: unexpected EOF while parsing (3612558129.py, line 4)

In [30]:
# Re-scan positions of entities in the final text (defensive correction)
entity_spans = correct_entity_positions(note_noisy, entity_spans)
relationships = update_relationship_positions(relationships, entity_spans, date_spans)

In [31]:
# 15) Inspect the final assembled record
synthetic_record = {
    'note': note_noisy,
    'entities': entity_spans,
    'dates': date_spans,
    'relationships': relationships
}


In [32]:
# Quick inspection helpers
print("Note preview:")
print(synthetic_record['note'][:800], "...\n")
print("Entities:", synthetic_record['entities'][:10])
print("Dates:", synthetic_record['dates'])
print("Relationships:", synthetic_record['relationships'][:10])

Note preview:
Complete blood count shows WBC 7.2, Hgb 13.5, Plt 250. Patient instructed to follow up with specialist for further evaluation. Patient reports compliance with current medication regimen. The patient's vital signs are stable. Lungs clear to auscultation bilaterally. GI: Bowel sounds present in all four quadrants. No hepatosplenomegaly. Cardiovascular: Regular rate and rhythm. No murmurs, rubs, or gallops. Normal S1 and S2. Patient reports compliance with current medication regimen. No acute distress not Complete blood couwt shows WBC 7.2, Hgb 13.5, mlt 250. ed on examination. Neurological exam within Lymphatic: No cervical, axillary, or inguinal lymphadenopathy.  Urinalysis negative for blood, protein, and leukocyte esterase.  normal limits. Cardiovascular: Regular rate and rhythm. No murmu ...

Entities: []
Dates: [{'original': '(01 Jan 2020)', 'parsed': '2020-01-01', 'start': 735, 'end': 748}]
Relationships: [{'date': '2020-01-01', 'date_position': 735, 'diagnoses': [{'d

In [None]:
# 9. (Optional) Repeat in a loop to create a dataset
dataset = []
for i in range(5):
    note = generate_note_with_template(sections)
    note = add_filler_text(note, target_length=500, filler_phrases=filler_phrases)
    note = inject_noise(note, typo_prob=0.01, ambiguous_prob=0.01)
    entity_spans = insert_entities(note, entities, min_distance=50)
    date_spans = insert_dates(note, n_dates=n_dates)
    entity_spans = deduplicate_entities(entity_spans)
    relationships = generate_relationships(entity_spans, date_spans, mode="diagnosis_only")
    dataset.append({
        "note": note,
        "entities": entity_spans,
        "dates": date_spans,
        "relationships": relationships,
    })

dataset

In [None]:
# 10. Convert to DataFrame for inspection (optional)
import pandas as pd
df = pd.DataFrame(dataset)
df.head()