In [1]:
import sys
import os
import json


# Load the autoreload extension
%load_ext autoreload

# Set autoreload mode to reload all modules before executing each cell
%autoreload 2

# Get the absolute path to the parent directory
parent_path = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add the parent directory to sys.path
if parent_path not in sys.path:
    sys.path.append(parent_path)

from src.utils import clean_references

from src.models import Article

In [2]:
import pandas as pd

df = pd.read_csv("/Users/vince/Salk/PaperGeneration/data/condition_revised.csv").fillna("")
df

Unnamed: 0,Condition,Alternative Name,Category
0,Food Allergy,,Allergies
1,Allergic Rhinitis,Hay Fever,Allergies
2,Drug Allergies,,Allergies
3,Atopic Dermatitis,Eczema,Allergies
4,Contact Dermatitis,,Allergies
...,...,...,...
288,Interstitial Cystitis,,Urinary Health
289,Urinary Tract Infection,UTI,Urinary Health
290,Nephrolithiasis,Kidney Stones,Urinary Health
291,Urinary Incontinence,,Urinary Health


In [10]:
import os
import re
from datetime import datetime
import logging 
import json
import shutil
import string

def sanitize_filename(filename):
  valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
  return ''.join(c for c in filename if c in valid_chars)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

output_dir = "/Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp"
filtered_dir = "/Users/vince/Salk/PaperGeneration/data/final_json"
os.makedirs(filtered_dir, exist_ok=True)

command_dir = "/home/vrothenberg_salk_edu/wagtail/data/filtered_json"

# This will store the commands
commands = []

for i, row in df.iterrows():

    condition_name = row['Condition']
    alternative_name = row['Alternative Name']
    category = row['Category']
    topic = condition_name
    if alternative_name:
        topic = f"{topic} ({alternative_name})"

    logger.info(f"[{i}] {condition_name} - {category}")

    sanitized = re.sub(r'[^\w\s]', '', topic).replace(' ', '_')

    # Create the directory path
    dir_path = os.path.join(output_dir, sanitized)
    # Check if the directory exists
    if not os.path.exists(dir_path):
        # If it doesn't exist, skip 
        logger.info(f"Skipping. Does not exist: {dir_path}")
        continue

    # If directory exists, find most recent JSON files
    final_jsons = [f for f in os.listdir(dir_path) if "_final_" in f and f.endswith(".json")]

    # Find the latest final JSON
    latest_final_json = None
    latest_final_time = None

    for f in final_jsons:
        try:
            dt = datetime.strptime(f.split('_')[-2] + ' ' + f.split('_')[-1].split('.json')[0], '%Y%m%d %H%M%S')
            if latest_final_time is None or dt > latest_final_time:
                latest_final_time = dt
                latest_final_json = f
        except ValueError as e:
            logger.error(f"Error parsing filename {f}: {e}")
            continue

    # Check if latest_final_json is valid, if so store in filtered_dir
    if latest_final_json:
        try:
            source_path = os.path.join(dir_path, latest_final_json)
            
            # Validate JSON by loading it
            with open(source_path, 'r') as f:
                json_data = json.load(f)  # This will raise an exception if the JSON is invalid

            # Sanitize the filename
            safe_filename = sanitize_filename(latest_final_json)
            destination_path = os.path.join(filtered_dir, safe_filename) 
            shutil.copy(source_path, destination_path)

            command_path = os.path.join(command_dir, safe_filename) 

            # Add the command for this file
            command = f"python manage.py import_articles --category-name \"{category}\" --json-path \"{command_path}\""
            commands.append(command)

        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON format in {latest_final_json}: {e}")
        except Exception as e:
            logger.error(f"Error on {latest_final_json}")
            logger.error(e)
            continue

    else:
        print(f"No valid JSON found for {topic}")
        continue

# Print or save the commands
for command in commands:
    print(command)

# You can save the commands to a file if you want to execute them later
with open('import_commands.sh', 'w') as f:
    for command in commands:
        f.write(command + '\n')

2025-01-21 11:30:29,110 - INFO - [0] Food Allergy - Allergies


2025-01-21 11:30:29,112 - INFO - [1] Allergic Rhinitis - Allergies
2025-01-21 11:30:29,114 - INFO - [2] Drug Allergies - Allergies
2025-01-21 11:30:29,115 - INFO - [3] Atopic Dermatitis - Allergies
2025-01-21 11:30:29,116 - INFO - [4] Contact Dermatitis - Allergies
2025-01-21 11:30:29,117 - INFO - [5] Ankylosing Spondylitis - Autoimmune & Inflammatory
2025-01-21 11:30:29,118 - INFO - [6] Gout - Autoimmune & Inflammatory
2025-01-21 11:30:29,120 - INFO - [7] Guillain-Barre Syndrome - Autoimmune & Inflammatory
2025-01-21 11:30:29,121 - INFO - [8] Multiple Sclerosis - Autoimmune & Inflammatory
2025-01-21 11:30:29,121 - INFO - [9] Osteoarthritis - Autoimmune & Inflammatory
2025-01-21 11:30:29,122 - INFO - [10] Rheumatoid Arthritis - Autoimmune & Inflammatory
2025-01-21 11:30:29,124 - INFO - [11] Sarcoidosis - Autoimmune & Inflammatory
2025-01-21 11:30:29,125 - INFO - [12] Scleroderma - Autoimmune & Inflammatory
2025-01-21 11:30:29,126 - INFO - [13] Systemic Lupus Erythematosus - Autoimmune 

No valid JSON found for Cholecystitis
No valid JSON found for Diverticulitis
No valid JSON found for Peptic Ulcer Disease (PUD)
No valid JSON found for Hypothyroidism
No valid JSON found for Cataracts
No valid JSON found for Corneal Abrasion
No valid JSON found for Post-traumatic Stress Disorder (PTSD)


2025-01-21 11:30:29,373 - INFO - [198] Sciatica - Neurological
2025-01-21 11:30:29,375 - INFO - [199] Stroke - Neurological
2025-01-21 11:30:29,375 - INFO - [200] Tension Headache - Neurological
2025-01-21 11:30:29,376 - INFO - [201] Trigeminal Neuralgia - Neurological
2025-01-21 11:30:29,377 - INFO - [202] Vertigo - Neurological
2025-01-21 11:30:29,378 - INFO - [203] Visual Migraine - Neurological
2025-01-21 11:30:29,379 - INFO - [204] Halitosis - Oral Health
2025-01-21 11:30:29,380 - INFO - [205] Cavities - Oral Health
2025-01-21 11:30:29,382 - INFO - [206] Dental Abscess - Oral Health
2025-01-21 11:30:29,383 - INFO - [207] Dry Mouth - Oral Health
2025-01-21 11:30:29,384 - INFO - [208] Gingivitis - Oral Health
2025-01-21 11:30:29,385 - INFO - [209] Mouth Ulcers - Oral Health
2025-01-21 11:30:29,387 - INFO - [210] Orthodontic Relapse - Oral Health
2025-01-21 11:30:29,388 - INFO - [211] Periodontitis - Oral Health
2025-01-21 11:30:29,389 - INFO - [212] Dentin Hypersensitivity - Oral He

python manage.py import_articles --category-name "Allergies" --json-path "/home/vrothenberg_salk_edu/wagtail/data/filtered_json/Food_Allergy_final_20250117_171302.json"
python manage.py import_articles --category-name "Allergies" --json-path "/home/vrothenberg_salk_edu/wagtail/data/filtered_json/Allergic_Rhinitis_Hay_Fever_final_20250117_171123.json"
python manage.py import_articles --category-name "Allergies" --json-path "/home/vrothenberg_salk_edu/wagtail/data/filtered_json/Drug_Allergies_final_20250117_180851.json"
python manage.py import_articles --category-name "Allergies" --json-path "/home/vrothenberg_salk_edu/wagtail/data/filtered_json/Atopic_Dermatitis_Eczema_final_20250117_171306.json"
python manage.py import_articles --category-name "Allergies" --json-path "/home/vrothenberg_salk_edu/wagtail/data/filtered_json/Contact_Dermatitis_final_20250117_145902.json"
python manage.py import_articles --category-name "Autoimmune & Inflammatory" --json-path "/home/vrothenberg_salk_edu/wag

In [None]:
import os
import re
from datetime import datetime
import logging 
import json
import shutil
import string

def sanitize_filename(filename):
  valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
  return ''.join(c for c in filename if c in valid_chars)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

output_dir = "/Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp"
filtered_dir = "/Users/vince/Salk/PaperGeneration/data/final_json"
os.makedirs(filtered_dir, exist_ok=True)

command_dir = "/home/vrothenberg_salk_edu/wagtail/data/filtered_json"

# This will store the commands
commands = []

for i, row in df.iterrows():

    condition_name = row['Condition']
    alternative_name = row['Alternative Name']
    category = row['Category']
    topic = condition_name
    if alternative_name:
        topic = f"{topic} ({alternative_name})"

    logger.info(f"[{i}] {condition_name} - {category}")

    sanitized = re.sub(r'[^\w\s]', '', topic).replace(' ', '_')

    # Create the directory path
    dir_path = os.path.join(output_dir, sanitized)
    # Check if the directory exists
    if not os.path.exists(dir_path):
        # If it doesn't exist, skip 
        logger.info(f"Skipping. Does not exist: {dir_path}")
        continue

    # If directory exists, find most recent JSON files
    final_jsons = [f for f in os.listdir(dir_path) if "_final_" in f and f.endswith(".json")]

    # Find the latest final JSON
    latest_final_json = None
    latest_final_time = None

    for f in final_jsons:
        try:
            dt = datetime.strptime(f.split('_')[-2] + ' ' + f.split('_')[-1].split('.json')[0], '%Y%m%d %H%M%S')
            if latest_final_time is None or dt > latest_final_time:
                latest_final_time = dt
                latest_final_json = f
        except ValueError as e:
            logger.error(f"Error parsing filename {f}: {e}")
            continue

    # Check if latest_final_json is valid, if so store in filtered_dir
    if latest_final_json:
        try:
            source_path = os.path.join(dir_path, latest_final_json)
            
            # Validate JSON by loading it
            with open(source_path, 'r') as f:
                json_data = json.load(f)  # This will raise an exception if the JSON is invalid
            print("json_data", json_data)
            cleaned_json_data = clean_references(json_data)
            print("cleaned_json", cleaned_json_data)
            
            latest_final_json = latest_final_json.replace("_sourced_remapped_", "_cleaned_") 
            
            # Sanitize the filename
            safe_filename = sanitize_filename(latest_final_json)
            destination_path = os.path.join(filtered_dir, safe_filename) 

            with open(destination_path, 'w') as f:
                json.dump(cleaned_json_data, f, indent=2)

            command_path = os.path.join(command_dir, safe_filename) 

            # Add the command for this file
            command = f"python manage.py import_articles --category-name \"{category}\" --json-path \"{command_path}\""
            commands.append(command)

        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON format in {latest_final_json}: {e}")
        except Exception as e:
            logger.error(f"Error on {latest_final_json}")
            logger.error(e)
            continue

    else:
        print(f"No valid JSON found for {topic}")
        continue

# Print or save the commands
for command in commands:
    print(command)

# You can save the commands to a file if you want to execute them later
with open('import_commands.sh', 'w') as f:
    for command in commands:
        f.write(command + '\n')

2025-01-17 11:22:46,692 - INFO - [0] Food Allergy - Allergies


2025-01-17 11:22:46,695 - INFO - [1] Allergic Rhinitis - Allergies
2025-01-17 11:22:46,698 - INFO - [2] Drug Allergies - Allergies
2025-01-17 11:22:46,702 - INFO - [3] Atopic Dermatitis - Allergies
2025-01-17 11:22:46,705 - INFO - [4] Contact Dermatitis - Allergies
2025-01-17 11:22:46,707 - INFO - [5] Ankylosing Spondylitis - Autoimmune & Inflammatory
2025-01-17 11:22:46,710 - INFO - [6] Gout - Autoimmune & Inflammatory
2025-01-17 11:22:46,713 - INFO - [7] Guillain-Barre Syndrome - Autoimmune & Inflammatory
2025-01-17 11:22:46,715 - INFO - [8] Multiple Sclerosis - Autoimmune & Inflammatory
2025-01-17 11:22:46,718 - INFO - [9] Osteoarthritis - Autoimmune & Inflammatory
2025-01-17 11:22:46,721 - INFO - [10] Rheumatoid Arthritis - Autoimmune & Inflammatory
2025-01-17 11:22:46,723 - INFO - [11] Sarcoidosis - Autoimmune & Inflammatory
2025-01-17 11:22:46,727 - INFO - [12] Scleroderma - Autoimmune & Inflammatory
2025-01-17 11:22:46,731 - INFO - [13] Systemic Lupus Erythematosus - Autoimmune 

json_data {'title': 'Food Allergy', 'subtitle': 'An adverse immune reaction to specific food proteins.', 'overview': {'heading': 'Overview', 'content': 'Food allergy is an abnormal immune response triggered by specific food proteins. Unlike food intolerance, which involves the digestive system, food allergies involve the immune system and can be potentially life-threatening. It is estimated that up to 8% of children [10,11,18] and up to 10.8% of adults [4,19] have a food allergy. Reactions can range from mild to severe, with anaphylaxis being the most severe and potentially fatal.'}, 'key_facts': {'heading': 'Key Facts', 'content': ['Food allergies are caused by an overreaction of the immune system to certain food proteins.', "The 'Big 8' allergens account for about 90% of all food allergies: milk, eggs, peanuts, tree nuts, soy, wheat, fish, and shellfish [20].", 'Symptoms can appear within minutes to hours after ingestion of the offending food.', 'Anaphylaxis is a severe, life-threate

2025-01-17 11:22:46,914 - INFO - [84] Type 2 Diabetes - Endocrine & Metabolic
2025-01-17 11:22:46,917 - INFO - [85] Age-related Macular Degeneration - Eye Health
2025-01-17 11:22:46,920 - INFO - [86] Amblyopia - Eye Health
2025-01-17 11:22:46,923 - INFO - [87] Astigmatism - Eye Health
2025-01-17 11:22:46,925 - INFO - [88] Blepharitis - Eye Health
2025-01-17 11:22:46,928 - INFO - [89] Cataracts - Eye Health
2025-01-17 11:22:46,928 - INFO - [90] Conjunctivitis - Eye Health
2025-01-17 11:22:46,930 - INFO - [91] Corneal Abrasion - Eye Health
2025-01-17 11:22:46,931 - INFO - [92] Diabetic Retinopathy - Eye Health
2025-01-17 11:22:46,933 - INFO - [93] Dry Eye Syndrome - Eye Health
2025-01-17 11:22:46,936 - INFO - [94] Eye Floaters - Eye Health
2025-01-17 11:22:46,938 - INFO - [95] Glaucoma - Eye Health
2025-01-17 11:22:46,940 - INFO - [96] Hyperopia - Eye Health
2025-01-17 11:22:46,942 - INFO - [97] Keratitis - Eye Health
2025-01-17 11:22:46,944 - INFO - [98] Keratoconus - Eye Health
2025-01

json_data {'title': 'Type 1 Diabetes', 'subtitle': 'An autoimmune condition characterized by the destruction of insulin-producing cells in the pancreas.', 'overview': {'heading': 'Overview', 'content': "Type 1 diabetes is a chronic autoimmune disease where the body's immune system mistakenly attacks and destroys the insulin-producing beta cells in the pancreas. This results in an inability to produce insulin, a hormone necessary for regulating blood sugar levels. Unlike type 2 diabetes, type 1 diabetes is not caused by lifestyle factors and usually develops in childhood or adolescence, though it can occur at any age. The condition requires lifelong insulin therapy to manage blood glucose levels and prevent serious complications. Approximately 5-10% of all diabetes cases are type 1 diabetes. The process leading to type 1A diabetes involves genetic susceptibility, is likely triggered by environmental factors [7], and progresses over time before the onset of symptoms [1]. This latent peri

2025-01-17 11:22:47,131 - INFO - [207] Dry Mouth - Oral Health
2025-01-17 11:22:47,135 - INFO - Skipping. Does not exist: /Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp/Dry_Mouth_Xerostomia
2025-01-17 11:22:47,136 - INFO - [208] Gingivitis - Oral Health
2025-01-17 11:22:47,137 - INFO - Skipping. Does not exist: /Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp/Gingivitis
2025-01-17 11:22:47,137 - INFO - [209] Mouth Ulcers - Oral Health
2025-01-17 11:22:47,137 - INFO - Skipping. Does not exist: /Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp/Mouth_Ulcers_Canker_Sores
2025-01-17 11:22:47,137 - INFO - [210] Orthodontic Relapse - Oral Health
2025-01-17 11:22:47,137 - INFO - Skipping. Does not exist: /Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp/Orthodontic_Relapse
2025-01-17 11:22:47,138 - INFO - [211] Periodontitis - Oral Health
2025-01-17 11:22:47,138 - INFO - Skipping. Does not exist: /Users/vince/Salk/PaperGen

json_data {'title': 'Contact Dermatitis', 'subtitle': 'An inflammatory skin condition caused by direct contact with an allergen or irritant.', 'overview': {'heading': 'Overview', 'content': "Contact dermatitis is a common inflammatory skin condition that occurs when the skin comes into contact with a substance that causes irritation or an allergic reaction. This condition is characterized by redness, itching, and sometimes blistering, and it can affect people of all ages. Contact dermatitis is not contagious, but it can significantly impact the quality of life for those who experience it [7]. The severity can range from mild discomfort to severe pain. It's estimated that a significant percentage of the population will experience some form of contact dermatitis in their lifetime."}, 'key_facts': {'heading': 'Key Facts', 'content': ['Contact dermatitis is not contagious.', 'It is caused by direct contact with an irritant or allergen.', 'Symptoms can range from mild redness to severe blis

In [7]:
import os
import re
from datetime import datetime
import logging 
import json
import shutil
import string
import pandas as pd

def sanitize_filename(filename):
  valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits)
  return ''.join(c for c in filename if c in valid_chars)

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

output_dir = "/Users/vince/Salk/PaperGeneration/data/output/gemini-2.0-flash-exp"

filtered_rows = []

for i, row in df.iterrows():

    condition_name = row['Condition']
    alternative_name = row['Alternative Name']
    category = row['Category']
    topic = condition_name
    if alternative_name:
        topic = f"{topic} ({alternative_name})"

    # logger.info(f"[{i}] {condition_name} - {category}")

    sanitized = re.sub(r'[^\w\s]', '', topic).replace(' ', '_')

    # Create the directory path
    dir_path = os.path.join(output_dir, sanitized)
    # Check if the directory exists
    if not os.path.exists(dir_path):
        logger.info(f"Does not exist: {dir_path}")
        filtered_rows.append(row)
        continue

    # If directory exists, find most recent JSON files
    final_jsons = [f for f in os.listdir(dir_path) if "_final_" in f and f.endswith(".json")]

    # Find the latest final JSON
    latest_final_json = None
    latest_final_time = None

    for f in final_jsons:
        try:
            dt = datetime.strptime(f.split('_')[-2] + ' ' + f.split('_')[-1].split('.json')[0], '%Y%m%d %H%M%S')
            if latest_final_time is None or dt > latest_final_time:
                latest_final_time = dt
                latest_final_json = f
        except ValueError as e:
            # logger.error(f"Error parsing filename {f}: {e}")
            continue

    # Check if latest_final_json is valid, if so store in filtered_dir
    if latest_final_json:
        try:
            source_path = os.path.join(dir_path, latest_final_json)
            
            # Validate JSON by loading it
            with open(source_path, 'r') as f:
                json_data = json.load(f)  # This will raise an exception if the JSON is invalid
                print(len(json_data))

        except Exception as e:
            # Add to filtered dataframe
            logger.error(f"Invalid JSON found for {topic}: {e}")
            filtered_rows.append(row)

    else:
        print(f"No valid JSON found for {topic}")
        # Add to filtered dataframe
        filtered_rows.append(row)

# Create a new DataFrame from the filtered rows
filtered_df = pd.DataFrame(filtered_rows)

# Now you can work with filtered_df, which contains the rows without valid final JSONs
display(filtered_df) 

# Example: Save the filtered DataFrame to a CSV file
# filtered_df.to_csv("filtered_rows.csv", index=False)

18
18
No valid JSON found for Drug Allergies
18
18
18
18
18
18
18
18
18
No valid JSON found for Scleroderma
18
18
18
18
18
18
18
18
18
18
18
18
18
No valid JSON found for Osteoporosis
No valid JSON found for Osteomalacia (Soft Bone Disease)
No valid JSON found for Osteogenesis Imperfecta (Brittle Bone Disease)
No valid JSON found for Paget's Disease of Bone (Osteitis Deformans)
No valid JSON found for Bone Fractures
No valid JSON found for Basal Cell Carcinoma
No valid JSON found for Breast Cancer
No valid JSON found for Cervical Cancer
No valid JSON found for Colorectal Cancer
No valid JSON found for Liver Cancer
No valid JSON found for Lung Cancer
No valid JSON found for Melanoma
No valid JSON found for Ovarian Cancer
No valid JSON found for Prostate Cancer
No valid JSON found for Squamous Cell Carcinoma
No valid JSON found for Leukemia (Blood Cancer)
No valid JSON found for Cardiac Arrhythmia
No valid JSON found for Coronary Artery Disease
No valid JSON found for Heart Failure
No va

Unnamed: 0,Condition,Alternative Name,Category
2,Drug Allergies,,Allergies
12,Scleroderma,,Autoimmune & Inflammatory
26,Osteoporosis,,Bone Health
27,Osteomalacia,Soft Bone Disease,Bone Health
28,Osteogenesis Imperfecta,Brittle Bone Disease,Bone Health
...,...,...,...
91,Corneal Abrasion,,Eye Health
94,Eye Floaters,,Eye Health
140,Post-traumatic Stress Disorder,PTSD,Mental Health
179,Shoulder Dislocation,,Musculoskeletal


In [8]:
filtered_df.to_csv("missing_conditions.csv")