In [13]:
import pandas as pd
import json
import os
import re
from exa_py import Exa

# Exa client; the API key is taken from the EXA_API_KEY environment variable
exa = Exa(os.environ.get("EXA_API_KEY"))

# Load the source spreadsheet
df = pd.read_excel('DOschools.xlsx')

# A row describes a school when 'Key:' is populated and carries no asterisk
# (asterisk rows are explanation/footnote rows)
key_present = df['Key:'].notna()
key_is_note = df['Key:'].str.contains('\\*', na=False)
schools_df = df.loc[key_present & ~key_is_note].copy()

print(f"Found {len(schools_df)} DO schools to process")

Found 69 DO schools to process


In [14]:
def get_school_location(school_name):
    """Search Exa for pages that describe where a DO school is located.

    No city/state is parsed here: the function hands back the raw search
    text together with a state-name lookup table so the caller can do the
    extraction.

    Args:
        school_name: School name; parentheses are stripped before searching.

    Returns:
        tuple: ``(all_text, state_map)`` where ``all_text`` is the text of the
        search results joined by spaces and ``state_map`` maps full state
        names to two-letter postal abbreviations. ``(None, None)`` when the
        search returns no results or raises.
    """
    try:
        # Clean up school name for better search
        clean_name = school_name.replace('(', '').replace(')', '').strip()

        # Search for the school location
        result = exa.search_and_contents(
            f"{clean_name} osteopathic medical school location city state address",
            num_results=3,
            text={"max_characters": 500}
        )

        if result.results:
            # Combine all text results
            all_text = " ".join([r.text for r in result.results if r.text])

            # Full state-name -> postal abbreviation table. The previous table
            # skipped several states (e.g. Washington and Mississippi), so
            # schools located there could never be resolved through it.
            state_map = {
                'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
                'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
                'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
                'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN', 'Iowa': 'IA',
                'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA', 'Maine': 'ME',
                'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
                'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT', 'Nebraska': 'NE',
                'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ', 'New Mexico': 'NM',
                'New York': 'NY', 'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
                'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA', 'Rhode Island': 'RI',
                'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX',
                'Utah': 'UT', 'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
                'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
            }

            return all_text, state_map
        return None, None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller carry on
        print(f"Error searching for {school_name}: {e}")
        return None, None

# Smoke-test the search with a single school and show the start of the text
test_result, _ = get_school_location("Alabama College of Osteopathic Medicine")
if test_result:
    preview = test_result[:300]
else:
    preview = "No result"
print("Test search result (truncated):", preview)

Test search result (truncated): [Skip to content] 

Alabama College of Osteopathic Medicine[acomadmin] 2026-01-07T10:04:20-06:00

INSPIRING MEDICINE
THROUGH HUMAN TOUCH
®
At the Alabama College of Osteopathic Medicine we don't just train doctors - we shape leaders who are committed to making a lasting impact in healthcare.
[How to


In [None]:
# Pre-built location and class size mapping for DO schools.
# Class sizes sourced from school websites and AACOM data.
#
# get_school_data() checks this dict first with an exact `in` lookup on the
# school name taken from the spreadsheet's 'Key:' column, so the keys have to
# match those cell values character for character (punctuation, dashes and
# spacing included); anything else falls through to substring matching.
# "class_size" is None for the entries labelled as opening/proposed for 2026.
DO_SCHOOL_DATA = {
    # Format: "School Name": {"city": "City", "state": "ST", "class_size": N}
    "Alabama College of Osteopathic Medicine (ACOM)": {"city": "Dothan", "state": "AL", "class_size": 200},
    "Arkansas College of Osteopathic Medicine (ARCOM)": {"city": "Fort Smith", "state": "AR", "class_size": 150},
    "A.T. Still University Kirksville College of Osteopathic Medicine (ATSU-KCOM)": {"city": "Kirksville", "state": "MO", "class_size": 175},
    "A.T. Still University, School of Osteopathic Medicine in Arizona (ATSU-SOMA)": {"city": "Mesa", "state": "AZ", "class_size": 150},
    "Burrell College of Osteopathic Medicine (BCOM)": {"city": "Las Cruces", "state": "NM", "class_size": 162},
    "Burrell College of Osteopathic Medicine (BCOM-FL)": {"city": "Melbourne", "state": "FL", "class_size": 150},
    "Baptist Health Sciences University College of Osteopathic Medicine (BUCOM)": {"city": "Memphis", "state": "TN", "class_size": 100},
    "California Health Sciences University College of Osteopathic Medicine (CHSU-COM)": {"city": "Clovis", "state": "CA", "class_size": 120},
    "Campbell University School of Osteopathic Medicine (CUSOM)": {"city": "Lillington", "state": "NC", "class_size": 165},
    "Des Moines University College of Osteopathic Medicine (DMU-COM)": {"city": "Des Moines", "state": "IA", "class_size": 240},
    "Duquesne University College of Osteopathic Medicine": {"city": "Pittsburgh", "state": "PA", "class_size": 80},
    "Edward Via College of Osteopathic Medicine (VCOM - Auburn Campus)": {"city": "Auburn", "state": "AL", "class_size": 195},
    "Edward Via College of Osteopathic Medicine-Carolinas Campus (VCOM - Carolinas Campus)": {"city": "Spartanburg", "state": "SC", "class_size": 185},
    "Edward Via College of Osteopathic Medicine-Louisiana (VCOM-Louisiana)": {"city": "Monroe", "state": "LA", "class_size": 156},
    "Edward Via College of Osteopathic Medicine (VCOM-Virginia Campus)": {"city": "Blacksburg", "state": "VA", "class_size": 195},
    "Idaho College of Osteopathic Medicine (ICOM)": {"city": "Meridian", "state": "ID", "class_size": 200},
    "Illinois College of Osteopathic Medicine (Fall 2026)": {"city": "Chicago", "state": "IL", "class_size": None},
    "Kansas Health Sciences Center Kansas College of Osteopathic Medicine (KansasCOM)": {"city": "Wichita", "state": "KS", "class_size": 200},
    "Kansas City University – Kansas City (KCU-COM-KC)": {"city": "Kansas City", "state": "MO", "class_size": 270},
    "Kansas City University – Joplin (KCU-COM-Joplin)": {"city": "Joplin", "state": "MO", "class_size": 175},
    "Kentucky College of Osteopathic Medicine (UP-KYCOM)": {"city": "Pikeville", "state": "KY", "class_size": 140},
    "Lake Erie College of Osteopathic Medicine-Erie (LECOM)": {"city": "Erie", "state": "PA", "class_size": 300},
    "Lake Erie College of Osteopathic Medicine-Bradenton (LECOM-Bradenton)": {"city": "Bradenton", "state": "FL", "class_size": 300},
    "Lake Erie College of Osteopathic Medicine - Elmira (LECOM-Elmira)": {"city": "Elmira", "state": "NY", "class_size": 115},
    "Lake Erie College of Osteopathic Medicine - Seton Hill (LECOM-Seton Hill)": {"city": "Greensburg", "state": "PA", "class_size": 125},
    "Liberty University College of Osteopathic Medicine (LUCOM)": {"city": "Lynchburg", "state": "VA", "class_size": 175},
    "Lincoln Memorial University DeBusk College of Osteopathic Medicine (LMU-DCOM)": {"city": "Harrogate", "state": "TN", "class_size": 240},
    "Lincoln Memorial University DeBusk College of Osteopathic Medicine - Knoxville (LMU-DCOM Knoxville)": {"city": "Knoxville", "state": "TN", "class_size": 150},
    "Marian University College of Osteopathic Medicine (MU-COM)": {"city": "Indianapolis", "state": "IN", "class_size": 175},
    # NOTE(review): this key ends with a trailing space - presumably copied
    # verbatim from the spreadsheet cell; confirm before "tidying" it, since
    # the exact-match lookup depends on it.
    "Maryland College of Osteopathic Medicine (MDCOM) HBCU (Proposed for 2026 start) ": {"city": "Baltimore", "state": "MD", "class_size": None},
    "Meritus School of Osteopathic Medicine Fall 2025": {"city": "Hagerstown", "state": "MD", "class_size": 80},
    "Michigan State University College of Osteopathic Medicine (MSUCOM)": {"city": "East Lansing", "state": "MI", "class_size": 300},
    "Michigan State University College of Osteopathic Medicine (MSUCOM-DMC)": {"city": "Detroit", "state": "MI", "class_size": 100},
    "Michigan State University College of Osteopathic Medicine (MSUCOM-MUC)": {"city": "Macomb", "state": "MI", "class_size": 50},
    "Midwestern University Arizona College of Osteopathic Medicine (MWU/AZCOM)": {"city": "Glendale", "state": "AZ", "class_size": 275},
    "Midwestern University Chicago College of Osteopathic Medicine (MWU/CCOM)": {"city": "Downers Grove", "state": "IL", "class_size": 250},
    "New York Institute of Technology College of Osteopathic Medicine (NYITCOM)": {"city": "Old Westbury", "state": "NY", "class_size": 345},
    "New York Institute of Technology College of Osteopathic Medicine at Arkansas State (NYITCOM)": {"city": "Jonesboro", "state": "AR", "class_size": 115},
    "Noorda College of Osteopathic Medicine-(NoordaCOM)": {"city": "Provo", "state": "UT", "class_size": 120},
    "Nova Southeastern University Dr. Kiran C. Patel College of Osteopathic Medicine (NSU-KPCOM)": {"city": "Fort Lauderdale", "state": "FL", "class_size": 250},
    "Nova Southeastern University Dr. Kiran C. Patel College of Osteopathic Medicine (Clearwater)": {"city": "Clearwater", "state": "FL", "class_size": 150},
    "Ohio University Heritage College of Osteopathic Medicine (OU-HCOM)": {"city": "Athens", "state": "OH", "class_size": 165},
    "Ohio University Heritage College of Osteopathic Medicine in Cleveland (OU-HCOM-Cleveland)": {"city": "Cleveland", "state": "OH", "class_size": 65},
    "Ohio University Heritage College of Osteopathic Medicine in Dublin (OU-HCOM-Dublin)": {"city": "Dublin", "state": "OH", "class_size": 52},
    "Oklahoma State University Center for Health Sciences College of Osteopathic Medicine (OSU-COM)": {"city": "Tulsa", "state": "OK", "class_size": 110},
    "Oklahoma State University Center for Health Sciences College of Osteopathic Medicine - Tahlequah (OSU-COM Tahlequah)": {"city": "Tahlequah", "state": "OK", "class_size": 44},
    "Orlando College of Osteopathic Medicine - OCOM": {"city": "Orlando", "state": "FL", "class_size": 150},
    "Pacific Northwest University of Health Sciences College of Osteopathic Medicine (PNWU-COM)": {"city": "Yakima", "state": "WA", "class_size": 125},
    "Philadelphia College of Osteopathic Medicine (PCOM)": {"city": "Philadelphia", "state": "PA", "class_size": 280},
    "Philadelphia College of Osteopathic Medicine Georgia (PCOM Georgia)": {"city": "Suwanee", "state": "GA", "class_size": 275},
    "Philadelphia College of Osteopathic Medicine South Georgia (PCOM South Georgia)": {"city": "Moultrie", "state": "GA", "class_size": 120},
    "Rocky Vista University College of Osteopathic Medicine (RVUCOM-CO Campus)": {"city": "Parker", "state": "CO", "class_size": 185},
    "Rocky Vista University College of Osteopathic Medicine (RVUCOM-MT Campus)": {"city": "Billings", "state": "MT", "class_size": 75},
    "Rocky Vista University College of Osteopathic Medicine (RVUCOM-UT Campus)": {"city": "Ivins", "state": "UT", "class_size": 185},
    "Rowan-Virtua School of Osteopathic Medicine": {"city": "Stratford", "state": "NJ", "class_size": 210},
    "Rowan-Virtua School of Osteopathic Medicine-(Sewell)": {"city": "Sewell", "state": "NJ", "class_size": 80},
    "Sam Houston State University College of Osteopathic Medicine (SHSU-COM)": {"city": "Conroe", "state": "TX", "class_size": 150},
    "Touro College of Osteopathic Medicine (TouroCOM-Harlem)": {"city": "New York", "state": "NY", "class_size": 135},
    "Touro College of Osteopathic Medicine (TouroCOM-Middletown)": {"city": "Middletown", "state": "NY", "class_size": 135},
    "Touro University College of Osteopathic Medicine- (TouroCOM-Great Falls)": {"city": "Great Falls", "state": "MT", "class_size": 100},
    "Touro University College of Osteopathic Medicine- California (TUCOM)": {"city": "Vallejo", "state": "CA", "class_size": 135},
    "Touro University Nevada College of Osteopathic Medicine (TUNCOM)": {"city": "Henderson", "state": "NV", "class_size": 140},
    "University of the Incarnate Word School of Osteopathic Medicine (UIWSOM)": {"city": "San Antonio", "state": "TX", "class_size": 200},
    "University of New England College of Osteopathic Medicine (UNECOM)": {"city": "Biddeford", "state": "ME", "class_size": 185},
    "University of North Texas Health Science Center Texas College of Osteopathic Medicine (UNTHSC/TCOM)": {"city": "Fort Worth", "state": "TX", "class_size": 240},
    "West Virginia School of Osteopathic Medicine (WVSOM)": {"city": "Lewisburg", "state": "WV", "class_size": 215},
    "Western University of Health Sciences College of Osteopathic Medicine of the Pacific (WesternU/COMP)": {"city": "Pomona", "state": "CA", "class_size": 285},
    "Western University of Health Sciences College of Osteopathic Medicine of the Pacific-Northwest (WesternU/COMP-Northwest)": {"city": "Lebanon", "state": "OR", "class_size": 125},
    "William Carey University College of Osteopathic Medicine (WCUCOM)": {"city": "Hattiesburg", "state": "MS", "class_size": 115},
}

# Legacy location-only mapping for backwards compatibility:
# "School Name" -> (city, state).
# Built from DO_SCHOOL_DATA rather than maintained as a second hand-written
# table, so the two mappings cannot drift apart when a school is added or a
# city/state is corrected.
DO_SCHOOL_LOCATIONS = {
    name: (info["city"], info["state"])
    for name, info in DO_SCHOOL_DATA.items()
}

print(f"Pre-loaded locations for {len(DO_SCHOOL_LOCATIONS)} schools")

Pre-loaded locations for 69 schools


In [None]:
def parse_tuition(tuition_str):
    """Parse a tuition cell into (in-state, out-of-state) dollar amounts.

    Args:
        tuition_str: Raw cell value. Either a single amount ("60675",
            "$60,675") or an in-state/out-of-state pair such as
            "IS-$43,776/OOS-$61,838". Numbers and NaN are accepted.

    Returns:
        tuple: ``(in_state, out_state)`` as floats. A single amount is
        returned for both positions. ``(None, None)`` when the value is
        missing, is 'No Data', or contains no number.
    """
    if pd.isna(tuition_str):
        return None, None

    tuition_str = str(tuition_str).strip()
    if tuition_str == 'No Data':
        return None, None

    # An amount must START with a digit. The old "[\d,]+" could match a lone
    # comma (e.g. in "TBD, approx. $50,000") and crash on float('').
    amount = r'(\d[\d,]*(?:\.\d+)?)'

    # Handle IS/OOS format like "IS-$43,776/OOS-$61,838" or "IS- $25796/OOS-$53299".
    # DOTALL lets the two halves sit on separate lines inside one cell.
    is_oos_match = re.search(
        r'IS[:\-\s]*\$?\s*' + amount + r'.*OOS[:\-\s]*\$?\s*' + amount,
        tuition_str,
        re.IGNORECASE | re.DOTALL,
    )
    if is_oos_match:
        in_state = float(is_oos_match.group(1).replace(',', ''))
        out_state = float(is_oos_match.group(2).replace(',', ''))
        return in_state, out_state

    # Handle single value (same for IS and OOS)
    single_match = re.search(amount, tuition_str)
    if single_match:
        value = float(single_match.group(1).replace(',', ''))
        return value, value

    return None, None

def parse_mcat(mcat_str):
    """Parse an average-MCAT cell into a float score.

    Args:
        mcat_str: Raw cell value (string, number or NaN).

    Returns:
        float | None: The first stand-alone three-digit number (optionally
        with decimals) that lies in the valid MCAT total range 472-528.
        None when the value is missing, contains 'No Data', is the literal
        'nan', or holds no plausible score.
    """
    if pd.isna(mcat_str):
        return None
    mcat_str = str(mcat_str)
    # 'nan' is matched as a whole word so ordinary text that merely contains
    # those letters is not discarded.
    if 'No Data' in mcat_str or re.search(r'\bnan\b', mcat_str, re.IGNORECASE):
        return None
    # The digit guards stop the pattern from slicing "202" out of a year such
    # as "2024"; the range check rejects other three-digit noise.
    for match in re.finditer(r'(?<!\d)(\d{3}(?:\.\d+)?)(?!\d)', mcat_str):
        score = float(match.group(1))
        if 472 <= score <= 528:
            return score
    return None

def parse_gpa(gpa_str):
    """Return the first decimal GPA-looking number in the cell, or None.

    Missing values, the exact string 'No Data', and any text containing
    'None listed' all yield None.
    """
    if pd.isna(gpa_str):
        return None
    if gpa_str == 'No Data':
        return None

    text = str(gpa_str)
    if 'None listed' in text:
        return None

    # One digit, a dot, then one or more digits: 3.4, 2.80, ...
    found = re.search(r'(\d\.\d+)', text)
    return float(found.group(1)) if found else None

def get_school_data(school_name):
    """Get city, state, and class size for a school.

    Lookup order:
      1. exact key match in DO_SCHOOL_DATA;
      2. case-insensitive substring match in either direction, choosing the
         key whose length is closest to the name (the most specific match);
      3. an Exa search, which can only recover a class size - city and state
         come back as "Unknown" on this path.

    Args:
        school_name: School name as it appears in the spreadsheet.

    Returns:
        tuple: ``(city, state, class_size)``; ``class_size`` may be None.
        ``("Unknown", "Unknown", None)`` when nothing could be determined.
    """
    # First check our pre-built mapping
    if school_name in DO_SCHOOL_DATA:
        data = DO_SCHOOL_DATA[school_name]
        return data["city"], data["state"], data["class_size"]

    # A blank or non-string name would substring-match every key (and is not
    # worth an API call), so bail out early.
    name = school_name.strip().lower() if isinstance(school_name, str) else ""
    if not name:
        return "Unknown", "Unknown", None

    # Try partial matching. Collect every candidate instead of taking the
    # first hit: a parent-campus key is a substring of its branch-campus
    # names, so first-hit matching can return the wrong campus.
    matches = []
    for key, data in DO_SCHOOL_DATA.items():
        key_norm = key.strip().lower()
        if key_norm in name or name in key_norm:
            matches.append((abs(len(key_norm) - len(name)), data))
    if matches:
        # min() keeps the earliest entry on ties, i.e. dict order
        data = min(matches, key=lambda pair: pair[0])[1]
        return data["city"], data["state"], data["class_size"]

    # Use Exa API as fallback for unknown schools
    print(f"  Using Exa API to find data for: {school_name}")
    try:
        result = exa.search_and_contents(
            f"{school_name} osteopathic medical school class size enrollment students",
            num_results=2,
            text={"max_characters": 500}
        )
        if result.results:
            text = " ".join([r.text for r in result.results if r.text])
            # Try to extract class size from text. The trailing (?!\d) keeps
            # "Class of 2028" from being read as a class size of 202.
            class_match = re.search(
                r'class\s*(?:size|of)?\s*(?:is|:)?\s*(\d{2,3})(?!\d)',
                text,
                re.IGNORECASE,
            )
            class_size = int(class_match.group(1)) if class_match else None
            return "Unknown", "Unknown", class_size
    except Exception as e:
        # Best-effort: a failed search falls through to the Unknown result
        print(f"  Exa API error: {e}")

    return "Unknown", "Unknown", None

# Exercise the parsers on representative spreadsheet values
print("Tuition parsing tests:")
for sample in ('IS-$43,776/OOS-$61,838', '60675', 'IS- $25796/OOS-$53299'):
    print(f"  '{sample}' -> {parse_tuition(sample)}")
print()
print("MCAT parsing tests:")
for sample in ('504.5', 'No Data. Require minimum 500'):
    print(f"  '{sample}' -> {parse_mcat(sample)}")

Tuition parsing tests:
  'IS-$43,776/OOS-$61,838' -> (43776.0, 61838.0)
  '60675' -> (60675.0, 60675.0)
  'IS- $25796/OOS-$53299' -> (25796.0, 53299.0)

MCAT parsing tests:
  '504.5' -> 504.5
  'No Data. Require minimum 500' -> None


In [None]:
# Build one JSON-ready record per school row
do_schools_json = []

for idx, row in schools_df.iterrows():
    school_name = row['Key:']

    # Location/class size come from the lookup; tuition yields two values
    city, state, class_size = get_school_data(school_name)
    in_state, out_state = parse_tuition(row['Tuition Cost (Not including fees)'])

    # Same field names and order as the MD schools format
    do_schools_json.append({
        "Name": school_name,
        "City": city,
        "State": state,
        "MCAT": parse_mcat(row['Average Matriculant MCAT']),
        "GPA": parse_gpa(row['Minimum GPA']),
        "Degree": "DO",  # All are DO schools
        "Class Size": class_size,
        "In-state": in_state,
        "Out-State": out_state,
    })

print(f"Processed {len(do_schools_json)} DO schools")
print("\nSample output (first 5 schools):")
for school in do_schools_json[:5]:
    print(json.dumps(school, indent=2))

Processed 69 DO schools

Sample output (first 3 schools):
{
  "Name": "Alabama College of Osteopathic Medicine (ACOM)",
  "City": "Dothan",
  "State": "AL",
  "MCAT": 504.5,
  "GPA": null,
  "Degree": "DO",
  "Class Size": null,
  "In-state": 60675.0,
  "Out-State": 60675.0
}
{
  "Name": "Arkansas College of Osteopathic Medicine (ARCOM)",
  "City": "Fort Smith",
  "State": "AR",
  "MCAT": 501.3,
  "GPA": null,
  "Degree": "DO",
  "Class Size": null,
  "In-state": 59000.0,
  "Out-State": 59000.0
}
{
  "Name": "A.T. Still University Kirksville College of Osteopathic Medicine (ATSU-KCOM)",
  "City": "Kirksville",
  "State": "MO",
  "MCAT": 504.1,
  "GPA": 2.8,
  "Degree": "DO",
  "Class Size": null,
  "In-state": 64398.0,
  "Out-State": 64398.0
}


In [None]:
# Use Exa API to fill in missing values
import time

def extract_gpa_from_text(text):
    """Pull a GPA out of free text returned by an Exa search.

    The patterns are tried in order; for each one only its first hit is
    examined, and a hit outside 2.0-4.0 is skipped in favour of the next
    pattern. Returns the GPA as a float, or None when nothing qualifies.
    """
    # Covers phrasings like "GPA of 3.5", "3.5 GPA", "average GPA: 3.5", "minimum GPA 3.0"
    candidate_patterns = (
        r'(?:average|median|mean|minimum|min)\s*GPA[:\s]*(\d\.\d+)',
        r'GPA[:\s]*(?:of\s*)?(\d\.\d+)',
        r'(\d\.\d+)\s*GPA',
        r'GPA\s*(?:is|was|of)\s*(\d\.\d+)',
    )
    for regex in candidate_patterns:
        hit = re.search(regex, text, re.IGNORECASE)
        if hit is None:
            continue
        value = float(hit.group(1))
        if value < 2.0 or value > 4.0:  # implausible GPA, try the next pattern
            continue
        return value
    return None

def extract_class_size_from_text(text):
    """Extract an entering-class size from Exa search result text.

    Args:
        text: Free text to scan; None or empty text yields None.

    Returns:
        int | None: The first number between 30 and 500 that appears in one
        of the recognised phrasings ("class size is 150", "enrolls 162
        students", "120 students per year", ...), or None.
    """
    if not text:
        return None

    # Every number is fenced so it cannot be a slice of a longer one:
    # (?!\d) stops "Class of 2027" from yielding 202, and (?<![\d,.]) stops
    # "1,200 students per class" from yielding 200.
    patterns = [
        r'class\s*(?:size|of)\s*(?:is|:)?\s*(\d{2,3})(?!\d)',
        r'(?<![\d,.])(\d{2,3})\s*students?\s*(?:per|in|each)\s*class',
        r'enrolls?\s*(\d{2,3})(?!\d)\s*students?',
        r'(?<![\d,.])(\d{2,3})\s*(?:medical\s*)?students?\s*(?:per\s*year|annually)',
        r'class\s*(?:has|with)\s*(\d{2,3})(?!\d)',
        r'admits?\s*(\d{2,3})(?!\d)\s*students?',
    ]
    for pattern in patterns:
        # Look at every hit, not just the first, so an implausible early
        # number does not hide a valid one later in the text.
        for match in re.finditer(pattern, text, re.IGNORECASE):
            size = int(match.group(1))
            if 30 <= size <= 500:  # Sanity check for medical school class sizes
                return size
    return None

def extract_mcat_from_text(text):
    """Pull an MCAT score out of free text returned by an Exa search.

    Each pattern contributes at most its first hit; the first hit (in pattern
    order) that falls inside the valid 472-528 range is returned as a float.
    Returns None when no pattern produces a score in that range.
    """
    mcat_patterns = (
        r'(?:average|median|mean)\s*MCAT[:\s]*(\d{3}(?:\.\d+)?)',
        r'MCAT[:\s]*(?:of\s*)?(\d{3}(?:\.\d+)?)',
        r'(\d{3}(?:\.\d+)?)\s*MCAT',
        r'MCAT\s*(?:score|is|was)\s*(?:of\s*)?(\d{3}(?:\.\d+)?)',
    )
    hits = (re.search(regex, text, re.IGNORECASE) for regex in mcat_patterns)
    scores = (float(hit.group(1)) for hit in hits if hit)
    # Valid MCAT range
    return next((score for score in scores if 472 <= score <= 528), None)

def fill_missing_with_exa(school):
    """Use the Exa API to look up values that are still None for a school.

    Args:
        school: School record with at least the keys 'Name', 'GPA',
            'Class Size' and 'MCAT'. The record itself is not modified.

    Returns:
        dict: Newly found values keyed by record field ('GPA', 'Class Size',
        'MCAT'). Empty when nothing was missing, nothing was found, or the
        search failed (the error is printed, not raised).
    """
    school_name = school['Name']
    clean_name = school_name.replace('(', '').replace(')', '').strip()
    updates = {}

    # (record field, term used in the query and progress line, extractor)
    field_specs = (
        ('GPA', 'GPA', extract_gpa_from_text),
        ('Class Size', 'class size', extract_class_size_from_text),
        ('MCAT', 'MCAT', extract_mcat_from_text),
    )

    # Check what's missing
    pending = [spec for spec in field_specs if school[spec[0]] is None]
    if not pending:
        return updates

    missing = [term for _, term, _ in pending]
    print(f"  Searching for {', '.join(missing)} for {school_name[:50]}...")

    try:
        # Search for the missing data
        query = f"{clean_name} osteopathic medical school {' '.join(missing)} admissions statistics"
        result = exa.search_and_contents(
            query,
            num_results=3,
            text={"max_characters": 800}
        )

        if result.results:
            all_text = " ".join([r.text for r in result.results if r.text])

            # Try to extract each missing value
            for field, _, extractor in pending:
                value = extractor(all_text)
                if value is not None:
                    updates[field] = value
                    print(f"    Found {field}: {value}")

    except Exception as e:
        print(f"    Error: {e}")
    finally:
        # Rate limiting. In `finally` so a failed request (for example a
        # throttling error) is also followed by a pause instead of the next
        # school's request firing immediately.
        time.sleep(0.5)

    return updates

# Tally how many schools lack each field before the Exa pass
missing_before = {
    field: sum(s[field] is None for s in do_schools_json)
    for field in ('GPA', 'Class Size', 'MCAT')
}
print(f"Missing values before: GPA={missing_before['GPA']}, Class Size={missing_before['Class Size']}, MCAT={missing_before['MCAT']}")
print("\nFilling missing values with Exa API...\n")

In [None]:
# Run the Exa lookup for every record that still has a gap
total_updates = 0

for school in do_schools_json:
    # Complete records need no search
    if not any(school[field] is None for field in ('GPA', 'Class Size', 'MCAT')):
        continue

    updates = fill_missing_with_exa(school)

    # Merge whatever was found into the record
    school.update(updates)
    total_updates += len(updates)

# Tally the gaps that remain
missing_after = {
    field: sum(s[field] is None for s in do_schools_json)
    for field in ('GPA', 'Class Size', 'MCAT')
}

print(f"\n{'='*50}")
print(f"Total updates made: {total_updates}")
print(f"Missing values after: GPA={missing_after['GPA']}, Class Size={missing_after['Class Size']}, MCAT={missing_after['MCAT']}")
print(f"Values filled: GPA={missing_before['GPA']-missing_after['GPA']}, Class Size={missing_before['Class Size']-missing_after['Class Size']}, MCAT={missing_before['MCAT']-missing_after['MCAT']}")

In [None]:
# Save the updated JSON
output_path = 'webapp/do_schools_data.json'
# Create the target folder if needed so a fresh checkout without 'webapp/'
# does not fail with FileNotFoundError after all the API work is done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(do_schools_json, f, indent=2)

print(f"✅ Saved updated JSON to {output_path}")

# Show the first ten records for a quick visual check (this is simply the
# head of the list, whether or not a given record was changed by the Exa pass)
print("\nSample of schools with filled data:")
for school in do_schools_json[:10]:
    print(f"  {school['Name'][:45]}... | GPA: {school['GPA']} | Class: {school['Class Size']} | MCAT: {school['MCAT']}")

In [18]:
# Serialize once, then write the file and echo the same text
output_path = 'webapp/do_schools_data.json'
serialized = json.dumps(do_schools_json, indent=2)
with open(output_path, 'w') as f:
    f.write(serialized)

print(f"✅ Saved {len(do_schools_json)} DO schools to {output_path}")

# Print the complete JSON so the saved content can be verified by eye
print("\nFull JSON output:")
print(serialized)

✅ Saved 69 DO schools to webapp/do_schools_data.json

Full JSON output:
[
  {
    "Name": "Alabama College of Osteopathic Medicine (ACOM)",
    "City": "Dothan",
    "State": "AL",
    "MCAT": 504.5,
    "GPA": null,
    "Degree": "DO",
    "Class Size": null,
    "In-state": 60675.0,
    "Out-State": 60675.0
  },
  {
    "Name": "Arkansas College of Osteopathic Medicine (ARCOM)",
    "City": "Fort Smith",
    "State": "AR",
    "MCAT": 501.3,
    "GPA": null,
    "Degree": "DO",
    "Class Size": null,
    "In-state": 59000.0,
    "Out-State": 59000.0
  },
  {
    "Name": "A.T. Still University Kirksville College of Osteopathic Medicine (ATSU-KCOM)",
    "City": "Kirksville",
    "State": "MO",
    "MCAT": 504.1,
    "GPA": 2.8,
    "Degree": "DO",
    "Class Size": null,
    "In-state": 64398.0,
    "Out-State": 64398.0
  },
  {
    "Name": "A.T. Still University, School of Osteopathic Medicine in Arizona (ATSU-SOMA)",
    "City": "Mesa",
    "State": "AZ",
    "MCAT": 502.2,
    "GP