In [8]:
import sys
import requests
import pandas as pd
import json
import re
from bs4 import BeautifulSoup
from openpyxl.styles import Alignment

# 1. Versions (optional)
# Record the interpreter and library versions this run used, one per line.
print(
    f"Python {sys.version}",
    f"requests {requests.__version__}",
    f"pandas {pd.__version__}",
    sep="\n",
)

# 2. Fetch the SOAP-wrapped JSON
endpoint = "https://wwwnc.cdc.gov/travel/Services/xmlservices.asmx/YellowFeverInformationJson"
# Bound the wait: without a timeout, requests may block indefinitely when the
# server stops responding, which would hang a "Run All".
resp = requests.get(endpoint, timeout=30)
resp.raise_for_status()
# The JSON document is carried inside a single <string ...> element;
# re.DOTALL lets the captured payload span several lines.
payload_match = re.search(r"<string[^>]*>(.*?)</string>", resp.text, re.DOTALL)
if payload_match is None:
    raise RuntimeError("Couldn't locate JSON payload")
json_text = payload_match.group(1)

# 3. Parse JSON → list of country dicts
countries = json.loads(json_text)

# 4. Flatten into records
# One flat dict per country: the nested "YellowFever" and "Malaria" objects are
# lifted into YF_* / Malaria_* columns next to the top-level fields.
records = []
for c in countries:
    # `or {}` also covers a key that is present but null; `.get(key, {})` would
    # hand back None in that case and the attribute lookups below would fail.
    yf = c.get("YellowFever") or {}
    malaria = c.get("Malaria") or {}
    records.append({
        "Country": c.get("LongName"),
        "YF_Recommendations": yf.get("Recommendations"),
        "YF_Entry_Requirements": yf.get("Requirements"),
        "Malaria_Has_Transmission": malaria.get("HasTransmission"),
        "Malaria_Areas_of_Risk": malaria.get("AreaOfRisk"),
        "Malaria_Chloroquine_Resistance": malaria.get("ChloroquineResistance"),
        "Malaria_Species": malaria.get("Species"),
        "Malaria_Recommended_Chemoprophylaxis": malaria.get("RecommendedProphylaxis"),
        "Other_Vaccines_To_Consider": c.get("OtherVaccinesToConsider"),
        "Map_Links": c.get("MapLinks"),
        "Map_HTML": c.get("MapHtml")
    })

df = pd.DataFrame.from_records(records)

# 5. HTML cleaner that strips <sup> and renders <ul>/<li> (with nested sub-li)
def clean_html(text):
    """Convert an HTML fragment into plain text suitable for a table cell.

    Behavior:
      * Values that are not ``str``/``bytes`` (None, NaN, booleans, numbers,
        lists, ...) are returned unchanged, so the function can be applied to
        any object column without raising.
      * Every ``<sup>`` element is removed together with its content.
      * If the fragment contains no ``<ul>``, the text is returned with runs of
        whitespace collapsed to single spaces.
      * Otherwise each top-level ``<ul>`` is rendered one ``<li>`` per line:
        ``"- "`` for top-level items and ``"  • "`` for items of nested lists
        (the same marker is used at every nesting depth). Text located outside
        the lists is not included in the result.

    Args:
        text: HTML markup, or any non-text value.

    Returns:
        The cleaned plain text (lines joined with ``"\\n"`` for lists), or
        ``text`` itself when it is not ``str``/``bytes``.
    """
    # Only markup can be parsed; everything else (missing values included)
    # passes through untouched.
    if not isinstance(text, (str, bytes)):
        return text

    soup = BeautifulSoup(text, "html.parser")
    # remove all superscripts
    for sup in soup.find_all("sup"):
        sup.decompose()

    # Lists without a <ul> ancestor are the roots; nested lists are reached
    # through their parent <li> while rendering.
    root_lists = [ul for ul in soup.find_all("ul") if ul.find_parent("ul") is None]
    if not root_lists:
        # no list → plain text
        plain = soup.get_text(" ")
        return re.sub(r"\s+", " ", plain).strip()

    lines = []

    def render(ul, level=0):
        marker = "- " if level == 0 else "  • "
        for li in ul.find_all("li", recursive=False):
            # text of this <li> sans nested <ul>: work on a re-parsed copy so
            # the original tree stays intact for the recursion below
            copy = BeautifulSoup(str(li), "html.parser")
            for sub in copy.find_all("ul"):
                sub.decompose()
            txt = re.sub(r"\s+", " ", copy.get_text(" ", strip=True))
            lines.append(f"{marker}{txt}")
            # recurse into every nested UL that is a direct child of this <li>
            for nested in li.find_all("ul", recursive=False):
                render(nested, level + 1)

    for root_ul in root_lists:
        render(root_ul, 0)
    return "\n".join(lines)

# Run the cleaner over each value of every object-dtype (text) column
text_columns = df.select_dtypes(include="object").columns
for column_name in text_columns:
    df[column_name] = df[column_name].map(clean_html)

# 6. Categorize entry requirements
def categorize_entry(req):
    """Bucket a yellow-fever entry-requirement text into a coarse category.

    Matching is case-insensitive and based on substrings; the first rule that
    matches wins, in the order written below.

    Args:
        req: Requirement text. Anything that is not a string (None, NaN, ...)
            is treated as empty text.

    Returns:
        One of "Vaccine is required for all arriving travelers",
        "Travel from countries with risk for YF virus transmission",
        "Arriving from some countries", "Vaccine is not required" or "Other".
    """
    # NaN is truthy, so `(req or "")` would let it through to `.lower()`;
    # test the type explicitly instead.
    r = req.lower() if isinstance(req, str) else ""

    # NOTE(review): this rule runs before the "not required" rule, so a phrase
    # such as "not required for all arriving travelers" would land here.
    required_for_all = ("required for all arriving", "required for all traveler", "proof of vaccination")
    if any(phrase in r for phrase in required_for_all):
        return "Vaccine is required for all arriving travelers"
    if "travel from countries" in r or "risk for yf" in r:
        return "Travel from countries with risk for YF virus transmission"
    # "risk for yf" is already consumed by the rule above, so only the
    # "arriving from" wording can reach this category.
    if "arriving from" in r:
        return "Arriving from some countries"
    if "not required" in r:
        return "Vaccine is not required"
    return "Other"

# Derive the category element-wise from the cleaned requirement text
df["Entry Category"] = df["YF_Entry_Requirements"].map(categorize_entry)

# 7. Categorize malaria resistance
def categorize_resistance(row):
    """Classify a country's malaria drug-resistance text into a short label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            "Malaria_Has_Transmission" and "Malaria_Chloroquine_Resistance".

    Returns:
        "No Malaria" when the transmission flag is falsy; otherwise the first
        match, case-insensitively, among "CQ&MQ Resist", "Previous CQ",
        "CQ Resist" and "No Resist", or "Other" when nothing matches
        (including when the resistance text is missing).
    """
    if not row["Malaria_Has_Transmission"]:
        return "No Malaria"

    # NaN is truthy, so `(value or "")` would let it through to `.lower()`;
    # anything that is not a string is treated as empty text.
    resistance = row["Malaria_Chloroquine_Resistance"]
    txt = resistance.lower() if isinstance(resistance, str) else ""

    # Order matters: the more specific chloroquine phrasings are tested before
    # the bare "chloroquine" match.
    if "chloroquine and mefloquine" in txt:
        return "CQ&MQ Resist"
    if "previously" in txt and "chloroquine" in txt:
        return "Previous CQ"
    if "chloroquine" in txt:
        return "CQ Resist"
    if "none" in txt:
        return "No Resist"
    return "Other"

df["Resistance Type"] = df.apply(categorize_resistance, axis=1)

# 8. Map availability
# A map counts as available only for non-empty text that does not contain
# "none" (case-insensitive). The isinstance check keeps NaN and other
# non-string values from reaching `.lower()`.
df["Map Category"] = df["Map_Links"].apply(
    lambda x: "Map available"
    if isinstance(x, str) and x and "none" not in x.lower()
    else "No map available"
)

Python 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:54:21) [Clang 16.0.6 ]
requests 2.32.3
pandas 2.2.2


In [9]:
# Inspect the assembled DataFrame, including the derived category columns
df

Unnamed: 0,Country,YF_Recommendations,YF_Entry_Requirements,Malaria_Has_Transmission,Malaria_Areas_of_Risk,Malaria_Chloroquine_Resistance,Malaria_Species,Malaria_Recommended_Chemoprophylaxis,Other_Vaccines_To_Consider,Map_Links,Map_HTML,Entry Category,Resistance Type,Map Category
0,Afghanistan,Vaccine is not recommended.,Vaccine is not required.,True,"- All areas <2,500 m (<8,200 ft) elevation (Ap...",- Chloroquine,- P. vivax (primarily)\n- P. falciparum (less ...,"- Atovaquone-proguanil, doxycycline, mefloquin...",See Health Information for Travelers to Afghan...,Malaria prevention in Afghanistan,Map Malaria prevention in Afghanistan See foot...,Vaccine is not required,CQ Resist,Map available
1,Albania,Vaccine is not recommended.,- Direct travel from United States: Vaccine is...,False,,,,,See Health Information for Travelers to Albania,,,Travel from countries with risk for YF virus t...,No Malaria,No map available
2,Algeria,Vaccine is not recommended.,- Direct travel from United States: Vaccine is...,False,,,,,See Health Information for Travelers to Algeria,,,Travel from countries with risk for YF virus t...,No Malaria,No map available
3,American Samoa (U.S.),Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Americ...,,,Vaccine is not required,No Malaria,No map available
4,Andorra,Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Andorra,,,Vaccine is not required,No Malaria,No map available
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,"Virgin Islands, U.S.",Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Virgin...,,,Vaccine is not required,No Malaria,No map available
240,"Wake Island, U.S.",Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Wake I...,,,Vaccine is not required,No Malaria,No map available
241,Yemen,Vaccine is not recommended.,Vaccine is not required.,True,"- All areas <2,000 m (<6,500 ft) elevation\n- ...",- Chloroquine,"- P. falciparum (primarily)\n- P. malariae , P...","- Atovaquone-proguanil, doxycycline, mefloquin...",See Health Information for Travelers to Yemen,Malaria prevention in Yemen,Map Malaria prevention in Yemen See footnotes,Vaccine is not required,CQ Resist,Map available
242,Zambia,- Vaccine is generally not recommended for tra...,- Direct travel from United States: Vaccine is...,True,- All,- Chloroquine,"- P. falciparum (primarily)\n- P. malariae , P...","- Atovaquone-proguanil, doxycycline, mefloquin...",See Health Information for Travelers to Zambia,,,Travel from countries with risk for YF virus t...,CQ Resist,No map available


In [10]:
# # Export to Excel
# df.to_excel("yf_malaria_for_tableau.xlsx", index=False)

In [11]:
# Lookup table translating cleaned "Country" labels into the place names used
# to match with the Tableau data.
# Keys must equal the values in df["Country"] exactly, including the unusual
# spacing or spelling in a few labels (presumably as they appear in the source
# data; verify before "correcting" any key).
# Names that are absent from this table are passed through unchanged by
# get_country_tableau.

tableau_mapping = {
    "Azores (Portugal)": "Azores",
    "British Indian Ocean Territory; includes Diego Garcia (U.K.)": "British Indian Ocean Territory",
    "Canary Islands ( Spain )": "Canary Islands",
    "Christmas Island (Australia)": "Christmas Island",
    "Cocos (Keeling) Islands (Australia)": "Cocos (Keeling) Islands",
    "Cook Islands (New Zealand)": "Cook Islands",
    "Curaçao, Netherlands": "Curaçao",
    "Congo, Republic of the (Congo-Brazzaville)": "Republic of Congo",
    "Democratic Republic of the Congo (Congo-Kinshasa)": "Democratic Republic of Congo",
    "Easter Island (Chile)": "Easter Island",
    "Ecuador, including the Galápagos Islands": "Ecuador",
    "Eswatini (Swaziland)": "Eswatini",
    "Falkland Islands (Islas Malvinas), UK Overseas Territory (also claimed by Argentina)": "Falkland Islands (Islas Malvinas)",
    "Faroe Islands (Denmark)": "Faroe Islands",
    "French Polynesia, including the Society Islands [Bora-Bora, Moorea & Tahiti]; Marquesas Islands [Hiva Oa & Ua Huka]; and Austral Islands (Tubuai & Rurutu), France": "French Polynesia",
    "Gibraltar (U.K.)": "Gibraltar",
    "Greenland (Denmark)": "Greenland",
    "Guadeloupe (including Marie-Galante, La Désirade & Îles des Saintes)": "Guadeloupe",
    "Italy (including Holy See [Vatican City])" : "Italy",
    "Kiribati (formerly Gilbert Islands), includes Tarawa, Tabuaeran (Fanning Island), and Banaba (Ocean Island)": "Kiribati",
    "Macau (China)": "Macau",
    "Madeira Islands (Portugal)": "Madeira Islands",
    "Mayotte (France)": "Mayotte",
    "Micronesia, Federated States of (including Chuuk, Kosrae, Pohnpei & Yap)": "Federated States of Micronesia",
    "Montserrat, United Kingdom": "Montserrat",
    "Netherlands Antilles (Bonaire, Curaçao, Saba, St. Eustasius, and St. Maarten)": "Netherlands Antilles",
    "Niue (New Zealand)": "Niue",
    "Norfolk Island (Australia)": "Norfolk Island",
    "Northern Mariana Islands (U.S.), includes Saipan, Tinian, and Rota Island": "Northern Mariana Islands",
    "Saba, Netherlands": "Saba",
    "Saint Barthelemy, France": "Saint Barthelemy",
    "Saint Helena, United Kingdom": "Saint Helena",
    "Saint Kitts (Saint Christopher) & Nevis": "Saint Kitts and Nevis",
    "Saint Martin, France": "Saint Martin",
    "Saint Pierre and Miquelon (France)": "Saint Pierre and Miquelon",
    "Samoa (formerly Western Somoa)": "Samoa",
    "Sint Eustatius, Netherlands": "Sint Eustatius",
    "Sint Maarten, Netherlands": "Sint Maarten (Dutch part)",
    "South Georgia & the South Sandwich Islands, UK Overseas Territory (also claimed by Argentina)": "South Georgia and the South Sandwich Islands",
    "Tokelau (New Zealand)": "Tokelau",
    "Türkiye (Turkey)": "Turkey",
    "Turks and Caicos Islands (U.K.)": "Turks and Caicos Islands",
    "United Kingdom (including Channel Islands, Isle of Man, Ascension Island & Tristan Da Cunha Archipelago)": "United Kingdom",
    "Wake Island, U.S.": "Wake Island"
}

In [12]:
# Resolve the place name used to match a country with the Tableau data
def get_country_tableau(country_name):
    """Return the Tableau-side name for ``country_name``.

    The name is looked up in ``tableau_mapping``. The original name is returned
    when there is no entry for it, or when the entry is the sentinel
    "Unrecognized"; otherwise the mapped value is returned.
    """
    if country_name in tableau_mapping:
        candidate = tableau_mapping[country_name]
        if candidate != "Unrecognized":
            return candidate
    # Unmapped, or explicitly flagged as unrecognized: keep the name as given.
    return country_name

In [13]:
# Add the Tableau-compatible name for each country as a new column in df
df["Country_Tableau"] = df["Country"].map(get_country_tableau)

In [14]:
# Write the final DataFrame to an Excel workbook, leaving out the row index
export_path = "yf_malaria_for_tableau_countrymatch.xlsx"
df.to_excel(excel_writer=export_path, index=False)