In [1]:
import sys
import requests
import bs4  # BeautifulSoup is part of bs4 package
import pandas as pd
import re
import plotly
import IPython

In [2]:
# Print Python version
print(f"Python Version: {sys.version}")

# Print library versions
print(f"Requests Version: {requests.__version__}")
print(f"BeautifulSoup Version: {bs4.__version__}")
print(f"Pandas Version: {pd.__version__}")
print(f"Plotly Version: {plotly.__version__}")
print(f"IPython Version: {IPython.__version__}")

Python Version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:54:21) [Clang 16.0.6 ]
Requests Version: 2.32.2
BeautifulSoup Version: 4.12.3
Pandas Version: 2.2.2
Plotly Version: 5.22.0
IPython Version: 8.25.0


In [3]:
from bs4 import BeautifulSoup

# Define the URL
url = "https://wwwnc.cdc.gov/travel/yellowbook/2024/preparing/yellow-fever-vaccine-malaria-prevention-by-country"

# Fetch the webpage
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Locate all country blocks
country_blocks = soup.find_all("div", class_="dyfm-block")

# Function to clean extracted text while preserving reference numbers
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
    return text.strip()

# Function to format lists with newlines
def format_list(ul_element):
    return "\n".join([li.get_text(strip=True) for li in ul_element.find_all("li")]) if ul_element else ""

# Initialize a list to store extracted data
data_list = []

# Loop through each country section
for country in country_blocks:
    country_name = country.find("h2").get_text(strip=True) if country.find("h2") else ""

    # Initialize extracted data dictionary (empty values instead of "Not Found")
    extracted_data = {
        "Country": country_name,
        "Entry Requirements": "",
        "CDC Recommendations": "",
        "Transmission Areas": "",
        "Drug Resistance": "",
        "Species": "",
        "Recommended Chemoprophylaxis": "",
        "Last Updated": ""
    }

    # Extract Entry Requirements
    paragraphs = country.find_all("p")
    for para in paragraphs:
        strong_tag = para.find("strong")
        if strong_tag:
            strong_text = strong_tag.get_text(strip=True)
            if "Entry requirements" in strong_text:
                entry_text = para.get_text(strip=True).replace("Entry requirements:", "").strip()
                extracted_data["Entry Requirements"] = clean_text(entry_text)

    # Extract CDC Recommendations
    recommended = ""
    generally_not_recommended = ""
    not_recommended = ""

    # Find the paragraph that contains "CDC recommendations"
    cdc_recommendations_p = None
    for para in paragraphs:
        if "CDC recommendations" in para.get_text(strip=True):
            cdc_recommendations_p = para
            break

    if cdc_recommendations_p:
        full_text = cdc_recommendations_p.get_text(" ", strip=True).replace("CDC recommendations:", "").strip()

        # Find all bold elements inside the CDC recommendations
        bold_tags = cdc_recommendations_p.find_all("strong")

        # Create a mapping of recommendation types
        recommendation_sections = {}

        for bold in bold_tags:
            bold_text = bold.get_text(strip=True)
            start_idx = full_text.find(bold_text)
            if start_idx != -1:
                recommendation_sections[bold_text] = start_idx

        # Sort by position in text
        sorted_recommendations = sorted(recommendation_sections.items(), key=lambda x: x[1])

        # Extract each recommendation text based on its position
        extracted_texts = {}
        for i, (label, start_idx) in enumerate(sorted_recommendations):
            end_idx = sorted_recommendations[i + 1][1] if i + 1 < len(sorted_recommendations) else len(full_text)
            extracted_texts[label] = full_text[start_idx + len(label):end_idx].strip()

        # Assign extracted recommendations
        recommended = extracted_texts.get("Recommended", "").strip()
        generally_not_recommended = extracted_texts.get("Generally not recommended", "").strip()
        not_recommended = extracted_texts.get("Not recommended", "").strip()

        # Format recommendations correctly with newlines
        cdc_recommendations = []
        if recommended:
            cdc_recommendations.append(f"Recommended: {recommended}")
        if generally_not_recommended:
            cdc_recommendations.append(f"Generally not recommended: {generally_not_recommended}")
        if not_recommended:
            cdc_recommendations.append(f"Not recommended: {not_recommended}")

        extracted_data["CDC Recommendations"] = "\n".join(cdc_recommendations)

    # Extract Malaria Information
    for para in paragraphs:
        strong_tag = para.find("strong")
        if strong_tag:
            strong_text = strong_tag.get_text(strip=True)
            next_ul = para.find_next("ul")  # Locate the closest <ul> element

            if "Transmission areas" in strong_text and next_ul:
                extracted_data["Transmission Areas"] = format_list(next_ul)

            elif "Drug resistance" in strong_text and next_ul:
                extracted_data["Drug Resistance"] = format_list(next_ul)

            elif "Species" in strong_text and next_ul:
                extracted_data["Species"] = format_list(next_ul)

            elif "Recommended chemoprophylaxis" in strong_text and next_ul:
                extracted_data["Recommended Chemoprophylaxis"] = format_list(next_ul)

    # Extract Last Updated Date
    last_updated = country.find("div", class_="LastDateUpdate")
    if last_updated:
        last_updated_text = last_updated.get_text(strip=True).replace("Date last updated:", "").strip()
        extracted_data["Last Updated"] = last_updated_text  # Keep full date

    # Append extracted data to list
    data_list.append(extracted_data)

# Convert to DataFrame
df = pd.DataFrame(data_list)

# # Save to CSV
# df.to_csv("yellow_fever_malaria_data_cleaned.csv", index=False)

# Display the extracted DataFrame
from IPython.display import display
display(df.head(10))

Unnamed: 0,Country,Entry Requirements,CDC Recommendations,Transmission Areas,Drug Resistance,Species,Recommended Chemoprophylaxis,Last Updated
0,Afghanistan,,,"All areas <2,500 m (≈8,200 ft) elevation (Apri...",Chloroquine,P. vivax(primarily)\nP. falciparum(less commonly),"Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024"
1,Albania,Required for travelers ≥1 year old arriving fr...,,,,,,"October 4, 2024"
2,Algeria,Required for travelers ≥9 months old arriving ...,,,,,,"October 4, 2024"
3,American Samoa (US),,,,,,,"October 4, 2024"
4,Andorra,,,,,,,"October 4, 2024"
5,Angola,Required for arriving travelers ≥9 months old,Recommended: for all travelers ≥9 months old,All,Chloroquine,"P. falciparum(primarily)\nP. malariae,P. ovale...","Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024"
6,Anguilla (U.K.),,,,,,,"October 4, 2024"
7,Antarctica,,,,,,,"October 4, 2024"
8,Antigua and Barbuda,Required for travelers ≥1 year old arriving fr...,,,,,,"October 4, 2024"
9,Argentina,,Recommended: for travelers ≥9 months old going...,,,,,"October 4, 2024"


In [4]:
df[df["Country"] == "Argentina"]

Unnamed: 0,Country,Entry Requirements,CDC Recommendations,Transmission Areas,Drug Resistance,Species,Recommended Chemoprophylaxis,Last Updated
9,Argentina,,Recommended: for travelers ≥9 months old going...,,,,,"October 4, 2024"


In [5]:
data_list

[{'Country': 'Afghanistan',
  'Entry Requirements': 'None',
  'CDC Recommendations': '',
  'Transmission Areas': 'All areas <2,500 m (≈8,200 ft) elevation (April–December)',
  'Drug Resistance': 'Chloroquine',
  'Species': 'P. vivax(primarily)\nP. falciparum(less commonly)',
  'Recommended Chemoprophylaxis': 'Atovaquone-proguanil, doxycycline, mefloquine, tafenoquine3',
  'Last Updated': 'October 4, 2024'},
 {'Country': 'Albania',
  'Entry Requirements': 'Required for travelers ≥1 year old arriving from countries with risk for YF virus transmission1',
  'CDC Recommendations': '',
  'Transmission Areas': '',
  'Drug Resistance': '',
  'Species': '',
  'Recommended Chemoprophylaxis': '',
  'Last Updated': 'October 4, 2024'},
 {'Country': 'Algeria',
  'Entry Requirements': 'Required for travelers ≥9 months old arriving from countries with risk for YF virus transmission; this includes >12-hour airport transits or layovers in countries with risk for YF virus transmission.1',
  'CDC Recommen

In [6]:
df

Unnamed: 0,Country,Entry Requirements,CDC Recommendations,Transmission Areas,Drug Resistance,Species,Recommended Chemoprophylaxis,Last Updated
0,Afghanistan,,,"All areas <2,500 m (≈8,200 ft) elevation (Apri...",Chloroquine,P. vivax(primarily)\nP. falciparum(less commonly),"Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024"
1,Albania,Required for travelers ≥1 year old arriving fr...,,,,,,"October 4, 2024"
2,Algeria,Required for travelers ≥9 months old arriving ...,,,,,,"October 4, 2024"
3,American Samoa (US),,,,,,,"October 4, 2024"
4,Andorra,,,,,,,"October 4, 2024"
...,...,...,...,...,...,...,...,...
240,"Virgin Islands, U.S.",,,,,,,"October 4, 2024"
241,"Wake Island, U.S.",,,,,,,"October 4, 2024"
242,Yemen,,,"All areas <2,000 m (≈6,500 ft) elevation\nNo m...",Chloroquine,"P. falciparum(primarily)\nP. malariae,P. ovale...","Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024"
243,Zambia,Required for travelers ≥1 year of age arriving...,Generally not recommended: for travel to North...,All,Chloroquine,"P. falciparum(primarily)\nP. malariae,P. ovale...","Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024"


In [7]:
import plotly.express as px

# Identify Chloroquine & Mefloquine resistance cases
df["CQ_resist"] = df["Drug Resistance"].str.contains("Chloroquine", na=False)
df["MQ_resist"] = df["Drug Resistance"].str.contains("mefloquine", na=False)

# Assign resistance types (Include CQ, MQ, both resistant cases, and No Resistance)
df["Resistance Type"] = ""
df.loc[df["CQ_resist"], "Resistance Type"] = "CQ-resist"
df.loc[df["MQ_resist"], "Resistance Type"] = "MQ-resist"
df.loc[df["CQ_resist"] & df["MQ_resist"], "Resistance Type"] = "CQ & MQ resist"

# Identify countries with No malaria transmission
df.loc[(df["Transmission Areas"] == ""), "Resistance Type"] = "No Malaria"

# Identify countries with malaria transmission but no recorded resistance
df.loc[(df["Transmission Areas"].notna()) & 
       (df["Transmission Areas"] != "") & 
       (df["Drug Resistance"] == ""), "Resistance Type"] = "No Resistance"

# Filter out only the relevant cases
df_filtered = df[df["Resistance Type"].notna()].copy()

# Define readable categories for visualization
resistance_map = {
    "CQ-resist": "Chloroquine-resistant",
    "MQ-resist": "Mefloquine-resistant",
    "CQ & MQ resist": "Chloroquine & Mefloquine resistant",
    "No Resistance": "No Recorded Resistance"
}
df_filtered["Resistance Category"] = df_filtered["Resistance Type"].map(resistance_map)

# Define color mapping for better visualization
color_scheme = {
    "Chloroquine-resistant": "red",
    "Mefloquine-resistant": "green",
    "Chloroquine & Mefloquine resistant": "purple",
    "No Recorded Resistance": "blue"
}

# Create an interactive map with species information in hover tooltip
fig = px.choropleth(
    df_filtered,
    locations="Country",
    locationmode="country names",
    color="Resistance Category",
    title="Global Distribution of Malaria Drug Resistance",
    color_discrete_map=color_scheme,
    hover_data={"Country": True, "Species": True}  # Show species in tooltip
)

# Show the interactive map
fig.show()

In [8]:
# Function to categorize Yellow Fever Entry Requirements
def categorize_entry_requirements(text):
    if pd.isna(text) or text.strip().lower() in ["none", "no requirements", "not required", ""]:
        return "None"
    elif re.search(r"arriving\s*from\s*countries\s*with\s*risk\s*for\s*YF\s*virus\s*transmission", text, re.IGNORECASE):
        return "Arriving from countries with risk for YF virus transmission"
    elif re.search(r"required\s*for\s*(all\s*)?arriving\s*travelers", text, re.IGNORECASE):
        return "Required for all arriving travelers"
    else:
        return "Other"

# Apply categorization to the existing DataFrame
df["Entry Category"] = df["Entry Requirements"].apply(categorize_entry_requirements)

# Display the updated DataFrame
from IPython.display import display
display(df.head(10))

Unnamed: 0,Country,Entry Requirements,CDC Recommendations,Transmission Areas,Drug Resistance,Species,Recommended Chemoprophylaxis,Last Updated,CQ_resist,MQ_resist,Resistance Type,Entry Category
0,Afghanistan,,,"All areas <2,500 m (≈8,200 ft) elevation (Apri...",Chloroquine,P. vivax(primarily)\nP. falciparum(less commonly),"Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024",True,False,CQ-resist,
1,Albania,Required for travelers ≥1 year old arriving fr...,,,,,,"October 4, 2024",False,False,No Malaria,Arriving from countries with risk for YF virus...
2,Algeria,Required for travelers ≥9 months old arriving ...,,,,,,"October 4, 2024",False,False,No Malaria,Arriving from countries with risk for YF virus...
3,American Samoa (US),,,,,,,"October 4, 2024",False,False,No Malaria,
4,Andorra,,,,,,,"October 4, 2024",False,False,No Malaria,
5,Angola,Required for arriving travelers ≥9 months old,Recommended: for all travelers ≥9 months old,All,Chloroquine,"P. falciparum(primarily)\nP. malariae,P. ovale...","Atovaquone-proguanil, doxycycline, mefloquine,...","October 4, 2024",True,False,CQ-resist,Required for all arriving travelers
6,Anguilla (U.K.),,,,,,,"October 4, 2024",False,False,No Malaria,
7,Antarctica,,,,,,,"October 4, 2024",False,False,No Malaria,
8,Antigua and Barbuda,Required for travelers ≥1 year old arriving fr...,,,,,,"October 4, 2024",False,False,No Malaria,Arriving from countries with risk for YF virus...
9,Argentina,,Recommended: for travelers ≥9 months old going...,,,,,"October 4, 2024",False,False,No Malaria,


In [9]:
# # Save to CSV
# df.to_csv("yellow_fever_malaria_data_cleaned_yfcat.csv", index=False)

In [10]:
# # Export the final DataFrame to an Excel file
# df.to_excel("yellow_fever_malaria_data_cleaned_yfcat.xlsx", index=False)

In [11]:
# Define dictionary to map unrecognized territories to recognized countries

tableau_mapping = {
    "Azores (Portugal)": "Azores",
    "British Indian Ocean Territory; includes Diego Garcia (U.K.)": "British Indian Ocean Territory",
    "Canary Islands ( Spain )": "Canary Islands",
    "Christmas Island (Australia)": "Christmas Island",
    "Cocos (Keeling) Islands (Australia)": "Cocos (Keeling) Islands",
    "Cook Islands (New Zealand)": "Cook Islands",
    "Curaçao, Netherlands": "Curaçao",
    "Congo, Republic of the (Congo-Brazzaville)": "Republic of Congo",
    "Democratic Republic of the Congo (Congo-Kinshasa)": "Democratic Republic of Congo",
    "Easter Island (Chile)": "Easter Island",
    "Ecuador, including the Galápagos Islands": "Ecuador",
    "Eswatini (Swaziland)": "Eswatini",
    "Falkland Islands (Islas Malvinas), UK Overseas Territory (also claimed by Argentina)": "Falkland Islands (Islas Malvinas)",
    "Faroe Islands (Denmark)": "Faroe Islands",
    "French Polynesia, including the Society Islands [Bora-Bora, Moorea & Tahiti]; Marquesas Islands [Hiva Oa & Ua Huka]; and Austral Islands (Tubuai & Rurutu), France": "French Polynesia",
    "Gibraltar (U.K.)": "Gibraltar",
    "Greenland (Denmark)": "Greenland",
    "Guadeloupe (including Marie-Galante, La Désirade & Îles des Saintes)": "Guadeloupe",
    "Italy (including Holy See [Vatican City])" : "Italy",
    "Kiribati (formerly Gilbert Islands), includes Tarawa, Tabuaeran (Fanning Island), and Banaba (Ocean Island)": "Kiribati",
    "Macau (China)": "Macau",
    "Madeira Islands (Portugal)": "Madeira Islands",
    "Mayotte (France)": "Mayotte",
    "Micronesia, Federated States of (including Chuuk, Kosrae, Pohnpei & Yap)": "Federated States of Micronesia",
    "Montserrat, United Kingdom": "Montserrat",
    "Netherlands Antilles (Bonaire, Curaçao, Saba, St. Eustasius, and St. Maarten)": "Netherlands Antilles",
    "Niue (New Zealand)": "Niue",
    "Norfolk Island (Australia)": "Norfolk Island",
    "Northern Mariana Islands (U.S.), includes Saipan, Tinian, and Rota Island": "Northern Mariana Islands",
    "Saba, Netherlands": "Saba",
    "Saint Barthelemy, France": "Saint Barthelemy",
    "Saint Helena, United Kingdom": "Saint Helena",
    "Saint Kitts (Saint Christopher) & Nevis": "Saint Kitts and Nevis",
    "Saint Martin, France": "Saint Martin",
    "Saint Pierre and Miquelon (France)": "Saint Pierre and Miquelon",
    "Samoa (formerly Western Somoa)": "Samoa",
    "Sint Eustatius, Netherlands": "Sint Eustatius",
    "Sint Maarten, Netherlands": "Sint Maarten (Dutch part)",
    "South Georgia & the South Sandwich Islands, UK Overseas Territory (also claimed by Argentina)": "South Georgia and the South Sandwich Islands",
    "Tokelau (New Zealand)": "Tokelau",
    "Türkiye (Turkey)": "Turkey",
    "Turks and Caicos Islands (U.K.)": "Turks and Caicos Islands",
    "United Kingdom (including Channel Islands, Isle of Man, Ascension Island & Tristan Da Cunha Archipelago)": "United Kingdom",
    "Wake Island, U.S.": "Wake Island"
}

In [12]:
# Define a function to get the correct country name to match with Tableau data
def get_country_tableau(country_name):
    if country_name not in tableau_mapping:
        return country_name
    mapped_value = tableau_mapping[country_name]
    if mapped_value == "Unrecognized":
        return country_name
    else:
        return mapped_value

In [13]:
# Create a new column in df_csv
df["Country_Tableau"] = df["Country"].apply(get_country_tableau)

In [14]:
# Export the final DataFrame to an Excel file
df.to_excel("yellow_fever_malaria_data_cleaned_yfcat_tableau.xlsx", index=False)

In [17]:
from IPython.display import display, HTML

# Load DataTable for full table display
def render_myscroll_table(df):
    return display(HTML(df.to_html(index=False, classes="display nowrap", escape=False)))

# Show DataFrame
render_myscroll_table(df)

Country,Entry Requirements,CDC Recommendations,Transmission Areas,Drug Resistance,Species,Recommended Chemoprophylaxis,Last Updated,CQ_resist,MQ_resist,Resistance Type,Entry Category,Country_Tableau
Afghanistan,,,"All areas <2,500 m (≈8,200 ft) elevation (April–December)",Chloroquine,P. vivax(primarily)\nP. falciparum(less commonly),"Atovaquone-proguanil, doxycycline, mefloquine, tafenoquine3","October 4, 2024",True,False,CQ-resist,,Afghanistan
Albania,Required for travelers ≥1 year old arriving from countries with risk for YF virus transmission1,,,,,,"October 4, 2024",False,False,No Malaria,Arriving from countries with risk for YF virus transmission,Albania
Algeria,Required for travelers ≥9 months old arriving from countries with risk for YF virus transmission; this includes >12-hour airport transits or layovers in countries with risk for YF virus transmission.1,,,,,,"October 4, 2024",False,False,No Malaria,Arriving from countries with risk for YF virus transmission,Algeria
American Samoa (US),,,,,,,"October 4, 2024",False,False,No Malaria,,American Samoa (US)
Andorra,,,,,,,"October 4, 2024",False,False,No Malaria,,Andorra
Angola,Required for arriving travelers ≥9 months old,Recommended: for all travelers ≥9 months old,All,Chloroquine,"P. falciparum(primarily)\nP. malariae,P. ovale, andP. vivax(less commonly)","Atovaquone-proguanil, doxycycline, mefloquine, tafenoquine3","October 4, 2024",True,False,CQ-resist,Required for all arriving travelers,Angola
Anguilla (U.K.),,,,,,,"October 4, 2024",False,False,No Malaria,,Anguilla (U.K.)
Antarctica,,,,,,,"October 4, 2024",False,False,No Malaria,,Antarctica
Antigua and Barbuda,Required for travelers ≥1 year old arriving from countries with risk for YF virus transmission this includes >12-hour airport transits or layovers in countries with risk for YF virus transmission.1,,,,,,"October 4, 2024",False,False,No Malaria,Arriving from countries with risk for YF virus transmission,Antigua and Barbuda
Argentina,,"Recommended: for travelers ≥9 months old going to Corrientes and Misiones Provinces.\nGenerally not recommended: for travel to Formosa Province or to designated areas of Chaco, Jujuy, and Salta Provinces.\nNot recommended: for travel limited to provinces and areas not listed above.",,,,,"October 4, 2024",False,False,No Malaria,,Argentina
