In [5]:
import pandas as pd
import re
from fuzzywuzzy import process

# -------------------------------
# Step 1: Load Excel files
# -------------------------------

# Main table: one row per country, read from the first sheet of the workbook.
df = pd.read_excel(io="yf_malaria_for_tableau_countrymatch.xlsx")

# Reference table: the "Attribute Data" sheet of the VKP workbook.
vkp_df = pd.read_excel(
    io="countries_with_tiny_vkp_240324_0032/countries_with_tiny_vkp.xlsx",
    sheet_name="Attribute Data",
)

# Keep only the reference columns used by the later matching steps:
# the two name columns plus the code / region attributes.
vkp_columns = [
    "Country",
    "Country_Ta",
    "BRK_A3",
    "SOV_A3",
    "ADM0_A3",
    "ISO_A2",
    "VKP_A3",
    "SUBREGION",
    "CONTINENT",
]
vkp_df = vkp_df.loc[:, vkp_columns]

In [6]:
# -------------------------------
# Step 2: Clean country names
# -------------------------------

def clean_country_name(name):
    """Normalise a country name so that two spellings of it can be compared.

    Steps, in order:
      1. Missing values (None / NaN / NA) become the empty string.
      2. Non-string values are converted with ``str()``.
      3. The text is lower-cased.
      4. Parenthesised fragments such as ``"(U.S.)"`` are removed.
      5. A trailing ``", france"`` suffix is removed.
      6. Runs of whitespace are collapsed and the ends are stripped.
      7. A fixed set of mis-decoded accented spellings is repaired.

    Parameters
    ----------
    name : object
        Raw country name; usually a string, possibly missing.

    Returns
    -------
    str
        The cleaned, lower-case name (``""`` for missing input).
    """
    if pd.isna(name):
        return ""
    if not isinstance(name, str):
        name = str(name)
    name = name.lower()
    # Non-greedy so that each "(...)" group is removed separately.
    name = re.sub(r"\(.*?\)", "", name)
    # Removing a parenthetical can leave whitespace at the end of the string
    # (e.g. "x, france (note)" -> "x, france "); allow it before the anchor so
    # the suffix is still recognised.
    name = re.sub(r",\s*france\s*$", "", name)
    name = re.sub(r"\s+", " ", name).strip()
    # Repair known mis-decoded sequences (the input is already lower-case here).
    name = name.replace("c√¥te", "côte").replace("cura√ßao", "curaçao")
    name = name.replace("gal√°pagos", "galápagos").replace("r√©union", "réunion")
    name = name.replace("s√£o", "são").replace("t√ºrkiye", "türkiye")
    return name

# Cleaned comparison keys for the main table and for both reference name columns.
df["Country_clean"] = df["Country"].apply(clean_country_name)
vkp_df["Country_clean_ta"] = vkp_df["Country_Ta"].apply(clean_country_name)
vkp_df["Country_clean"] = vkp_df["Country"].apply(clean_country_name)

# Attribute columns carried along with every candidate name.
vkp_attr_columns = ["BRK_A3", "SOV_A3", "ADM0_A3", "ISO_A2", "VKP_A3", "SUBREGION", "CONTINENT"]

# Stack the primary and the alternate names into ONE "Country_clean" column.
# The alternate-name frame must be renamed first: concatenating frames whose
# name columns differ leaves "Country_clean" as NaN for every alternate-name
# row, so those names would never be available as match candidates.
vkp_candidates = pd.concat(
    [
        vkp_df[["Country_clean"] + vkp_attr_columns],
        vkp_df[["Country_clean_ta"] + vkp_attr_columns].rename(
            columns={"Country_clean_ta": "Country_clean"}
        ),
    ],
    ignore_index=True,
)

# Drop empty names (produced by missing values in the name columns), then keep
# the first row per name; primary names come first and therefore win ties.
vkp_candidates = vkp_candidates[vkp_candidates["Country_clean"] != ""]
vkp_candidates = vkp_candidates.drop_duplicates(subset=["Country_clean"])

# -------------------------------
# Step 3: Remove known incorrect
# -------------------------------

# Discard the Netherlands Antilles entry.
# The optional "s" accepts both the standard spelling ("netherlands antilles")
# and the "netherland antilles" variant; the plain substring
# "netherland antilles" does not occur inside "netherlands antilles", so a
# literal search for it would leave the correctly spelled row in place.
df = df[~df["Country_clean"].str.contains(r"netherlands? antilles", case=False, regex=True)]

# -------------------------------
# Step 4: Manual Overrides
# -------------------------------

# Pairs of (cleaned name in the main table, cleaned reference name to force).
# Keys are compared against the "Country_clean" values, so they must be written
# exactly as the cleaning step produces them.
manual_overrides = dict(
    [
        ("burma", "myanmar"),
        ("czechia", "czech republic"),
        ("congo, republic of the", "republic of the congo"),
        ("democratic republic of the congo", "democratic republic of the congo"),
        ("north korea", "dem. rep. korea"),
        ("south korea", "republic of korea"),
        ("falkland islands , uk overseas territory", "falkland islands / malvinas"),
    ]
)

# -------------------------------
# Step 5: Fuzzy Matching + Override
# -------------------------------

matched_vkp = []          # one dict of reference attributes per row of df
matched_scores = []       # fuzzy score per row (999 marks a manual override)
matched_names = []        # reference name chosen per row
overridden_matches = []   # (original country, forced reference name) pairs

vkp_fields = ["BRK_A3", "SOV_A3", "ADM0_A3", "ISO_A2", "VKP_A3", "SUBREGION", "CONTINENT"]

# Build the candidate lookup once instead of on every iteration.
# Rows without a name are dropped: a missing name cannot be looked up again
# after being chosen, and would make the row retrieval below fail.
candidate_rows = (
    vkp_candidates[vkp_candidates["Country_clean"].notna()]
    .drop_duplicates(subset=["Country_clean"])
    .set_index("Country_clean")
)
candidate_names = candidate_rows.index.tolist()
candidate_name_set = set(candidate_names)

for original_name, query_name in zip(df["Country"], df["Country_clean"]):
    overridden_name = manual_overrides.get(query_name)
    if overridden_name and overridden_name in candidate_name_set:
        best_match = overridden_name
        score = 999  # mark as manual override
        overridden_matches.append((original_name, best_match))
    else:
        # NOTE(review): no minimum score is enforced, so the best-scoring
        # candidate is accepted even when it is a weak match.
        best_match, score = process.extractOne(query_name, candidate_names)

    match_row = candidate_rows.loc[best_match]

    matched_vkp.append({field: match_row[field] for field in vkp_fields})
    matched_scores.append(score)
    matched_names.append(best_match)

# Combine results into DataFrame.
# matched_df is built on df's own index: df may have gaps in its index after
# the row filter in Step 3, and pd.concat(axis=1) aligns on index labels, so a
# fresh 0..n-1 index would attach attributes to the wrong rows and add
# all-NaN rows.
matched_df = pd.DataFrame(matched_vkp, columns=vkp_fields, index=df.index)
df["match_score"] = matched_scores
df["matched_country_name"] = matched_names
df = pd.concat([df, matched_df], axis=1)

# -------------------------------
# Step 6: Drop duplicates by VKP_A3
# -------------------------------

# Snapshot every row whose VKP_A3 value occurs more than once (all occurrences,
# including the one that will be kept) so it can be reported afterwards.
duplicates = df.loc[df.duplicated(subset=["VKP_A3"], keep=False)]

# Retain only the first row for each VKP_A3 value.
df = df[~df.duplicated(subset=["VKP_A3"], keep="first")]

# -------------------------------
# Step 7: Print override + dropped duplicate info
# -------------------------------

# Report which rows were matched through the manual override table.
print("\n✅ Manual Overrides Applied:")
if not overridden_matches:
    print("None.")
else:
    print("\n".join(f"- {source} → {target}" for source, target in overridden_matches))

# Report the rows that shared a VKP_A3 code, ordered by that code.
print("\n🔁 Duplicate VKP_A3 entries dropped:")
if duplicates.empty:
    print("None.")
else:
    report_columns = ["Country", "matched_country_name", "VKP_A3"]
    print(duplicates[report_columns].sort_values("VKP_A3").to_string(index=False))

# -------------------------------
# Step 8: Final result preview
# -------------------------------

# Show the first ten rows of the match result as plain text, without the index.
preview_columns = ["Country", "matched_country_name", "match_score", "VKP_A3"]
print("\n📦 Final Result (Top 10):")
print(df.loc[:, preview_columns].iloc[:10].to_string(index=False))


✅ Manual Overrides Applied:
- Burma (Myanmar) → myanmar
- Congo, Republic of the (Congo-Brazzaville) → republic of the congo
- Czechia (Czech Republic) → czech republic
- Democratic Republic of the Congo (Congo-Kinshasa) → democratic republic of the congo
- Falkland Islands (Islas Malvinas), UK Overseas Territory (also claimed by Argentina) → falkland islands / malvinas
- North Korea → dem. rep. korea
- South Korea → republic of korea

🔁 Duplicate VKP_A3 entries dropped:
                                                                   Country matched_country_name VKP_A3
                                                                   Georgia              georgia    GEO
South Georgia & the South Sandwich Islands (UK, also claimed by Argentina)              georgia    GEO

📦 Final Result (Top 10):
              Country matched_country_name  match_score VKP_A3
          Afghanistan          afghanistan          100    AFG
              Albania              albania          100    ALB

In [7]:
# Show the matched DataFrame as this cell's output.
df

Unnamed: 0,Country,YF_Recommendations,YF_Entry_Requirements,Malaria_Has_Transmission,Malaria_Areas_of_Risk,Malaria_Chloroquine_Resistance,Malaria_Species,Malaria_Recommended_Chemoprophylaxis,Other_Vaccines_To_Consider,Map_Links,...,Country_clean,match_score,matched_country_name,BRK_A3,SOV_A3,ADM0_A3,ISO_A2,VKP_A3,SUBREGION,CONTINENT
0,Afghanistan,Vaccine is not recommended.,Vaccine is not required.,True,"- All areas <2,500 m (<8,200 ft) elevation (Ap...",- Chloroquine,- P. vivax (primarily)\n- P. falciparum (less ...,"- Atovaquone-proguanil, doxycycline, mefloquin...",See Health Information for Travelers to Afghan...,Malaria prevention in Afghanistan,...,afghanistan,100,afghanistan,AFG,AFG,AFG,AF,AFG,Southern Asia,Asia
1,Albania,Vaccine is not recommended.,- Direct travel from United States: Vaccine is...,False,,,,,See Health Information for Travelers to Albania,,...,albania,100,albania,ALB,ALB,ALB,AL,ALB,Southern Europe,Europe
2,Algeria,Vaccine is not recommended.,- Direct travel from United States: Vaccine is...,False,,,,,See Health Information for Travelers to Algeria,,...,algeria,100,algeria,DZA,DZA,DZA,DZ,DZA,Northern Africa,Africa
3,American Samoa (U.S.),Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Americ...,,...,american samoa,100,american samoa,ASM,,,AS,ASM,Polynesia,Oceania
4,Andorra,Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Andorra,,...,andorra,100,andorra,,AND,AND,AD,AND,Southern Europe,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,"Virgin Islands, U.S.",Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Virgin...,,...,"virgin islands, u.s.",100,"virgin islands, u.s.",VIR,,,VI,VIR,Caribbean,North America
240,"Wake Island, U.S.",Vaccine is not recommended.,Vaccine is not required.,False,,,,,See Health Information for Travelers to Wake I...,,...,"wake island, u.s.",100,"wake island, u.s.",UMI,,,UM,UMI,Micronesia,Oceania
241,Yemen,Vaccine is not recommended.,Vaccine is not required.,True,"- All areas <2,000 m (<6,500 ft) elevation\n- ...",- Chloroquine,"- P. falciparum (primarily)\n- P. malariae , P...","- Atovaquone-proguanil, doxycycline, mefloquin...",See Health Information for Travelers to Yemen,Malaria prevention in Yemen,...,yemen,100,yemen,YEM,YEM,YEM,YE,YEM,Western Asia,Asia
242,Zambia,- Vaccine is generally not recommended for tra...,- Direct travel from United States: Vaccine is...,True,- All,- Chloroquine,"- P. falciparum (primarily)\n- P. malariae , P...","- Atovaquone-proguanil, doxycycline, mefloquin...",See Health Information for Travelers to Zambia,,...,zambia,100,zambia,ZMB,ZMB,ZMB,ZM,ZMB,Eastern Africa,Africa


In [8]:
# Write the matched table to a new workbook; the DataFrame index is not written.
df.to_excel(
    excel_writer="yf_malaria_for_tableau_countrymatch_vkp.xlsx",
    index=False,
)