In [1]:
import pandas as pd
import numpy as np
import os
import gc
import time
import re
import ast
import country_converter as coco
import time
import requests
import json
import googlemaps
from urllib.parse import quote_plus
from collections import Counter
from functools import lru_cache

# Manage API Keys
from dotenv import load_dotenv
load_dotenv()
api_key_GOOGLEMAPS = os.getenv("API_KEY_GoogleMaps_new")
user_agent_NOMINATIM = os.getenv("USER_AGENT_NOMINATIM")

import logging
coco_logger = logging.getLogger('country_converter.country_converter')
coco_logger.setLevel(logging.ERROR)

In [2]:
DF_input = input().strip()

      C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\parsedXMLs\


In [3]:
entities_database_input = input().strip()

    C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\allCountries_clean.csv


In [4]:
institutions_database_input = input().strip()

    C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\ScimagoIR 2025 - Overall Rank.csv


In [5]:
# Import databases
entities_database = pd.read_csv(entities_database_input)
institutions_database = pd.read_csv(institutions_database_input, sep=";")

# Remove " *" from institutions when present
institutions_database["Institution"] = institutions_database["Institution"].apply(lambda x: x[:-2] if x[-2:] == " *" else x)

In [6]:
# Import list of csvs to parse
list_csvs = []

for file in os.listdir(DF_input):
    if file[-4:] == ".csv":
        list_csvs.append(file)

n_csvs = len(list_csvs)

In [7]:
# Define a dictionary for US states. 
# Many article's affiliation have the state name/ code and not "United States"
us_states = [
    "Alabama", "AL", "Al",
    "Alaska", "AK", "Ak",
    "Arizona", "AZ", "Az",
    "Arkansas", "AR", "Ar",
    "California", "CA", "Ca",
    "Colorado", "CO", "Co",
    "Connecticut", "CT", "Ct",
    "Delaware", "DE", "De",
    "Florida", "FL", "Fl",
    "Georgia", "GA", "Ga",
    "Hawaii", "HI", "Hi",
    "Idaho", "ID", "Id",
    "Illinois", "IL", "Il",
    "Indiana", "IN", "In",
    "Iowa", "IA", "Ia",
    "Kansas", "KS", "Ks",
    "Kentucky", "KY", "Ky",
    "Louisiana", "LA", "La",
    "Maine", "ME", "Me",
    "Maryland", "MD", "Md",
    "Massachusetts", "MA", "Ma",
    "Michigan", "MI", "Mi",
    "Minnesota", "MN", "Mn",
    "Mississippi", "MS", "Ms",
    "Missouri", "MO", "Mo",
    "Montana", "MT", "Mt",
    "Nebraska", "NE", "Ne",
    "Nevada", "NV", "Nv",
    "New Hampshire", "NH", "Nh",
    "New Jersey", "NJ", "Nj",
    "New Mexico", "NM", "Nm",
    "New York", "NY", "Ny",
    "North Carolina", "NC", "Nc",
    "North Dakota", "ND", "Nd",
    "Ohio", "OH", "Oh",
    "Oklahoma", "OK", "Ok",
    "Oregon", "OR", "Or",
    "Pennsylvania", "PA", "Pa",
    "Rhode Island", "RI", "Ri",
    "South Carolina", "SC", "Sc",
    "South Dakota", "SD", "Sd",
    "Tennessee", "TN", "Tn",
    "Texas", "TX", "Tx",
    "Utah", "UT", "Ut",
    "Vermont", "VT", "Vt",
    "Virginia", "VA", "Va",
    "Washington", "WA", "Wa",
    "West Virginia", "WV", "Wv",
    "Wisconsin", "WI", "Wi",
    "Wyoming", "WY", "Wy",
    "District of Columbia", "DC", "Dc"
]

In [8]:
# Precompute lookup tables
institution_to_country = dict(zip(institutions_database["Institution"], institutions_database["Country"]))
entity_to_country = dict(zip(entities_database["name"], entities_database["country code"]))
all_institutions_set = set(institution_to_country.keys())

# Instance conerter
cc = coco.CountryConverter()

# MUST include contact info. Example format: 'Application_name/1.0 (yourEmail@provider.com)'
headers = {
    'User-Agent': user_agent_NOMINATIM  
}

gmaps = googlemaps.Client(key=api_key_GOOGLEMAPS)

# Compiled regex list if regex matching is required
compiled_institution_patterns = [(inst, re.compile(re.escape(inst))) for inst in institution_to_country]

@lru_cache(maxsize=1000)
def get_country_from_text(text):
    """Fast country name conversion with cache."""
    try:
        result = cc.convert(names=text, to='name_short')
        return result if result != "not found" else None
    except:
        return None

def extract_country_with_states(cell_content):
    if pd.isna(cell_content):
        return None, None

    try:
        parsed_content = ast.literal_eval(cell_content)
        parsed_content_org = [org for org in parsed_content if org["label"] == "ORG"]
    except:
        return None, None

    if not isinstance(parsed_content, list):
        return None, None

    countries = []
    text_blob = []

    for el in parsed_content:
        if isinstance(el, dict):
            text = el.get("text", "").strip(".") # Remove points as trailing spaces afterbefore texts
            label = el.get("label", "")

            text_blob.append(text)

            if text in us_states:
                return "United States", "State_in_US"

            if label in {"LOC", "GPE"}:
                country = get_country_from_text(text)
                if isinstance(country, str):
                    countries.append(country)

    if len(countries) == 1:
        return countries[0], "Direct_country"
    elif len(countries) > 1:
        # Return most common
        try:
            return Counter(countries).most_common(1)[0][0], "Most_common_list_countries"
        except:
            with open(DF_input + "logs errors Most common list countries.txt", "a") as f:
                f.write(str(cell_content) + ", " + str(countries) + "\n")
                
        
    # No matches from LOC/GPE, try ORG matching
    entity_text = " ".join(text_blob)
    for el in reversed(parsed_content):
        if el.get("label") == "ORG":
            name = el["text"].strip(".")

            # Direct match
            if name in institution_to_country:
                return get_country_from_text(institution_to_country[name]), "Direct_institution"

            # Fallback regex search
            matches = [inst for inst, pattern in compiled_institution_patterns if pattern.search(name)]
            if len(matches) == 1:
                return get_country_from_text(institution_to_country[matches[0]]), "Regex_institution"

    # Try entity database
    for el in reversed(parsed_content):
        name = el.get("text", "").strip(".")
        if name in entity_to_country:
            country_code = entity_to_country[name]
            country = get_country_from_text(country_code)
            if country and re.search(r'\b' + re.escape(country) + r'\b', entity_text):
                return country, "Entity_database"

    # If all else fails, try calls to location APIs
    # Some calls to Nominatim and Google Maps will be done by passing all text collected by NER. This needs to be prepared
    # Merge all texts into one string, in case needed for passing to code below
    string_parts = [el["text"] for el in parsed_content if isinstance(el, dict) and "text" in el]
    string = ", ".join(string_parts) + ", " if string_parts else ""
    all_text = string[:-2] # Remove last " ,"
    
    # Prepare for submitting to Nominatim
    all_text_submit = quote_plus(all_text)
    
    
    # Other calls will use just the last ORG-labelled text (corresponding to the Institution). This needs to be prepared
    if len(parsed_content_org) > 0:
        last_org = parsed_content_org[-1]
    else:
        last_org = ""

    if isinstance(last_org, str):
        last_org_submit = quote_plus(last_org)
    else:
        last_org_submit = ""
    
    # Call Nominatim with the institution only (last ORG colledted by NER)
    result = requests.get("https://nominatim.openstreetmap.org/search?q="+last_org_submit+"&format=json&addressdetails=1", headers=headers)
    if result.status_code == 200:
        content = json.loads(result.content)
    
        # If Nominatim retrieves only one item, return its corresponding country
        if len(content) == 1:
            candidate = content[0]["address"]["country_code"]
            return get_country_from_text(candidate), "Nominatim"
    
        # Otherwise, try with the whole affiliation in Nominatim
        else:
            time.sleep(1)
            result_all = requests.get(
                "https://nominatim.openstreetmap.org/search?q=" + all_text_submit + "&format=json&addressdetails=1", 
                headers=headers
            )
            try:
                if result_all.content:
                    content_all = result_all.json()
                else:
                    content_all = []
            except json.JSONDecodeError:
                content_all = []
    
            # Again, if Nominatim retrieves only one item, return its corresponding country
            if len(content_all) == 1:
                if "address" in content_all[0].keys() and "country_code" in content_all[0]["address"]:
                    candidate = content_all[0]["address"]["country_code"]
                    return get_country_from_text(candidate), "Nominatim"
    
            # Otherwise, try with Google Maps
            else:
                try:
                    # First, try with the whole affiliation
                    geocode_result = gmaps.geocode(all_text)
                    if geocode_result:
                        candidate = [component["short_name"] for component in geocode_result[0]["address_components"] if "country" in component["types"]]
                        # If Google Maps call retrieves only one item, return its associated country 
                        if len(candidate) == 1:
                            return get_country_from_text(candidate[0]), "Google Maps"
                    
                    # Otherwise, try Google Maps only with the last ORG item of the affiliation (institution)
                    else:
                        try:
                            geocode_result = gmapsgeocode(text)
                            if geocode_result:
                                candidate = [component["short_name"] for component in geocode_result[0]["address_components"] if "country" in component["types"]]
                                # Again, if Google Maps call retrieves only one item, return its associated country 
                                if len(candidate) == 1:
                                    return get_country_from_text(candidate[0]), "Google Maps"
    
                # If all fails, return None
                            else:
                                return None, None
                        except:
                            return None, None
                except:
                    return None, None
    
    return None, None

In [9]:
parsed_csvs = []
with open(DF_input + "csv files with Country.txt", "r") as f:
    for csv in f:
        parsed_csvs.append(csv[:-1])

parsed_csvs

['parsedX_100000.csv',
 'parsedX_1000000.csv',
 'parsedX_1100000.csv',
 'parsedX_1200000.csv',
 'parsedX_1300000.csv',
 'parsedX_1400000.csv',
 'parsedX_1500000.csv',
 'parsedX_1600000.csv',
 'parsedX_1700000.csv',
 'parsedX_1800000.csv',
 'parsedX_1900000.csv']

In [10]:
for csv in list_csvs:
    if csv not in parsed_csvs:
        df = pd.read_csv(DF_input + csv)
        start = time.time()
        print("Parsing csv: " + csv + " (" + str(list_csvs.index(csv) + 1) + "/" + str(n_csvs) + ")")
        df[["Country", "Country_source"]] = df["NER_lastAuthor"].apply(extract_country_with_states).apply(pd.Series)
        df.to_csv(DF_input + csv, index=False)
        with open(DF_input + "csv files with Country.txt", "a") as f:
            f.write(csv+"\n")
        print("--Parsing time: " + str(round(time.time()-start, 2)))
        del df

ERROR! Session/line number was not unique in database. History logging moved to new session 377
Parsing csv: parsedX_200000.csv (12/45)
--Parsing time: 53023.31
Parsing csv: parsedX_2000000.csv (13/45)
--Parsing time: 8555.63
Parsing csv: parsedX_2100000.csv (14/45)
--Parsing time: 8301.66


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_2200000.csv (15/45)
--Parsing time: 8193.36
Parsing csv: parsedX_2300000.csv (16/45)
--Parsing time: 12025.12


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_2400000.csv (17/45)
--Parsing time: 11053.77


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_2500000.csv (18/45)
--Parsing time: 11310.87


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_2600000.csv (19/45)
--Parsing time: 10049.79


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_2700000.csv (20/45)
--Parsing time: 10061.57


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_2800000.csv (21/45)
--Parsing time: 10255.66
Parsing csv: parsedX_2900000.csv (22/45)
--Parsing time: 10300.86
Parsing csv: parsedX_300000.csv (23/45)
--Parsing time: 34585.96


  df = pd.read_csv(DF_input + csv)


Parsing csv: parsedX_3000000.csv (24/45)


KeyboardInterrupt: 