In [1]:
import pandas as pd
import os

# Motivation
One of the features of this project is the analysis of cancer studies based on the country where the institution publishing the study is located. This location is extracted from the affiliation of the last author (or, if this is empty, from the affiliation of any author, if available).  

After extracting the affiliation, named entity recognition (NER) via Spacy (see *extractInstitutionLocationNER*) is performed. After that, *country-converter* is used to extract and harmonize country names. Whenever regional/cities names but no country name are found by NER library, an alternative approach involving matching these regional/cities names to their corresponding country names must be followed.

To do so, a database of named regional/cities entities and their corresponding countries can be used. Here, the **[GeoNames](https://www.geonames.org)** database is used. Upon downloading the *allCountries.txt* file from the webpage, the file is imported and only the regions/cities names are kept. The output is saved and later used in the *extractCountryFromNER* file to extract the country names corresponding to the sub-country level entities found by the NER library. 

*In this project, the allCountries.txt file was downloaded on 2025-June-04*

In [2]:
all_countries_input = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\allCountries.txt


In [3]:
sub_country_entities_output = input().strip()

 C:\Users\svalb\OneDrive\Escritorio\Data_40_years_cancer_studies\resources\allCountries_clean.csv


In [7]:
names_columns = ["geonameid", "name", "asciiname", "alternatenames", "latitude", "longitude", 
                 "feature class", "feature code", "country code", "cc2", "admin1 code", "admin2 code",
                 "admin3 code", "admin4 code", "population", "elevation", "dem", "timezone", 
                 "modification date"]
all_countries = pd.read_csv(all_countries_input, names= names_columns, sep="\t")

  all_countries = pd.read_csv(all_countries_input, names= names_columns, sep="\t")


In [8]:
all_countries.head()

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
0,2994701,Roc Meler,Roc Meler,"Roc Mele,Roc Meler,Roc Mélé",42.58765,1.7418,T,PK,AD,"AD,FR",02,,,,0,2811.0,2348,Europe/Andorra,2023-10-03
1,3017832,Pic de les Abelletes,Pic de les Abelletes,"Pic de la Font-Negre,Pic de la Font-Nègre,Pic ...",42.52535,1.73343,T,PK,AD,FR,A9,66.0,663.0,66146.0,0,,2411,Europe/Andorra,2014-11-05
2,3017833,Estany de les Abelletes,Estany de les Abelletes,"Estany de les Abelletes,Etang de Font-Negre,Ét...",42.52915,1.73362,H,LK,AD,FR,A9,,,,0,,2260,Europe/Andorra,2014-11-05
3,3023203,Port Vieux de la Coume d’Ose,Port Vieux de la Coume d'Ose,"Port Vieux de Coume d'Ose,Port Vieux de Coume ...",42.62568,1.61823,T,PASS,AD,,00,,,,0,,2687,Europe/Andorra,2014-11-05
4,3029315,Port de la Cabanette,Port de la Cabanette,"Port de la Cabanette,Porteille de la Cabanette",42.6,1.73333,T,PASS,AD,"AD,FR",B3,9.0,91.0,9139.0,0,,2379,Europe/Andorra,2014-11-05


In [9]:
# Original number of rows
len(all_countries)

13199829

In [14]:
# Keep only the rows corresponding to sub-country level political entities (regions, cities, states) as in the features class field
# See GeoNames documentation for more information about feature classes
all_countries_clean = all_countries.loc[all_countries["feature class"].isin(["A", "P"])]

In [15]:
all_countries_clean.head()

Unnamed: 0,geonameid,name,asciiname,alternatenames,latitude,longitude,feature class,feature code,country code,cc2,admin1 code,admin2 code,admin3 code,admin4 code,population,elevation,dem,timezone,modification date
8,3038816,Xixerella,Xixerella,,42.55327,1.48736,P,PPL,AD,,4,,,,0,,1417,Europe/Andorra,2009-04-24
23,3038832,Vila,Vila,"Casas Vila,Vila",42.53176,1.56654,P,PPL,AD,,3,,,,1418,,1318,Europe/Andorra,2024-11-04
90,3038899,Tossalet i Vinyals1,Tossalet i Vinyals1,"Tossalet i Vin'jal's,Tossalet i Vinyals,Тоссал...",42.48597,1.4891,P,PPLL,AD,,7,,,,0,,1015,Europe/Andorra,2025-03-13
178,3038987,Sornàs,Sornas,"Sornas,Sornàs",42.56461,1.52757,P,PPL,AD,,5,,,,0,,1328,Europe/Andorra,2014-11-05
190,3038999,Soldeu,Soldeu,"Sol'deu,Soldeu,surudeu,swldw,Сольдеу,סולדאו,سو...",42.57688,1.66769,P,PPL,AD,,2,,,,602,,1832,Europe/Andorra,2017-11-06


In [16]:
# 5.65M rows are kept)
len(all_countries_clean)

5651559

In [None]:
all_countries_clean.to_csv(sub_country_entities_output, index=False)