In [1]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import sys
import time

from flashgeotext.geotext import GeoText
from geopy.geocoders import Nominatim

In [2]:
articles = pd.read_csv('data/inclusions.csv', index_col=0)

In [3]:
articles.authors = articles.authors.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.author_affils = articles.author_affils.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.keywords = articles.keywords.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.references_pmids = articles.references_pmids.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.mesh_terms = articles.mesh_terms.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)

In [4]:
articles['affil_countries'] = np.nan
articles['affil_countries'] = articles['affil_countries'].astype(object)
articles['affil_countries_unique'] = np.nan
articles['affil_countries_unique'] = articles['affil_countries'].astype(object)
articles['affil_first_country'] = np.nan
articles['affil_last_country'] = np.nan

In [4]:
def find_affil_countries(affils: list, retry_count = 5):
    
    geolocator = Nominatim(user_agent='health_ai_scraper')
    geotext = GeoText()
    
    if affils == affils: # Check to make sure not NaN
        country_list = []
        location = None
        
        last_affil = None
        last_country = None
        
        try_count = 0
    
        for affil in affils:
            if affil == last_affil: # Check to see if we've seen this before and take a shortcut if we have
                country_list = country_list + last_country

            else:
                while try_count < retry_count:
                    try:
                        last_affil = affil # Set that we've examined this affil
                
                        countries = [*geotext.extract(input_text=affil, span_info=True)['countries'].keys()] # Look for countries
                
                        if (len(countries) == 0): # If we dont find a country look harder
                            cities = [*geotext.extract(input_text=affil, span_info=True)['cities'].keys()]
                            if len(cities) > 0:
                                location = geolocator.geocode(cities[-1])
                            else:
                                location = geolocator.geocode(' '.join(affil.split(" ")[-2:]))
                                if location == None:
                                    location = geolocator.geocode(affil)
                                    if location == None:
                                        tqdm.write("Can't find a country for:")
                                        tqdm.write(affil)
                                        country_list = country_list + [np.nan]
                                        last_country = [np.nan]
                            
                        else: # If we do find a country then att it to the list and set the last_country variable
                            country_list = country_list + countries
                            last_country = countries
        
                        if location != None: # If we found an address using the other search techniques
                            countries = [*geotext.extract(input_text=location.address, span_info=True)['countries'].keys()]
                            country_list = country_list + countries
                            last_country = countries
                            
                        break
                            
                    except:
                        try_count += 1
                        tqdm.write(f"Error parsing {affil}, trying again for a maximum of 5 times.")
                        
    else: # If the affil is NaN then make the country list NaN
        country_list = [np.nan]
        
    unique_countries = list(set(country_list))
    first_affil_country = country_list[0]
    last_affil_country = country_list[-1]    
        
    return country_list, unique_countries, first_affil_country, last_affil_country

In [5]:
def parse_affil_countries(df, max_consecutive_failures = 5, filter_column = 'include'):
    
    consecutive_failures = 0
    
    country_df = df.copy()
    
    country_df['affil_countries'] = np.nan
    country_df['affil_countries_unique'] = np.nan
    country_df['affil_countries_first'] = np.nan
    country_df['affil_countries_last'] = np.nan
    
    with tqdm(total=country_df.shape[0], file=sys.stdout) as pbar:
        for row in country_df[country_df[filter_column] == 1].itertuples():

            try:
                affils = row.author_affils
    
                country_list, unique_countries, first_affil_country, last_affil_country = find_affil_countries(affils)
    
                country_df.loc[row.Index, 'affil_countries'] = str(country_list)
                country_df.loc[row.Index, 'affil_countries_unique'] = str(list(set(country_list)))
                country_df.loc[row.Index, 'affil_countries_first'] = country_list[0]
                country_df.loc[row.Index, 'affil_countries_last'] = country_list[-1]
            
                consecutive_failures = 0
        
            except Exception as e:
                tqdm.write(e)
                consecutive_failures += 1
                if consecutive_failures >=  max_consecutive_failures:
                    tqdm.write("Failed too many in a row, something is broken, stopping and returning possibly partially labelled DF...")
                    break
                
            pbar.update(1)
            
    country_df.replace("[nan]", np.nan, inplace=True)
            
    return country_df

In [None]:
consecutive_failures = 0

with tqdm(total=articles.shape[0], file=sys.stdout) as pbar:
    for row in articles.itertuples():

        try:
            affils = row.author_affils
    
            country_list, unique_countries, first_affil_country, last_affil_country = find_affil_countries(affils)
    
            articles.loc[row.Index, 'affil_countries'] = str(country_list)
            articles.loc[row.Index, 'affil_countries_unique'] = str(list(set(country_list)))
            articles.loc[row.Index, 'affil_first_country'] = country_list[0]
            articles.loc[row.Index, 'affil_last_country'] = country_list[-1]
            
            consecutive_failures = 0
        
        except:
            consecutive_failures += 1
            if consecutive_failures > 5:
                print("Failed too many in a row, something is broken, stopping...")
                break
            time.sleep(1)
                
        pbar.update(1)

Can't find a country for:                                                                                              
[                                                                                                                      
Can't find a country for:                                                                                              
'                                                                                                                      
Can't find a country for:                                                                                              
                                                                                                                       
Can't find a country for:                                                                                              
                                                                                                                       
Can't find a country for:               

In [None]:
articles.tail(25)

In [14]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 34181 entries, 1.0 to 172538.0
Data columns (total 81 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pmid                    34179 non-null  float64
 1   doi                     29547 non-null  object 
 2   title                   34178 non-null  object 
 3   abstract                34179 non-null  object 
 4   article_date            25954 non-null  object 
 5   pubmed_date             34179 non-null  object 
 6   article_type            34179 non-null  object 
 7   lang                    34179 non-null  object 
 8   journal                 34179 non-null  object 
 9   journal_short           34179 non-null  object 
 10  journal_country         34179 non-null  object 
 11  authors                 33377 non-null  object 
 12  author_affils           24189 non-null  object 
 13  keywords                18861 non-null  object 
 14  mesh_terms              25518 n

## CLEAN

In [4]:
articles['affil_fill_country'] = articles['affil_first_country']

In [5]:
## using first affiliation as primary country
## fill first with data from last author, then with pubmed country metadata
articles['affil_fill_country'] = articles['affil_fill_country'].fillna(articles['affil_last_country'])
articles['affil_fill_country'] = articles['affil_fill_country'].fillna(articles['journal_country'])

articles['affil_fill_country'] = articles['affil_fill_country'].astype('string')

In [6]:
##clean countries
articles["affil_fill_country"].replace({"England": "United Kingdom", 
                                             "Wales": "United Kingdom", 
                                             "Scotland": "United Kingdom", 
                                             "China (Republic : 1949- )" : "Taiwan"}, inplace=True)

articles['affil_fill_country'].value_counts()

United States          12415
China                   5305
United Kingdom          2482
South Korea             1327
Germany                 1273
                       ...  
Azerbaijan                 1
Trinidad and Tobago        1
Kazakhstan                 1
Costa Rica                 1
Palestine                  1
Name: affil_fill_country, Length: 91, dtype: int64

In [13]:
##lowercasing list of ANY author
articles['countries_lc'] = articles['affil_countries_unique'].str.lower().fillna(np.nan)

#articles['countries_lc'] = articles['countries_lc'].fillna(articles['affil_fill_country']).str.lower().astype('string')

In [15]:
articles['countries_lc'].replace(np.nan, '', inplace=True)

In [16]:
lmic_list = ["afghanistan", "burundi", "burkina faso", "central african republic", "congo", "eritrea", 
             "ethiopia", "guinea", "gambia", "guinea-bissau", "liberia", "madagascar", "mali", "mozambique", "malawi", 
             "niger", "north korea", "democratic republic of korea", "rwanda", "sudan", "sierra leone", "somalia", "south sudan", "syrian arab republic", 
             "chad", "togo", "uganda", "yemen", "angola", "benin", "bangladesh", "belize", "bolivia", "bhutan", 
             "cote d'ivoire", "ivory coast", "cameroon", "congo", "comoros", "cabo verde", "djibouti", "algeria", "egypt", 
             "micronesia", "ghana", "honduras", "haiti", "indonesia", "india", "iran", "kenya", 
             "kyrgyz republic", "cambodia", "kiribati", "lao", "sri lanka", "lesotho", "morocco", "myanmar", "mongolia", 
             "mauritania", "nigeria", "nicaragua", "nepal", "pakistan", "philippines", "papua new guinea", 
             "west bank and gaza", "palestinbe", "senegal", "solomon islands", "el salvador", "sao tome", "eswatini", 
             "tajikistan", "timor-leste", "tunisia", "tanzania", "ukraine", "uzbekistan", "vietnam", "vanuatu", "samoa", 
             "zambia", "zimbabwe", "albania", "argentina", "armenia", "american samoa", "azerbaijan", "bulgaria", 
             "bosnia", "belarus", "brazil", "botswana", "china", "colombia", "costa rica", "cuba", 
             "dominica", "dominican republic", "ecuador", "fiji", "gabon", "georgia", "equatorial guinea", "grenada", 
             "guatemala", "guyana", "iraq", "jamaica", "jordan", "kazakhstan", "lebanon", "libya", "lucia", "moldova", 
             "maldives", "mexico", "marshall islands", "north macedonia", "montenegro", "mauritius", "malaysia", "namibia", 
             "panama", "peru", "paraguay", "romania", "russian federation", "russia", "serbia", "suriname", "thailand", "turkmenistan", 
             "tonga", "turkey", "tuvalu", "st. vincent", "grenadines", "kosovo", "south africa", "venezuela"]

In [17]:
lmic_lower_list = ["afghanistan", "burundi", "burkina faso", "central african republic", "congo", "eritrea", 
             "ethiopia", "guinea", "gambia", "guinea-bissau", "liberia", "madagascar", "mali", "mozambique", "malawi", 
             "niger", "north korea", "democratic republic of korea", "rwanda", "sudan", "sierra leone", "somalia", "south sudan", "syrian arab republic", 
             "chad", "togo", "uganda", "yemen", "angola", "benin", "bangladesh", "belize", "bolivia", "bhutan", 
             "cote d'ivoire", "ivory coast", "cameroon", "congo", "comoros", "cabo verde", "djibouti", "algeria", "egypt", 
             "micronesia", "ghana", "honduras", "haiti", "indonesia", "india", "iran", "kenya", 
             "kyrgyz republic", "cambodia", "kiribati", "lao", "sri lanka", "lesotho", "morocco", "myanmar", "mongolia", 
             "mauritania", "nigeria", "nicaragua", "nepal", "pakistan", "philippines", "papua new guinea", 
             "west bank and gaza", "palestinbe", "senegal", "solomon islands", "el salvador", "sao tome", "eswatini", 
             "tajikistan", "timor-leste", "tunisia", "tanzania", "ukraine", "uzbekistan", "vietnam", "vanuatu", "samoa", 
             "zambia", "zimbabwe"]

In [18]:
##flags for ANY author

#initiate
articles['lmic_author_flag'] = np.where(articles['countries_lc'].str.contains('iran'), "1", "0")
articles['lmic_author_lower_flag'] = np.where(articles['countries_lc'].str.contains('iran'), "1", "0")
articles['lmic_china_flag'] = np.where(articles['countries_lc'].str.contains('china'), "1", "0")

In [20]:
#use lists
for x in lmic_list:
    articles['lmic_author_flag'] = np.where(articles['countries_lc'].str.contains(x), "1", articles['lmic_author_flag'])
    
for y in lmic_lower_list:
    articles['lmic_author_lower_flag'] = np.where(articles['countries_lc'].str.contains(x), "1", articles['lmic_author_lower_flag'])

In [21]:
articles['pubmed_date'] = pd.to_datetime(articles['pubmed_date'])

In [22]:
## new column for year, and year+month
articles['year'] = articles['pubmed_date'].dt.year

In [23]:
articles.to_csv('data/final_raw.csv')