In [5]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import sys
import time

from flashgeotext.geotext import GeoText
from geopy.geocoders import Nominatim

In [6]:
articles = pd.read_csv('data/inclusions.csv', index_col=0)

In [7]:
articles.authors = articles.authors.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.author_affils = articles.author_affils.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.keywords = articles.keywords.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.references_pmids = articles.references_pmids.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.mesh_terms = articles.mesh_terms.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)

In [8]:
articles['affil_countries'] = np.nan
articles['affil_countries'] = articles['affil_countries'].astype(object)
articles['affil_countries_unique'] = np.nan
articles['affil_countries_unique'] = articles['affil_countries'].astype(object)
articles['affil_first_country'] = np.nan
articles['affil_last_country'] = np.nan

In [9]:
def find_affil_countries(affils: list, retry_count = 5):
    
    geolocator = Nominatim(user_agent='health_ai_scraper')
    geotext = GeoText()
    
    if affils == affils: # Check to make sure not NaN
        country_list = []
        location = None
        
        last_affil = None
        last_country = None
        
        try_count = 0
    
        for affil in affils:
            if affil == last_affil: # Check to see if we've seen this before and take a shortcut if we have
                country_list = country_list + last_country

            else:
                while try_count < retry_count:
                    try:
                        last_affil = affil # Set that we've examined this affil
                
                        countries = [*geotext.extract(input_text=affil, span_info=True)['cities'].keys()] # Look for countries
                
                        if (len(countries) == 0): # If we dont find a country look harder
                            cities = [*geotext.extract(input_text=affil, span_info=True)['cities'].keys()]
                            if len(cities) > 0:
                                location = geolocator.geocode(cities[-1])
                            else:
                                location = geolocator.geocode(' '.join(affil.split(" ")[-2:]))
                                if location == None:
                                    location = geolocator.geocode(affil)
                                    if location == None:
                                        tqdm.write("Can't find a country for:")
                                        tqdm.write(affil)
                                        country_list = country_list + [np.nan]
                                        last_country = [np.nan]
                            
                        else: # If we do find a country then att it to the list and set the last_country variable
                            country_list = country_list + countries
                            last_country = countries
        
                        if location != None: # If we found an address using the other search techniques
                            countries = [*geotext.extract(input_text=location.address, span_info=True)['countries'].keys()]
                            country_list = country_list + countries
                            last_country = countries
                            
                        break
                            
                    except:
                        try_count += 1
                        tqdm.write(f"Error parsing {affil}, trying again for a maximum of 5 times.")
                        
    else: # If the affil is NaN then make the country list NaN
        country_list = [np.nan]
        
    unique_countries = list(set(country_list))
    first_affil_country = country_list[0]
    last_affil_country = country_list[-1]    
        
    return country_list, unique_countries, first_affil_country, last_affil_country

In [10]:
def parse_affil_countries(df, max_consecutive_failures = 5, filter_column = 'include'):
    
    consecutive_failures = 0
    
    country_df = df.copy()
    
    country_df['affil_countries'] = np.nan
    country_df['affil_countries_unique'] = np.nan
    country_df['affil_countries_first'] = np.nan
    country_df['affil_countries_last'] = np.nan
    
    with tqdm(total=country_df.shape[0], file=sys.stdout) as pbar:
        for row in country_df[country_df[filter_column] == 1].itertuples():

            try:
                affils = row.author_affils
    
                country_list, unique_countries, first_affil_country, last_affil_country = find_affil_countries(affils)
    
                country_df.loc[row.Index, 'affil_countries'] = str(country_list)
                country_df.loc[row.Index, 'affil_countries_unique'] = str(list(set(country_list)))
                country_df.loc[row.Index, 'affil_countries_first'] = country_list[0]
                country_df.loc[row.Index, 'affil_countries_last'] = country_list[-1]
            
                consecutive_failures = 0
        
            except Exception as e:
                tqdm.write(e)
                consecutive_failures += 1
                if consecutive_failures >=  max_consecutive_failures:
                    tqdm.write("Failed too many in a row, something is broken, stopping and returning possibly partially labelled DF...")
                    break
                
            pbar.update(1)
            
    country_df.replace("[nan]", np.nan, inplace=True)
            
    return country_df

In [11]:
consecutive_failures = 0

with tqdm(total=articles.shape[0], file=sys.stdout) as pbar:
    for row in articles.itertuples():

        try:
            affils = row.author_affils
    
            country_list, unique_countries, first_affil_country, last_affil_country = find_affil_countries(affils)
    
            articles.loc[row.Index, 'affil_countries'] = str(country_list)
            articles.loc[row.Index, 'affil_countries_unique'] = str(list(set(country_list)))
            articles.loc[row.Index, 'affil_first_country'] = country_list[0]
            articles.loc[row.Index, 'affil_last_country'] = country_list[-1]
            
            consecutive_failures = 0
        
        except:
            consecutive_failures += 1
            if consecutive_failures > 5:
                print("Failed too many in a row, something is broken, stopping...")
                break
            time.sleep(1)
                
        pbar.update(1)

Can't find a country for:                                                                                              
West Hertfordshire Hospitals NHS Trust.                                                                                
Can't find a country for:                                                                                              
Institute for Bio-Economy and Agri-Technology, Center for Research and Technology Hellas, 38333, Greece. Electronic address: d.tsaopoulos@certh.gr.
Can't find a country for:                                                                                              
DERMiSCAN, Brazil. Electronic address: renata.bitar@dermiscan.ch.                                                      
Can't find a country for:                                                                                              
Biodesix, United States. Electronic address: thomas.campbell@biodesix.com.                                             
Can't find a

UCLA-Olive View Internal Medicine Residency Program, Sylmar, CA, USA.                                                  
Can't find a country for:                                                                                              
Department of Biomedical EngineeringCase Western Reserve UniversityClevelandOH44106USA.                                
Can't find a country for:                                                                                              
The Tony and Leona Campane Center for Excellence in Image-Guided Surgery and Advancing Imaging ResearchCleveland Clinic Cole Eye InstituteClevelandOH44106USA.
Can't find a country for:                                                                                              
Department of Biomedical EngineeringCase Western Reserve UniversityClevelandOH44106USA.                                
Can't find a country for:                                                                                              
T

Can't find a country for:                                                                                              
Department of Pathology, 3D Medicines Inc.                                                                             
Can't find a country for:                                                                                              
Department of Data Systems, 3D Medicines Inc.                                                                          
Can't find a country for:                                                                                              
Departments of Psychiatry and of Child Study, Yale School of Medicine, United States of America. Electronic address: sarah.yip@yale.edu.
Can't find a country for:                                                                                              
Department of Diagnostic Radiology, Osaka City General Hospital, 2-13-22 Miyakojima-hondori, Miyakojima-ku, Osaka, 534-0021, Japan. o-kuma@msic

National Clinical Research Center for Child Health, China. Electronic address: shuqiang@zju.edu.cn.                    
Can't find a country for:                                                                                              
Department of Neurology, Penn State Hershey Medical Center, Hershey, PA, United States; Department of Radiology, Penn State Hershey Medical Center, Hershey, PA, United States. Electronic address: guangweidu@pennstatehealth.psu.edu.
Can't find a country for:                                                                                              
Department of Electronic Science and TechnologyUniversity of Science and Technology of ChinaHefei230027China.          
Can't find a country for:                                                                                              
Department of ElectrocardiogramThe First Affiliated Hospital of USTC, Division of Life Sciences and MedicineUniversity of Science and Technology of ChinaHefei230001Chin

Faculdade de Ciências Exatas e da Engenharia, Universidade da Madeira, Portugal; ITI/Larsys/Madeira Interactive Technologies Institute, Portugal. Electronic address: morgado@uma.pt.
Can't find a country for:                                                                                              
Division of Digital Health Sciences, Mayo Clinic, MN, USA; Department of Health Sciences Research, Mayo Clinic, MN, USA. Electronic address: Wang.Yanshan@mayo.edu.
Can't find a country for:                                                                                              
Division of Clinical Infectious Diseases, Research Center Borstel, Borstel, Germany jheyckendorf@fz-borstel.de.        
Can't find a country for:                                                                                              
German Center for Infection Research (DZIF), Germany.                                                                  
Can't find a country for:                             

Can't find a country for:                                                                                              
Comprehensive Breast Cancer Center, Changhua Christian Hospital, No. 135, NanXiao Street, Changhua, Taiwan. Electronic address: darren.chen@cch.org.tw.
Can't find a country for:                                                                                              
Institute of Bioelectronic Medicine, Feinstein Institutes for Medical Research, Northwell Health, Manhasset, NY, USA. tzanos@northwell.edu.
Can't find a country for:                                                                                              
Department of Radiology, Shamir Medical Center, Zerifin.                                                               
Can't find a country for:                                                                                              
Department of Radiology, Shamir Medical Center, Zerifin.                                                    

School of Biomedical Informatics, UTHealth, United States. Electronic address: xiaoqian.jiang@uth.tmc.edu.             
Can't find a country for:                                                                                              
Department of Pathology, China-Japan Friendship Hospital, China. Electronic address: 748803069@qq.com.                 
Can't find a country for:                                                                                              
Department of Statistics and Actuarial Sciences.                                                                       
Can't find a country for:                                                                                              
Gnosis Data Analysis PC, Science and Technology Park of Crete, N. Plastira 100, GR-700 13 Vassilika Vouton, Greece.    
Can't find a country for:                                                                                              
Chest Hospital of Xinjiang Uyghur Autono

School of Computing and Electrical EngineeringIndian Institute of Technology MandiMandi175005India.                    
Can't find a country for:                                                                                              
Genome Profiling, LLC.                                                                                                 
Can't find a country for:                                                                                              
Department of Neurology, The Second Medical Centre, National Clinical Research Centre for Geriatric Diseases, Chinese PLA General Hospital, China. Electronic address: zhangxi@301hospital.com.cn.
Can't find a country for:                                                                                              
Hematology department of Shanxi cancer hospital, China. Electronic address: zqzhao69@163.com.                          
Can't find a country for:                                                            

2MIST E-R40129BolognaItaly.                                                                                            
Can't find a country for:                                                                                              
1Department of EngineeringFerrara University44122FerraraItaly.                                                         
Can't find a country for:                                                                                              
Ophthalmology, OHSU.                                                                                                   
Can't find a country for:                                                                                              
Ophthalmology, OHSU.                                                                                                   
Can't find a country for:                                                                                              
Department of Health Outcomes and Biomed

Department of Radiotherapy, First Medical Center of PLA General Hospital, BeiJing 100853, P.R.China.                   
Can't find a country for:                                                                                              
1Weldon School of Biomedical EngineeringPurdue UniversityWest LafayetteIN47907USA.                                     
Can't find a country for:                                                                                              
Fethi Sekin Hospital, Department of Biochemistry, Elazig, Turkey. Electronic address: hakan.ayyildiz1@saglik.gov.tr.   
Can't find a country for:                                                                                              
Department of Otolaryngology, Head and Neck Surgery, Chuo-city, Japan. ishiih@yamanashi.ac.jp.                         
Can't find a country for:                                                                                              
IBM T.J. Watson Research Center, Yorktow

Fellow of the European Board of Nuclear Medicine (FEBNM), USA.                                                         
Can't find a country for:                                                                                              
Department of Radiology, Henan Provincial People's Hospital, Henan, 450003, China; Imaging Diagnosis of Neurological Diseases and Research Laboratory of Henan Province, China. Electronic address: marian9999@163.com.
Can't find a country for:                                                                                              
Fondazione Istituto Ospedaliero di Sospiro, Sospiro (CR) 26048, Italy. Electronic address: marta.gianotti@fondazionesospiro.it.
Can't find a country for:                                                                                              
Division of Applied Life Science Department, PMBBRC.                                                                   
Can't find a country for:                               

Can't find a country for:                                                                                              
2Department of Electrical and Computer EngineeringUniversity of CaliforniaSan DiegoCA92106USA.                         
Can't find a country for:                                                                                              
3Department of NeurosciencesUniversity of CaliforniaSan DiegoCA92106USA.                                               
Can't find a country for:                                                                                              
2Department of Electrical and Computer EngineeringUniversity of CaliforniaSan DiegoCA92106USA.                         
Can't find a country for:                                                                                              
3Department of NeurosciencesUniversity of CaliforniaSan DiegoCA92106USA.                                               
Can't find a country for:               

1Mayo Clinic Neuro-Informatics Laboratory.                                                                             
Can't find a country for:                                                                                              
Health Innovation Program.                                                                                             
Can't find a country for:                                                                                              
Departments of Neurology (A.C., W.T.K., H.A., A.B.S.).                                                                 
Can't find a country for:                                                                                              
Departments of Neurology (A.C., W.T.K., H.A., A.B.S.).                                                                 
Can't find a country for:                                                                                              
The MRI Institute for Biomedical Researc

Can't find a country for:                                                                                              
Division of Gastroenterology, Department of Medicine, Olive View-UCLA Medical Center, Sylmar, California. Electronic address: jtabibian@dhs.lacounty.gov.
Can't find a country for:                                                                                              
Svezdrav Rešenja LLC, Đenerala Draže 44, Klenje 15357, Serbia. stevan@ecg4everybody.com.                               
Can't find a country for:                                                                                              
German Research Center for Environmental Health, Institute for Health Economics and Health Care Management, Helmholtz Zentrum München, Postfach 1129, 85758, Neuherberg, Germany. manuel.huber@helmholtz-muenchen.de.
Can't find a country for:                                                                                              
St. Luke's Catholic Hospital and

Department of Cardiology (A.C., M.L.L., J.D., K.N.) a.coenen@erasmusmc.nl.                                             
Can't find a country for:                                                                                              
Department of Radiology (A.C., A.K., M.L.L., K.N.).                                                                    
Can't find a country for:                                                                                              
Department of Radiology (D.H.Y.).                                                                                      
Can't find a country for:                                                                                              
Foundation Doctor, West Midlands, England, UK. Electronic address: elliotyatesj@gmail.com.                             
Can't find a country for:                                                                                              
Department of Biostatistics.            

Can't find a country for:                                                                                              
Department of Electrical and Computer Engineering, College of Engineering, UF.                                         
Can't find a country for:                                                                                              
Department of Industrial and Systems Engineering, College of Engineering, UF.                                          
Can't find a country for:                                                                                              
COMSATS Institute of Information Technology Wah, Pakistan. Electronic address: hussam@ciitwah.edu.pk.                  
Can't find a country for:                                                                                              
Neural Rehabilitation Group, Cajal Institute, Spanish National Research Council, Spain. Electronic address: jc.moreno@csic.es.
Can't find a country for:        

Can't find a country for:                                                                                              
4Merck Research Laboratories,North Wales,Pennsylvania.                                                                 
Can't find a country for:                                                                                              
UTM Razak School of Engineering and Advanced Technology, Universiti Teknologi Malaysia, Malaysia. Electronic address: tcmjoel2@live.utm.my.
Can't find a country for:                                                                                              
Department of Engineering, UTM Razak School of Engineering and Advanced Technology, Universiti Teknologi Malaysia, Malaysia. Electronic address: norliza@utm.my.
Can't find a country for:                                                                                              
Institute of Respiratory Medicine, Malaysia. Electronic address: ashdr64@yahoo.com.au.             

Technical Research Centre for the Dependency Care and Autonomous Living (CETpD), Universitat Politècnica de Catalunya - BarcelonaTech (UPC), Spain. Electronic address: daniel.rodriguez-martin@upc.edu.
Can't find a country for:                                                                                              
Technical Research Centre for the Dependency Care and Autonomous Living (CETpD), Universitat Politècnica de Catalunya - BarcelonaTech (UPC), Spain; Sense4Care, Spain. Electronic address: andreu.catala@upc.edu.
Can't find a country for:                                                                                              
Technical Research Centre for the Dependency Care and Autonomous Living (CETpD), Universitat Politècnica de Catalunya - BarcelonaTech (UPC), Spain; Sense4Care, Spain. Electronic address: joan.manuel.moreno@upc.edu.
Can't find a country for:                                                                                              
Technical Rese

Can't find a country for:                                                                                              
Computer Science and Artificial Intelligence Lab, EECS, MIT.                                                           
Can't find a country for:                                                                                              
Computer Science and Artificial Intelligence Lab, EECS, MIT.                                                           
Can't find a country for:                                                                                              
CSIRO Health and Biosecurity, The Australian e-Health & Research Centre, Herston, QLD, Australia. Electronic address: kaikai.shen@csiro.au.
Can't find a country for:                                                                                              
CSIRO Health and Biosecurity, The Australian e-Health & Research Centre, Herston, QLD, Australia. Electronic address: pierrick.bourgeat@csir

Medical System Lab.                                                                                                    
Can't find a country for:                                                                                              
School of Mechatronic Engineering, Universiti Malaysia Perlis (UniMAP), 02600, Campus Pauh Putra, Perlis, Malaysia. Electronic address: hari@unimap.edu.my.
Can't find a country for:                                                                                              
Genome Institute, National Center for Genetic Engineering and Biotechnology, Pathumtani, Thailand. Electronic address: sissades@biotec.or.th.
Can't find a country for:                                                                                              
Neuroimaging Research Unit, Institute of Neurological Sciences, National Research Council, Germaneto, CZ, Italy. Electronic address: a.cerasa@unicz.it.
Can't find a country for:                                             

In [12]:
articles.tail(25)

Unnamed: 0,pmid,doi,title,abstract,article_date,pubmed_date,article_type,lang,journal,journal_short,...,author_affils,keywords,mesh_terms,references_pmids,feature,include,affil_countries,affil_countries_unique,affil_first_country,affil_last_country
172411,9508109,,Prediction and cross-validation of neural netw...,The authors developed and cross-validated pred...,,1998-03-21,Comparative Study,eng,American journal of epidemiology,Am J Epidemiol,...,,,"[Algorithms, Health Maintenance Organizations,...",,Prediction and cross-validation of neural netw...,1.0,[nan],[nan],,
172422,9466835,,Modeling obesity using abductive networks.,This paper investigates the use of abductive-n...,,1998-03-21,Comparative Study,eng,"Computers and biomedical research, an internat...",Comput Biomed Res,...,,,"[Adult, Algorithms, Blood Glucose, Blood Press...",,Modeling obesity using abductive networks. Thi...,1.0,[nan],[nan],,
172432,9505252,,A neural net representation of experienced and...,A neural net approach was used to classify and...,,1998-03-20,Clinical Trial,eng,Journal of rehabilitation research and develop...,J Rehabil Res Dev,...,,,"[Adult, Disabled Persons, Energy Metabolism, E...",,A neural net representation of experienced and...,1.0,[nan],[nan],,
172437,9495724,,Use of a neural network to predict stone growt...,To determine whether a neural network is super...,,1998-03-12,Clinical Trial,eng,Urology,Urology,...,,,"[Follow-Up Studies, Humans, Kidney Calculi, Li...",,Use of a neural network to predict stone growt...,1.0,[nan],[nan],,
172450,9481717,,An artificial neural network can select patien...,The object of the study was to develop an arti...,,1998-03-03,Journal Article,eng,"Nephrology, dialysis, transplantation : offici...",Nephrol Dial Transplant,...,,,"[Adolescent, Adult, Aged, Female, Glomerulonep...",,An artificial neural network can select patien...,1.0,[nan],[nan],,
172451,9481716,,Application of Kohonen neural networks for the...,A Kohonen topological map is an artificial int...,,1998-03-03,Journal Article,eng,"Nephrology, dialysis, transplantation : offici...",Nephrol Dial Transplant,...,,,"[Adult, Aged, Humans, Kidney Diseases, Kidney ...",,Application of Kohonen neural networks for the...,1.0,[nan],[nan],,
172457,9479345,,Neural-network-assisted analysis and microscop...,To compare cytologists' detection of abnormali...,,1998-02-28,Clinical Trial,eng,Acta cytologica,Acta Cytol,...,,,"[Automation, Carcinoma, Case-Control Studies, ...",,Neural-network-assisted analysis and microscop...,1.0,[nan],[nan],,
172469,9473995,,Recognition of daily life motor activity class...,To investigate a possible role of artificial n...,,1998-02-25,Comparative Study,eng,Archives of physical medicine and rehabilitation,Arch Phys Med Rehabil,...,,,"[Activities of Daily Living, Adult, Aged, Ampu...",,Recognition of daily life motor activity class...,1.0,[nan],[nan],,
172477,9470397,,Non-parametric classification of esophagus mot...,Automatic long-term recording of esophageal pr...,,1998-02-21,Journal Article,eng,Methods of information in medicine,Methods Inf Med,...,,,"[Adult, Aged, Deglutition Disorders, Esophagea...",,Non-parametric classification of esophagus mot...,1.0,[nan],[nan],,
172478,9470396,,Decision support for psychiatric diagnosis bas...,This paper compares two classifiers: Pseudo Ba...,,1998-02-21,Comparative Study,eng,Methods of information in medicine,Methods Inf Med,...,,,"[Affective Symptoms, Bayes Theorem, Decision S...",,Decision support for psychiatric diagnosis bas...,1.0,[nan],[nan],,


In [14]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 34181 entries, 1.0 to 172538.0
Data columns (total 81 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pmid                    34179 non-null  float64
 1   doi                     29547 non-null  object 
 2   title                   34178 non-null  object 
 3   abstract                34179 non-null  object 
 4   article_date            25954 non-null  object 
 5   pubmed_date             34179 non-null  object 
 6   article_type            34179 non-null  object 
 7   lang                    34179 non-null  object 
 8   journal                 34179 non-null  object 
 9   journal_short           34179 non-null  object 
 10  journal_country         34179 non-null  object 
 11  authors                 33377 non-null  object 
 12  author_affils           24189 non-null  object 
 13  keywords                18861 non-null  object 
 14  mesh_terms              25518 n

## CLEAN

In [4]:
articles['affil_fill_country'] = articles['affil_first_country']

In [5]:
## using first affiliation as primary country
## fill first with data from last author, then with pubmed country metadata
articles['affil_fill_country'] = articles['affil_fill_country'].fillna(articles['affil_last_country'])
articles['affil_fill_country'] = articles['affil_fill_country'].fillna(articles['journal_country'])

articles['affil_fill_country'] = articles['affil_fill_country'].astype('string')

In [6]:
##clean countries
articles["affil_fill_country"].replace({"England": "United Kingdom", 
                                             "Wales": "United Kingdom", 
                                             "Scotland": "United Kingdom", 
                                             "China (Republic : 1949- )" : "Taiwan"}, inplace=True)

articles['affil_fill_country'].value_counts()

United States          12415
China                   5305
United Kingdom          2482
South Korea             1327
Germany                 1273
                       ...  
Azerbaijan                 1
Trinidad and Tobago        1
Kazakhstan                 1
Costa Rica                 1
Palestine                  1
Name: affil_fill_country, Length: 91, dtype: int64

In [13]:
##lowercasing list of ANY author
articles['countries_lc'] = articles['affil_countries_unique'].str.lower().fillna(np.nan)

#articles['countries_lc'] = articles['countries_lc'].fillna(articles['affil_fill_country']).str.lower().astype('string')

In [15]:
articles['countries_lc'].replace(np.nan, '', inplace=True)

In [16]:
lmic_list = ["afghanistan", "burundi", "burkina faso", "central african republic", "congo", "eritrea", 
             "ethiopia", "guinea", "gambia", "guinea-bissau", "liberia", "madagascar", "mali", "mozambique", "malawi", 
             "niger", "north korea", "democratic republic of korea", "rwanda", "sudan", "sierra leone", "somalia", "south sudan", "syrian arab republic", 
             "chad", "togo", "uganda", "yemen", "angola", "benin", "bangladesh", "belize", "bolivia", "bhutan", 
             "cote d'ivoire", "ivory coast", "cameroon", "congo", "comoros", "cabo verde", "djibouti", "algeria", "egypt", 
             "micronesia", "ghana", "honduras", "haiti", "indonesia", "india", "iran", "kenya", 
             "kyrgyz republic", "cambodia", "kiribati", "lao", "sri lanka", "lesotho", "morocco", "myanmar", "mongolia", 
             "mauritania", "nigeria", "nicaragua", "nepal", "pakistan", "philippines", "papua new guinea", 
             "west bank and gaza", "palestinbe", "senegal", "solomon islands", "el salvador", "sao tome", "eswatini", 
             "tajikistan", "timor-leste", "tunisia", "tanzania", "ukraine", "uzbekistan", "vietnam", "vanuatu", "samoa", 
             "zambia", "zimbabwe", "albania", "argentina", "armenia", "american samoa", "azerbaijan", "bulgaria", 
             "bosnia", "belarus", "brazil", "botswana", "china", "colombia", "costa rica", "cuba", 
             "dominica", "dominican republic", "ecuador", "fiji", "gabon", "georgia", "equatorial guinea", "grenada", 
             "guatemala", "guyana", "iraq", "jamaica", "jordan", "kazakhstan", "lebanon", "libya", "lucia", "moldova", 
             "maldives", "mexico", "marshall islands", "north macedonia", "montenegro", "mauritius", "malaysia", "namibia", 
             "panama", "peru", "paraguay", "romania", "russian federation", "russia", "serbia", "suriname", "thailand", "turkmenistan", 
             "tonga", "turkey", "tuvalu", "st. vincent", "grenadines", "kosovo", "south africa", "venezuela"]

In [17]:
lmic_lower_list = ["afghanistan", "burundi", "burkina faso", "central african republic", "congo", "eritrea", 
             "ethiopia", "guinea", "gambia", "guinea-bissau", "liberia", "madagascar", "mali", "mozambique", "malawi", 
             "niger", "north korea", "democratic republic of korea", "rwanda", "sudan", "sierra leone", "somalia", "south sudan", "syrian arab republic", 
             "chad", "togo", "uganda", "yemen", "angola", "benin", "bangladesh", "belize", "bolivia", "bhutan", 
             "cote d'ivoire", "ivory coast", "cameroon", "congo", "comoros", "cabo verde", "djibouti", "algeria", "egypt", 
             "micronesia", "ghana", "honduras", "haiti", "indonesia", "india", "iran", "kenya", 
             "kyrgyz republic", "cambodia", "kiribati", "lao", "sri lanka", "lesotho", "morocco", "myanmar", "mongolia", 
             "mauritania", "nigeria", "nicaragua", "nepal", "pakistan", "philippines", "papua new guinea", 
             "west bank and gaza", "palestinbe", "senegal", "solomon islands", "el salvador", "sao tome", "eswatini", 
             "tajikistan", "timor-leste", "tunisia", "tanzania", "ukraine", "uzbekistan", "vietnam", "vanuatu", "samoa", 
             "zambia", "zimbabwe"]

In [18]:
##flags for ANY author

#initiate
articles['lmic_author_flag'] = np.where(articles['countries_lc'].str.contains('iran'), "1", "0")
articles['lmic_author_lower_flag'] = np.where(articles['countries_lc'].str.contains('iran'), "1", "0")
articles['lmic_china_flag'] = np.where(articles['countries_lc'].str.contains('china'), "1", "0")

In [20]:
#use lists
for x in lmic_list:
    articles['lmic_author_flag'] = np.where(articles['countries_lc'].str.contains(x), "1", articles['lmic_author_flag'])
    
for y in lmic_lower_list:
    articles['lmic_author_lower_flag'] = np.where(articles['countries_lc'].str.contains(x), "1", articles['lmic_author_lower_flag'])

In [21]:
articles['pubmed_date'] = pd.to_datetime(articles['pubmed_date'])

In [22]:
## new column for year, and year+month
articles['year'] = articles['pubmed_date'].dt.year

In [13]:
articles.to_csv('data/final_raw.csv')