In [1]:
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
import sys
import time

from flashgeotext.geotext import GeoText
from geopy.geocoders import Nominatim

In [2]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [3]:
articles = pd.read_excel('data/comparative_labelled.xlsx', index_col=0)

In [4]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32472 entries, 0 to 161524
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   pmid              32472 non-null  int64         
 1   doi               27619 non-null  object        
 2   title             32472 non-null  object        
 3   abstract          32472 non-null  object        
 4   article_date      24121 non-null  datetime64[ns]
 5   pubmed_date       32472 non-null  datetime64[ns]
 6   article_type      32472 non-null  object        
 7   lang              32472 non-null  object        
 8   journal           32472 non-null  object        
 9   journal_short     32472 non-null  object        
 10  journal_country   32427 non-null  object        
 11  authors           31687 non-null  object        
 12  author_affils     22224 non-null  object        
 13  keywords          17278 non-null  object        
 14  mesh_terms        238

In [5]:
articles.authors = articles.authors.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.author_affils = articles.author_affils.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.keywords = articles.keywords.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.references_pmids = articles.references_pmids.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)
articles.mesh_terms = articles.mesh_terms.apply(lambda x: ast.literal_eval(x) if type(x) == str else x)

In [6]:
non_nan_df = articles[articles['author_affils'].isnull() == False]

In [7]:
temp_list = []
agg_list = []

for affil in non_nan_df.author_affils:
    temp_list.append(affil)
    
for sublist in temp_list:
    for x in sublist:
        agg_list.append(x)

In [8]:
import random 

print(random.sample(agg_list, 10))

['Clinic of Orthodontics and Pediatric Dentistry, Center of Dental Medicine, University of Zurich, Switzerland.', 'Department of Electrical and Computer Engineering, University of Saskatchewan, Saskatoon, Canada.', "Department of Gastroenterology, Shanxi Provincial People's Hospital of Shanxi Medical University, Taiyuan, China.", 'Department of radiology, AP-HP, HÃ´pital Saint-Antoine, Paris, France.', 'The Department of Electrical & Computer Engineering, Texas Tech University (TTU), Lubbock, 79409 TX, USA. ntaro1031@gmail.com.', 'Master of Science Program in Technology of Information System Management, Faculty of Engineering, Mahidol University, Nakhon Pathom 73170, Thailand.', 'Spinal Cord Injury Center, University Hospital Balgrist, Switzerland.', 'Department of Diagnostic Radiology, Graduate School of Life Sciences, Kumamoto University, 1-1-1, Honjo, Chuo-ku, Kumamoto 860-8556, Japan. Electronic address: mstknkgw.a.you.1@gmail.com.', 'Neatec S.p.A., Via Campi Flegrei, 34, 80078 Poz

In [9]:
##some list items contain ';' -> these need to be split into individual list items

y_list = []
affil_list = []

for x in agg_list:
    if ';' in x:
        y = x.split(';')
        y_list.append(y)
    else:
        affil_list.append(x)
        
for sublist in y_list:
    for z in sublist:
        affil_list.append(z)

In [10]:
len(affil_list)

182323

In [11]:
print(random.sample(affil_list, 10))

['Division of Gastroenterology, Bezmialem University School of Medicine, Fatih, Istanbul, 34025, Turkey.', 'Department of Ultrasound, First Affiliated Hospital, Zhejiang University, Hangzhou, 310003, China.', 'Institute for Health and Sport, Victoria University, Melbourne, VIC, Australia.', " Li Ka Shing Knowledge Institute, St. Michael's Hospital, Toronto, ON, Canada.", 'FEIT, School of Computer ScienceAustralian Artificial Intelligence Institute, University of Technology Sydney Ultimo NSW 2007 Australia.', ' Department of Surgery, University of Verona, Verona, Italy.', 'German Center for Vertigo and Balance Disorders, DSGZ, University of Munich, Campus Grosshadern, Marchioninistrasse 15, 81377 Munich, Germany.', 'Academy for Engineering and Technology, Fudan University, 20 Handan Road, Shanghai, 200433, China.', 'Department of Radiation Oncology, Shanghai Chest Hospital, Shanghai Jiao Tong University, Shanghai, China.', 'Department of Psychology, Lund University, Lund, Sweden.']


## Removing duplicates

In [12]:
def remove_dupes(x):
    return list(dict.fromkeys(x))

affil_unique = remove_dupes(affil_list)

In [13]:
len(affil_unique)

85755

In [14]:
with open("affils.txt", "w") as output:
    output.write(str(affil_unique))

## Geocoding uniques

In [15]:
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium

In [16]:
from tqdm.notebook import tqdm

In [17]:
locator = Nominatim(user_agent="health_ai_scraper")

geocode = RateLimiter(locator.geocode, min_delay_seconds=0.5)

In [18]:
location_df = pd.DataFrame(affil_unique, columns = ['affil_unique'])

In [19]:
#tqdm.pandas()
#
#locationdf['loc'] = location_df['affil_unique'].progress_apply(geocode)

## Rules affil identification

In [20]:
label_df = location_df.copy()

In [21]:
label_df = label_df.applymap(lambda x:x.lower() if type(x) == str else x) #reduce all to lowercase

In [22]:
text = ['faculty', 'institut', 'college', 'department', 'school', 'research', 'organizat', 'organisat', 'laboratory',
        'graduat', 'polytech', 'univ.', 'dept.', 'service', 'committee', 'agency', 'division', 'schule', 'academy',
        'epidemiology', 'statistics', 'ministry', 'klinik', ' clinic', 'registry', 'trial', 'medical group', 
        'hospital', 'medicine', 'medical cent', 'care cent', 'health cent', 'foundation trust', 'nhs trust',
        'health system', 'care system', 'health education', 'program', 'center', 'centre',
        ' unit', 'dipartimentale', 'laboratoire', 'cancer', 'practice', 'administration', 'care network', 'health network']

label_df['academic'] = np.where(label_df['affil_unique'].str.contains("universit"), "1", "0")

for x in text:
    label_df['academic'] = np.where(label_df['affil_unique'].str.contains(x), "1", label_df['academic']) #if yes then 1, if no, keep current


In [23]:
#' inc,', ' inc.', 'ltd', 'technologies', ' co.', 'enterprise', 'gmbh', 'solutions', 'cyber', 'company', 
        #'healthtech', 'intelligent', 'engineering', 'incorp', '.ai', ' llc', 'corporation', 'biotech', 
         # 'software' ,'b.v', 'pharma', 'holdings', 'consulting', 'n.v', 'limited', 'corp.', 'bioscience', 'life sciences',
          # 'solutions', 'diagnostics', 'intelligence', 'electronics', 's.r.l', ' ag,', ' labs,', ' corp', ' ab,', 'systems',
        #'venture', 'plc.', ' plc,', 'diagnostic system',

set_terms = ['\\.ai', ' llc', 'corporation', 'b\\.v', 'holdings', 'consulting', 'n\\.v', 'limited', 'corp\\.', 
        's\\.r\\.l', ' ag,', ' labs,', ' corp ', 'venture', 'plc\\.', ' plc,',
    ' inc,', ' inc\\.', ' ltd', ' co\\. ', ' co\\., ', ' enterprise', ' gmbh', ' solutions', ' company', ' healthtech', ' incorporat', 
       'xy\\.ai', 'genentech', 'affidea', 'aidence', 'aicure', 'toshiba'
       'fraunhofer', ' ge healthcare', ',ge healthcare', 'huawei', 'imagia cyber', 'intrinsic imaging', 'microsoft', 'md.ai',
       'peninsula diagnostic', 'siemens', 'subtle medical', 'medtronic', 'therapanacea', 'thirona', 'apollo radiology',
       'apple', 'arsalis', 'artialis', 'atheropoint', 'atr cognitive', 'biogrid', 'blueheart', 'brainlab', 'cytognomix',
       'daisylabs', 'live well', 'deep mri', 'deeptrace', 'dianei', 'diasens', 'dicella', 'diversigen', 'entelai', 'essilor',
        'everseen', 'geneis', 'lundbeck', 'tencent', 'connex', 'hewlett packard', 'hisilicon', 'icad,', 'idx,', 'idx ',
        'icometrix', 'imago7', 'imec,', 'iminds', 'imedisyn', 'imsight', 'imt atlantique', 'inalco', 'infervision', 
        'innotears', 'aramis project', 'visages project', 'intel ', 'intel,', 'ivf 2.0', 'ivfqc', 'ipixel', 'screenpoint medical',
        'medo.ai', 'merantix', 'mylab', 'nanocytomics', 'noble life science', 'nordic bioscience', 'ozescribe',
        'nvidia', 'owkin', 'peng cheng lab', 'philips healthcare', 'quantitative imaging', 'rhinotech', 'satisfai health',
        'screenpoint medical', 'sense4care', 'sintef', 'thrombodx', 'treat systems', 'virtual radiologic', 'visiana', 'visuworks', 'vito nv',
        'lucence diag', 'ibm ', 'savvysherpa', 'geneprodx', 'novartis', 'sigma tech', 'pfizer', 'roche ', 'roche,', 'sage bionet', 'iluvatar corex',
        'mcgraw-hill', 'senselab', 'force tech', 'genesystems', '2d soft', 'datarobot', 'lilly', 'anolinx', 'idrogenet',
        'acer ', 'acobiom', 'acroviz', 'active cosmetics', 'adobe', 'iqvia', 'advance analytics', 'aurora health', 'aetherai',
        'agfa', 'agomab', 'ai cure', 'deepwise', 'selvas', 'ai4lyf', 'aicure', 'aibraintree', 'aideas', 'aiforia', 'aimbrain', 
        'aifred', 'aitrics', 'alacris', 'alivecor', 'alpiq energy', 'amazon', 'amnis', 'ansys', 'applied materials',
        'applied minds', 'applied proteomics', 'aramis lab', 'argus cognitive', 'armani', 'audi ag', 'baidu', 'bayer', 'behavior imaging',
        'behold.ai', 'beiersdorf ag', 'better care s.l.', 'bgi genomics', 'bgi-', 'biobridge', 'biofourmis', 'biogen', 'biosensics', 
        'boehringer', 'bolton clarke', 'bosch', 'boston scientific', 'bracco imaging', 'brain innovation', 'brain vision solutions', 'brain wise', 
        'brainlab ag', 'brainreader', 'brainvue', 'bristol myers', 'bristol-myers', 'bruker', 'nagercoil', 'exactech', 'canhelp genomics',
        'capital one', 'cardiio', 'carl zeiss', 'casio', 'cbmed', 'cellanyx', 'cerner', 'cliexa', 'clarigent', 'circle cardio', 'biowink',
        'cockerell', 'infotech', 'coloplast', 'contextvision ab', 'crescom', 'data61', 'dai nippon', 'daktari', 'dedalus', 'deepmind',
        'defibtech', 'deloitte', 'demant', 'docbot', 'dynelytics', 'eigenvision', 'eli lilly', 'elsevier', 'gsk', 'qure.ai',
        'epic systems', 'johnson & johnson', 'fibrogen', 'fidelity' ,'flatiron', 'synaptive', 'freenome', 'fresenius',
       'fujifilm', 'fujitsu', 'gc pharma', ' ge medical', ' ge medical,' 'general electric', 'gilead', 'gns healthcare', 'hisense',
        'hitachi', 'ibm ', 'ibm,', 'immunitybio', 'infervision', 'inflammatix', 'ingeniorx', 'infobyte', 'insilico genomics',
        'interpretable ai', 'janssen', 'kyocera', 'lifesemantics', 'limbus ai', 'medcurio', 'mediatek', 'mediasoft', 'merck',
        'mindpax', 'mindray', 'mindsgo', 'midas,', 'mitsubishi', 'mobilab', 'modiface', 'mondobrain' ,'sunsoft', 'mvision ai',
        'nference labs', ' nference ', ' nference,', 'novartis', 'novo nordisk', 'okaki health', 'olink proteomics', 'astrazeneca', 'astra zeneca', 'astra-zeneca',
        'oncostem', 'opsens', 'optech', 'optellum', 'orobix srl', 'orange health', 'oracle', 'optretina', 'oura health',
        'panasonic', 'parexel', 'phastar', 'ping an health', 'pingan health', 'global biomedical', 'predible health', 'prescience labs',
        'primary endpoint', 'pricewaterhouse', 'procardiaco', 'qt ultrasound', 'renalytix', 'rolls-royce', 'sage bionetworks',
        'sanofi', 'savana', 'screenpoint', 'sensyne', 'smart blood analytics', 'sysmex', 'takeda', 'taliaz', 'texisense', 'truemotion',
        'unanimous ai', 'varian medical', 'veracyte', 'verily life', 'visionary intelligence', 'visulytix', 'welldoc']


In [24]:
label_df['commercial'] = np.where(label_df['affil_unique'].str.contains("google"), "1", "0")

for x in set_terms:
    label_df['commercial'] = np.where(label_df['affil_unique'].str.contains(x), "1", label_df['commercial']) #if yes then 1, if no, keep current

In [25]:
label_df.to_csv('unique_affil.csv')

## Dataframe for affil tag

In [26]:
export_df = articles[['pmid', 'author_affils', 'include']].copy()

In [27]:
#set_terms = ['\\.ai', ' llc', 'corporation', 'b\\.v', 'pharmaceutical', 'holdings', 'consulting', 'n\\.v', 'limited', 'corp\\.', 
#        's\\.r\\.l', ' ag,', ' labs,', ' corp ', 'venture', 'plc\\.', ' plc,',
#    ' inc,', ' inc\\.', ' ltd', ' co\\. ', ' co\\., ', ' enterprise', ' gmbh', ' solutions', ' company', ' healthtech', ' incorporat', 
#       'xy\\.ai', 'genentech', 'affidea', 'aidence', 'aicure', 'canon', 'advanced imaging',
#       'fraunhofer', 'ge healthcare', 'huawei', 'imagia cyber', 'intrinsic imaging', 'microsoft', 'md.ai',
#       'peninsula diagnostic', 'siemens', 'subtle medical', 'therapanacea', 'thirona', 'apollo radiology',
#       'apple', 'arsalis', 'artialis', 'atheropoint', 'atr cognitive', 'biogrid', 'blueheart', 'brainlab', 'cytognomix',
#       'daisylabs', 'live well', 'deep mri', 'deeptrace', 'dianei', 'diasens', 'dicella', 'diversigen', 'entelai', 'essilor',
#        'everseen', 'geneis', 'lundbeck', 'tencent', 'connex', 'hewlett packard', 'hisilicon', 'icad,', 'idx,', 'idx ',
#        'icometrix', 'imago7', 'imec,', 'iminds', 'imedisyn', 'imsight', 'imt atlantique', 'inalco', 'infervision', 
#        'innotears', 'aramis project', 'visages project', 'intel ', 'intel,', 'ivf 2.0', 'ivfqc', 'ipixel', 'screenpoint medical',
#        'medo.ai', 'merantix', 'mylab', 'nanocytomics', 'noble life science', 'nordic bioscience', 'ozescribe',
#        'nvidia', 'owkin', 'peng cheng lab', 'philips healthcare', 'quantitative imaging', 'rhinotech', 'satisfai health',
#        'screenpoint medical', 'sense4care', 'sintef', 'thrombodx', 'treat systems', 'virtual radiologic', 'visiana', 'visuworks', 'vito nv',
#        'lucence diag', 'ibm ', 'savvysherpa', 'geneprodx', 'novartis', 'sigma tech', 'pfizer', 'roche ', 'roche,', 'sage bionet', 'iluvatar corex',
#        'mcgraw-hill', 'senselab', 'force tech', 'genesystems', '2d soft', 'datarobot', 'lilly', 'anolinx', 'idrogenet',
#        'acer ', 'acobiom', 'acroviz', 'active cosmetics', 'adobe', 'iqvia', 'advance analytics', 'aurora health', 'aetherai',
#        'agfa', 'agomab', 'ai cure', 'deepwise', 'selvas', 'ai4lyf', 'aicure', 'aibraintree', 'aideas', 'aiforia', 'aimbrain', 
#        'aifred', 'aitrics', 'alacris', 'alivecor', 'alpiq energy', 'amazon', 'amnis', 'ansys', 'applied materials',
#        'applied minds', 'applied proteomics', 'aramis lab', 'argus cognitive', 'armani', 'audi ag', 'baidu', 'bayer', 'behavior imaging',
#        'behold.ai', 'beiersdorf ag', 'better care s.l.', 'bgi genomics', 'bgi-', 'biobridge', 'biofourmis', 'biogen', 'biosensics', 
#        'boehringer', 'bolton clarke', 'bosch', 'boston scientific', 'bracco imaging', 'brain innovation', 'brain vision solutions', 'brain wise', 
#        'brainlab ag', 'brainreader', 'brainvue', 'bristol myers', 'bristol-myers', 'bruker', 'nagercoil', 'exactech', 'canhelp genomics',
#        'capital one', 'cardiio', 'carl zeiss', 'casio', 'cbmed', 'cellanyx', 'cerner', 'cliexa', 'clarigent', 'circle cardio', 'biowink',
#        'cockerell', 'infotech', 'coloplast', 'contextvision ab', 'crescom', 'data61', 'dai nippon', 'daktari', 'dedalus', 'deepmind',
#        'defibtech', 'deloitte', 'demant', 'docbot', 'dynelytics', 'eigenvision', 'eli lilly', 'elsevier', 'gsk', 'qure.ai',
#        'epic systems', 'johnson & johnson', 'fibrogen', 'fidelity' ,'flatiron', 'synaptive', 'freenome', 'fresenius',
#       'fujifilm', 'fujitsu', 'gc pharma', 'ge medical' , 'general electric healthcare', 'gilead', 'gns healthcare', 'hisense',
#        'hitachi', 'ibm ', 'ibm,', 'immunitybio', 'infervision', 'inflammatix', 'ingeniorx', 'infobyte', 'insilico genomics',
#        'interpretable ai', 'janssen', 'kyocera', 'lifesemantics', 'limbus ai', 'medcurio', 'mediatek', 'mediasoft', 'merck',
#        'mindpax', 'mindray', 'mindsgo', 'midas,', 'mitsubishi', 'mobilab', 'modiface', 'mondobrain' ,'sunsoft', 'mvision ai',
#        'nference labs', 'novartis', 'novo nordisk', 'okaki health', 'olink proteomics', 'astrazeneca', 'astra zeneca', 'astra-zeneca',
#        'oncostem', 'opsens', 'optech', 'optellum', 'orobix srl', 'orange health', 'oracle', 'optretina', 'oura health',
#        'panasonic', 'parexel', 'phastar', 'ping an health', 'pingan health', 'global biomedical', 'predible health', 'prescience labs',
#        'primary endpoint', 'pricewaterhouse', 'procardiaco', 'qt ultrasound', 'renalytix', 'rolls-royce', 'sage bionetworks',
#        'sanofi', 'savana', 'screenpoint', 'sensyne', 'smart blood analytics', 'sysmex', 'takeda', 'taliaz', 'texisense', 'truemotion',
#        'unanimous ai', 'varian medical', 'veracyte', 'verily life', 'visionary intelligence', 'visulytix', 'welldoc']

In [28]:
export_df['author_affils'] = export_df['author_affils'].fillna('na')
export_df['author_affils'] = export_df['author_affils'].map(lambda x: list(map(str.lower, x)))

In [29]:
set_list = []

for affil in export_df.author_affils:
    set_list.append(affil)
    
set_list = [', '.join(x) for x in set_list]

In [30]:
export_df['joined_affils'] = set_list

In [31]:
export_df['commercial'] = np.where(export_df['joined_affils'].str.contains("google"), "google", "0")

for x in set_terms:
    export_df['commercial'] = np.where(export_df['joined_affils'].str.contains(x), x, export_df['commercial'])

In [32]:
export_df['comm_flag'] = np.where(export_df['joined_affils'].str.contains("google"), "1", "0")

for x in set_terms:
    export_df['comm_flag'] = np.where(export_df['joined_affils'].str.contains(x), "1", export_df['comm_flag']) #if yes then 1, if no, keep current

In [33]:
export_df.loc[export_df['commercial'] != '0'].sample(50)

Unnamed: 0,pmid,author_affils,include,joined_affils,commercial,comm_flag
71466,30050768,"[optellum ltd, oxford, uk., department of radiology, oxford university hospitals nhs foundation trust, oxford, uk.]",0,"optellum ltd, oxford, uk., department of radiology, oxford university hospitals nhs foundation trust, oxford, uk.",optellum,1
27528,32964477,"[md anderson uthealth graduate school, houston, tx, usa., department of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of imaging physics, the university of texas md anderson cancer center, houston, tx, usa., md anderson uthealth graduate school, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of imaging physics, the university of texas md anderson cancer center, houston, tx, usa., department of medical physics (g68), university of the free state, bloemfontein, south africa., department of medical physics (g68), university of the free state, bloemfontein, south africa., division of radiation oncology and medical physics, university of cape town and groote schuur hospital, cape town, south africa., division of radiation oncology and medical physics, university of cape town and groote schuur hospital, cape town, south africa., division of radiation oncology and medical physics, university of cape town and groote schuur hospital, cape town, south africa., division of medical physics, stellenbosch university, tygerberg academic hospital, cape town, south africa., division of radiation oncology, stellenbosch university, tygerberg academic hospital, cape town, south africa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa.]",0,"md anderson uthealth graduate school, houston, tx, usa., department of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of imaging physics, the university of texas md anderson cancer center, houston, tx, usa., md anderson uthealth graduate school, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa., department of imaging physics, the university of texas md anderson cancer center, houston, tx, usa., department of medical physics (g68), university of the free state, bloemfontein, south africa., department of medical physics (g68), university of the free state, bloemfontein, south africa., division of radiation oncology and medical physics, university of cape town and groote schuur hospital, cape town, south africa., division of radiation oncology and medical physics, university of cape town and groote schuur hospital, cape town, south africa., division of radiation oncology and medical physics, university of cape town and groote schuur hospital, cape town, south africa., division of medical physics, stellenbosch university, tygerberg academic hospital, cape town, south africa., division of radiation oncology, stellenbosch university, tygerberg academic hospital, cape town, south africa., department of radiation physics, division of radiation oncology, the university of texas md anderson cancer center, houston, tx, usa.",bosch,1
27846,32947608,"[novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of nutrition, exercise and sports, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., national food institute, technical university of denmark, lyngby, denmark., department of biotechnology and biomedicine, technical university of denmark, kgs. lyngby, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of veterinary disease biology, faculty of science, university of copenhagen, frederiksberg, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of biotechnology and biomedicine, technical university of denmark, kgs. lyngby, denmark., clinical-microbiomics a/s, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of health technology, technical university of denmark, lyngby, denmark., national food institute, technical university of denmark, lyngby, denmark., department of nutrition, exercise and sports, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark.]",0,"novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of nutrition, exercise and sports, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., national food institute, technical university of denmark, lyngby, denmark., department of biotechnology and biomedicine, technical university of denmark, kgs. lyngby, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of veterinary disease biology, faculty of science, university of copenhagen, frederiksberg, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of biotechnology and biomedicine, technical university of denmark, kgs. lyngby, denmark., clinical-microbiomics a/s, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark., department of health technology, technical university of denmark, lyngby, denmark., national food institute, technical university of denmark, lyngby, denmark., department of nutrition, exercise and sports, university of copenhagen, copenhagen, denmark., novo nordisk foundation center for basic metabolic research, university of copenhagen, copenhagen, denmark.",novo nordisk,1
20071,33317528,"[assistant executive director of nursing, hamad medical corporation, doha, qatar., management information systems, business, and economics faculty, qatar university, doha, qatar., industrial engineering, university of central florida, orlando, usa., department of surgery, trauma surgery, hamad medical corporation, doha, qatar., department of surgery, trauma surgery, hamad medical corporation, doha, qatar., department of surgery, trauma surgery, clinical research, hamad medical corporation, doha, qatar. aymanco65@yahoo.com.]",0,"assistant executive director of nursing, hamad medical corporation, doha, qatar., management information systems, business, and economics faculty, qatar university, doha, qatar., industrial engineering, university of central florida, orlando, usa., department of surgery, trauma surgery, hamad medical corporation, doha, qatar., department of surgery, trauma surgery, hamad medical corporation, doha, qatar., department of surgery, trauma surgery, clinical research, hamad medical corporation, doha, qatar. aymanco65@yahoo.com.",corporation,1
66052,30528092,"[department of radiology, peking university people's hospital, beijing, p r china; department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., department of radiology, peking university people's hospital, beijing, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., ge healthcare, china, shanghai, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china. electronic address: xhz000417@sina.com., department of radiology, peking university people's hospital, beijing, p r china. electronic address: hongnan@bjmu.edu.cn.]",1,"department of radiology, peking university people's hospital, beijing, p r china; department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., department of radiology, peking university people's hospital, beijing, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china., ge healthcare, china, shanghai, p r china., department of radiology, yantai yuhuangding hospital, yantai, shandong, p r china. electronic address: xhz000417@sina.com., department of radiology, peking university people's hospital, beijing, p r china. electronic address: hongnan@bjmu.edu.cn.",ge healthcare,1
31273,32755355,"[department of urology and pediatric urology, university medical center, mainz, germany., clinical research directorate, leidos biomedical research, inc., frederick, md., division of cancer treatment and diagnosis: biometric research program, national cancer institute, national institutes of health, rockville, md., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088., department of urology, acibadem university, istanbul, turkey., department of radiology, university of udine, udine, italy., diagnostic imaging department, albert einstein hospital, sao paolo, brazil., department of urology, koã§ university, school of medicine, istanbul, turkey., department of radiology, ankara city hospital, ankara, turkey., department of radiology, university of udine, udine, italy., department of radiology, acibadem university, istanbul, turkey., department of urology, acibadem university, istanbul, turkey., department of radiology, cleveland clinic, cleveland, oh., department of urology, university of alabama at birmingham, birmingham, al., diagnostic imaging department, albert einstein hospital, sao paolo, brazil., department of pathology, cleveland clinic, cleveland, oh., department of pathology, university of alabama at birmingham, birmingham, al., pathology department, albert einstein hospital, sao paolo, brazil., laboratory of pathology, national cancer institute, national institutes of health, bethesda, md., department of pathology, ankara yildirim beyazit university, school of medicine, ankara, turkey., department of pathology, acibadem university, istanbul, turkey., department of pathology, university of udine, udine, italy., department of pathology, university of cambridge, cambridge, uk., department of radiology, university of cambridge, cambridge, uk., department of radiology, federal fluminense university, rio de janeiro, brazil., department of radiology, university of health sciences dr. behã§et uz child disease and pediatric surgery training and research hospital, ä°zmir, turkey., department of radiology, walter reed medical center, bethesda, md., department of radiology, singapore general hospital, singapore., department of radiology and imaging sciences, clinical center, national institutes of health, bethesda, md., weill cornell imaging, cornell university, new york, ny., department of radiology and imaging sciences, clinical center, national institutes of health, bethesda, md., department of radiology, medical imaging centre, nuclear medicine and molecular imaging, university of groningen, university medical center groningen, groningen, the netherlands., center for interventional oncology, national cancer institute and radiology and imaging sciences, clinical center, national institutes of health, bethesda, md., urologic oncology branch, national cancer institute, national institutes of health, bethesda, md., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088., national institutes of health clinical center, imaging biomarkers and computer-aided diagnosis laboratory, radiology and imaging sciences, bethesda, md., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088.]",1,"department of urology and pediatric urology, university medical center, mainz, germany., clinical research directorate, leidos biomedical research, inc., frederick, md., division of cancer treatment and diagnosis: biometric research program, national cancer institute, national institutes of health, rockville, md., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088., department of urology, acibadem university, istanbul, turkey., department of radiology, university of udine, udine, italy., diagnostic imaging department, albert einstein hospital, sao paolo, brazil., department of urology, koã§ university, school of medicine, istanbul, turkey., department of radiology, ankara city hospital, ankara, turkey., department of radiology, university of udine, udine, italy., department of radiology, acibadem university, istanbul, turkey., department of urology, acibadem university, istanbul, turkey., department of radiology, cleveland clinic, cleveland, oh., department of urology, university of alabama at birmingham, birmingham, al., diagnostic imaging department, albert einstein hospital, sao paolo, brazil., department of pathology, cleveland clinic, cleveland, oh., department of pathology, university of alabama at birmingham, birmingham, al., pathology department, albert einstein hospital, sao paolo, brazil., laboratory of pathology, national cancer institute, national institutes of health, bethesda, md., department of pathology, ankara yildirim beyazit university, school of medicine, ankara, turkey., department of pathology, acibadem university, istanbul, turkey., department of pathology, university of udine, udine, italy., department of pathology, university of cambridge, cambridge, uk., department of radiology, university of cambridge, cambridge, uk., department of radiology, federal fluminense university, rio de janeiro, brazil., department of radiology, university of health sciences dr. behã§et uz child disease and pediatric surgery training and research hospital, ä°zmir, turkey., department of radiology, walter reed medical center, bethesda, md., department of radiology, singapore general hospital, singapore., department of radiology and imaging sciences, clinical center, national institutes of health, bethesda, md., weill cornell imaging, cornell university, new york, ny., department of radiology and imaging sciences, clinical center, national institutes of health, bethesda, md., department of radiology, medical imaging centre, nuclear medicine and molecular imaging, university of groningen, university medical center groningen, groningen, the netherlands., center for interventional oncology, national cancer institute and radiology and imaging sciences, clinical center, national institutes of health, bethesda, md., urologic oncology branch, national cancer institute, national institutes of health, bethesda, md., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088., national institutes of health clinical center, imaging biomarkers and computer-aided diagnosis laboratory, radiology and imaging sciences, bethesda, md., molecular imaging program, national cancer institute, national institutes of health, 10 center dr, msc 1182, bldg 10, rm b3b85, bethesda, md 20892-1088.",inc\.,1
36117,32488126,"[department of psychiatry, new york university grossman school of medicine, new york, ny, usa. ks3796@cumc.columbia.edu., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa., harvard paulson school of engineering & applied sciences, boston, ma, usa., department of psychiatry, new york university grossman school of medicine, new york, ny, usa., department of psychiatry, new york university grossman school of medicine, new york, ny, usa., integrative systems biology, us army center for environmental health research, usacehr, fort detrick, frederick, md, usa., mclean hospital, harvard university, boston, ma, usa., integrative systems biology, us army center for environmental health research, usacehr, fort detrick, frederick, md, usa., harvard paulson school of engineering & applied sciences, boston, ma, usa., department of obstetrics, gynecology & reproductive sciences, university of california, san francisco, ca, usa., department of psychiatry/weill institute for neurosciences, university of california, san francisco, ca, usa., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa., alto neuroscience, inc., los altos, ca, usa., mclean hospital, harvard university, boston, ma, usa., harvard paulson school of engineering & applied sciences, boston, ma, usa., integrative systems biology, us army center for environmental health research, usacehr, fort detrick, frederick, md, usa., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa.]",0,"department of psychiatry, new york university grossman school of medicine, new york, ny, usa. ks3796@cumc.columbia.edu., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa., harvard paulson school of engineering & applied sciences, boston, ma, usa., department of psychiatry, new york university grossman school of medicine, new york, ny, usa., department of psychiatry, new york university grossman school of medicine, new york, ny, usa., integrative systems biology, us army center for environmental health research, usacehr, fort detrick, frederick, md, usa., mclean hospital, harvard university, boston, ma, usa., integrative systems biology, us army center for environmental health research, usacehr, fort detrick, frederick, md, usa., harvard paulson school of engineering & applied sciences, boston, ma, usa., department of obstetrics, gynecology & reproductive sciences, university of california, san francisco, ca, usa., department of psychiatry/weill institute for neurosciences, university of california, san francisco, ca, usa., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa., alto neuroscience, inc., los altos, ca, usa., mclean hospital, harvard university, boston, ma, usa., harvard paulson school of engineering & applied sciences, boston, ma, usa., integrative systems biology, us army center for environmental health research, usacehr, fort detrick, frederick, md, usa., department of psychiatry, center for alcohol use disorder and ptsd, new york university grossman school of medicine, new york, ny, usa.",inc\.,1
45472,31925552,"[department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., selvas ai, seoul, republic of korea., selvas ai, seoul, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea. chung646@yuhs.ac.]",0,"department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., selvas ai, seoul, republic of korea., selvas ai, seoul, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea., department of urology, yonsei university college of medicine, 211 eonju-ro, gangnam-gu, seoul, 135-720, republic of korea. chung646@yuhs.ac.",selvas,1
22234,33211025,"[department of urology, ten-chan general hospital, taoyuan, taiwan., division of research and development, createcare technology corporation, shenzhen, china., department of urology, national taiwan university hospital, taipei, taiwan., department of urology, national taiwan university hospital, taipei, taiwan., department of urology, national taiwan university hospital, taipei, taiwan.]",0,"department of urology, ten-chan general hospital, taoyuan, taiwan., division of research and development, createcare technology corporation, shenzhen, china., department of urology, national taiwan university hospital, taipei, taiwan., department of urology, national taiwan university hospital, taipei, taiwan., department of urology, national taiwan university hospital, taipei, taiwan.",corporation,1
38027,32374378,"[school of nursing, university of pennsylvania, philadelphia, pennsylvania, usa., school of nursing, university of pennsylvania, philadelphia, pennsylvania, usa., family medicine, school of medicine, university of missouri, columbia, missouri, usa., family medicine, school of medicine, university of missouri, columbia, missouri, usa., live circle inc, ridgewood, new jersey, usa., live circle inc, ridgewood, new jersey, usa., live circle inc, ridgewood, new jersey, usa., school of nursing, university of pennsylvania, philadelphia, pennsylvania, usa.]",0,"school of nursing, university of pennsylvania, philadelphia, pennsylvania, usa., school of nursing, university of pennsylvania, philadelphia, pennsylvania, usa., family medicine, school of medicine, university of missouri, columbia, missouri, usa., family medicine, school of medicine, university of missouri, columbia, missouri, usa., live circle inc, ridgewood, new jersey, usa., live circle inc, ridgewood, new jersey, usa., live circle inc, ridgewood, new jersey, usa., school of nursing, university of pennsylvania, philadelphia, pennsylvania, usa.","inc,",1


In [34]:
export_df['comm_flag'].value_counts()

0    29780
1     2692
Name: comm_flag, dtype: int64

In [35]:
export_df['commercial'].value_counts(sort=True)

0                         29780
 inc\.                      317
 co\.,                      225
 ltd                        142
 inc,                       119
 ge healthcare               95
siemens                      87
corporation                  81
ibm                          79
 gmbh                        71
 co\.                        69
 solutions                   68
 llc                         60
 company                     47
deepwise                     34
 enterprise                  34
limited                      32
philips healthcare           29
n\.v                         28
google                       26
infervision                  26
janssen                      23
atheropoint                  22
 labs,                       22
pfizer                       21
tencent                      20
microsoft                    20
 ag,                         20
\.ai                         18
nvidia                       18
roche                        17
bosch   

In [42]:
export_df_1 = export_df[export_df['joined_affils']=='n, a']

In [43]:
len(export_df_1)

10248

In [44]:
export_df = export_df.drop('author_affils', axis=1)

In [45]:
export_df.to_csv('comm_affil.csv')

In [47]:
export_df_1.to_csv('no_affils.csv')