In [1]:
import pandas as pd
import numpy as np

import requests

# Scopus CSV export loaded by this notebook (relative path, resolved against the working directory).
datafile = 'SCOPUS Disease Control through Social Network Surveillance.csv'

In [2]:
# Load the Scopus export into a DataFrame using pandas' default CSV parsing options.
df = pd.read_csv(datafile)

In [3]:
# Count missing values per column to see which fields are sparsely populated.
df.isnull().sum()

Authors                            0
Author(s) ID                       0
Title                              0
Year                               0
Source title                       0
Volume                            15
Issue                            100
Art. No.                         799
Page start                       150
Page end                         160
Page count                       934
Cited by                         105
DOI                              116
Link                               0
Affiliations                      30
Authors with affiliations         11
Abstract                           0
Author Keywords                  429
Index Keywords                    37
Funding Details                  716
Funding Text 1                   732
Funding Text 2                   912
Funding Text 3                   938
References                       121
Correspondence Address            59
Editors                          937
Publisher                        446
I

In [4]:
def _author_surname(name: str) -> str:
    """Return the surname part of a 'Surname Initials.' author string.

    Trailing tokens that contain a dot (initials such as 'J.D.' or 'K.-H.')
    are dropped; at least one token is always kept. Multi-word surnames such
    as 'De Boer' are therefore preserved instead of being cut to 'De'.
    """
    tokens = name.split()
    if not tokens:
        return ''
    while len(tokens) > 1 and '.' in tokens[-1]:
        tokens.pop()
    return ' '.join(tokens)


def shorten_auths(auths: list):
    """Build a short citation label from a list of author names.

    Parameters
    ----------
    auths : list of str
        Author names, each formatted as 'Surname Initials.' (surrounding
        whitespace is tolerated).

    Returns
    -------
    str
        'Unknown' when the list is empty or holds the
        '[No author name available]' placeholder; the surname for one author;
        'A & B' for two authors; 'A et al' for three or more.
    """
    # An empty list used to raise IndexError; treat it like the placeholder.
    if not auths or auths[0].strip() == '[No author name available]':
        return 'Unknown'

    first = _author_surname(auths[0])
    if len(auths) == 1:
        return first
    if len(auths) == 2:
        return first + ' & ' + _author_surname(auths[1])
    return first + ' et al'

In [5]:
# Citation-style label per source row: shortened author list followed by the year in parentheses.
df['abbrev'] = [
    shorten_auths(author_field.split(',')) + ' ({})'.format(pub_year)
    for author_field, pub_year in zip(df['Authors'], df['Year'])
]

In [6]:
# List every column name, then render the frame itself as the cell's output.
display(df.columns)
df

Index(['Authors', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume',
       'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
       'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract',
       'Author Keywords', 'Index Keywords', 'Funding Details',
       'Funding Text 1', 'Funding Text 2', 'Funding Text 3', 'References',
       'Correspondence Address', 'Editors', 'Publisher', 'ISSN', 'ISBN',
       'CODEN', 'PubMed ID', 'Language of Original Document',
       'Abbreviated Source Title', 'Document Type', 'Publication Stage',
       'Access Type', 'Source', 'EID', 'abbrev'],
      dtype='object')

Unnamed: 0,Authors,Author(s) ID,Title,Year,Source title,Volume,Issue,Art. No.,Page start,Page end,...,CODEN,PubMed ID,Language of Original Document,Abbreviated Source Title,Document Type,Publication Stage,Access Type,Source,EID,abbrev
0,"Band J.D., Chamberland M.E., Platt T., Weaver ...",7003937394;7006018311;7102275983;7202668225;71...,Trends in meningococcal disease in the United ...,1983,Journal of Infectious Diseases,148,4,,754,758,...,JIDIA,,English,J. INFECT. DIS.,Article,Final,,Scopus,2-s2.0-0020959176,Band et al (1983)
1,"Kaewsonthi S., Harding A.G.",6602421048;7202709554;,Cost and performance of malaria surveillance i...,1984,Social Science and Medicine,19,10,,1081,1097,...,SSMDE,6441262.0,English,Soc. Sci. Med.,Article,Final,,Scopus,2-s2.0-0021644285,Kaewsonthi & Harding (1984)
2,"Zessin K.-H., Carpenter T.E.",35580327500;7101875070;,Benefit-cost analysis of an epidemiologic appr...,1985,Preventive Veterinary Medicine,3,4,,323,337,...,PVMEE,,English,Prev. Vet. Med.,Article,Final,,Scopus,2-s2.0-10644293118,Zessin & Carpenter (1985)
3,"Graitcer P.L., Burton A.H.",6701583731;57197463085;,The epidemiology surveillance project: A compu...,1987,American Journal of Preventive Medicine,3,3,,123,127,...,AJPME,2838060.0,English,AM. J. PREV. MED.,Article,Final,,Scopus,2-s2.0-0023159175,Graitcer & Burton (1987)
4,[No author name available],[No author id available],Progress toward achieving the national 1990 ob...,1988,MMWR. Morbidity and mortality weekly report,37,40,,613,617,...,,3139981.0,English,MMWR Morb Mortal Wkly Rep,Article,Final,,Scopus,2-s2.0-0024286285,Unknown (1988)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
934,"Gunderson A.K., Kumar R.E., Recalde-Coronel C....",57219388742;57219387281;57219394488;6504626981...,Malaria transmission and spillover across the ...,2020,International Journal of Environmental Researc...,17,20,7434,1,9,...,,33066022.0,English,Int. J. Environ. Res. Public Health,Article,Final,Open Access,Scopus,2-s2.0-85092486863,Gunderson et al (2020)
935,"Chow E.J., Rolfes M.A., O'Halloran A., Anderso...",57218300571;57212275884;55954035600;5671461440...,Acute Cardiovascular Events Associated With In...,2020,Annals of internal medicine,173,8,,605,613,...,,32833488.0,English,Ann Intern Med,Article,Final,,Scopus,2-s2.0-85093894765,Chow et al (2020)
936,"Leuba S.I., Yaesoubi R., Antillon M., Cohen T....",57204167461;24588193800;57086624700;7202415780...,Tracking and predicting U.S. influenza activit...,2020,PLoS Computational Biology,16,11,e1008180,,,...,,33137088.0,English,PLoS Comput. Biol.,Article,Final,Open Access,Scopus,2-s2.0-85095728066,Leuba et al (2020)
937,"Bottichio L., Keaton A., Thomas D., Fulton T.,...",56097669200;57202338147;57218207411;5721987591...,Shiga Toxin-Producing Escherichia coli Infecti...,2020,Clinical infectious diseases : an official pub...,71,8,,e323,e330,...,,31814028.0,English,Clin Infect Dis,Article,Final,Open Access,Scopus,2-s2.0-85095861021,Bottichio et al (2020)


In [7]:
# Keep an untouched copy of the full table, then trim `df` down to the
# columns that are not listed below (bibliographic identity, DOI,
# References, Source and the `abbrev` label remain).
df_orig = df.copy()
df = df.drop(
    labels=[
        'Volume',
        'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by',
        'Link', 'Affiliations', 'Authors with affiliations', 'Abstract',
        'Author Keywords', 'Index Keywords', 'Funding Details',
        'Funding Text 1', 'Funding Text 2', 'Funding Text 3',
        'Correspondence Address', 'Editors', 'Publisher', 'ISSN', 'ISBN',
        'CODEN', 'PubMed ID', 'Language of Original Document',
        'Abbreviated Source Title', 'Document Type', 'Publication Stage',
        'Access Type', 'EID',
    ],
    axis='columns',
)

In [8]:
# Drop sources without a reference list, then split each ';'-separated
# References string into a Python list stored in `RefList`.
df = (
    df.dropna(subset=['References'])
      .assign(RefList=lambda frame: frame['References'].str.split(';'))
)

In [39]:
# One row per cited reference; entries containing "http" are discarded.
# Author's note: was 27,503 rows before adding this filter, which removed ~4100 refs.
df_refs = df.explode('RefList')[
    lambda refs: ~refs.RefList.str.contains("http", na=False)
]

In [40]:
import re
# Strict fallback pattern for references shaped like
#   "Surname, I., Surname, I.J., <text> (YYYY) <title> pp. N-M. , <publication>"
# re.VERBOSE ignores unescaped whitespace inside the pattern, so every literal
# space is spelled "[ ]" (the previous plain spaces were silently dropped and
# the pattern could not match "Surname, I., " at all). The pattern is a raw
# string so the regex escapes (\. \d \( ...) are not treated as invalid
# Python string escapes.
res2 = re.compile(r'''
        (?P<ref_authors>([A-Z][a-z]+,[ ]([A-Z]\.)+,[ ])+).*(?=\()\(
        (?P<ref_year>\d{4})\)[ ]
        (?P<ref_title>.*)
        (?=pp\.[ ])pp\.[ ](?P<pp>\d{1,5}-\d{1,5})(\.[ ]?,[ ])?
        (?P<pub>.+)
        ''', re.VERBOSE)


def ref_res(ref):
    """Split one raw reference string into authors, year and title.

    Three patterns are tried in order and the first match wins:

    1. ``<authors>, (YYYY)<title>``
    2. ``<authors>,<title>(YYYY)`` (the title starts right after a comma)
    3. ``res2`` (strict author list with page range and publication)

    Any text matched by ``res2`` has a comma before ``(YYYY)`` and is thus
    already caught by pattern 2; ``res2`` is kept as a last-resort fallback.

    Parameters
    ----------
    ref : str
        A single reference entry. Non-string values (e.g. NaN) are treated as
        unparseable instead of raising.

    Returns
    -------
    dict
        The named groups of the first matching pattern: always
        ``ref_authors``, ``ref_year`` and ``ref_title`` (as matched, not
        stripped), plus ``pp`` and ``pub`` when ``res2`` matched. When nothing
        matches, the three main keys are returned with NaN values.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    unparsed = {'ref_authors': np.nan, 'ref_year': np.nan, 'ref_title': np.nan}
    if not isinstance(ref, str):
        return unparsed

    patterns = (
        r'(?P<ref_authors>.*)(, )\((?P<ref_year>\d{4})\)(?P<ref_title>.*)',
        r'(?P<ref_authors>.*)(?P<ref_title>(?<=,).*)\((?P<ref_year>\d{4})\)',
        res2,
    )
    for pattern in patterns:
        match = re.search(pattern, ref)
        # groupdict() always contains every named group of a successful match,
        # so a truthy match object is the only check needed.
        if match:
            return match.groupdict()
    return unparsed

# Turn the (repeated) source-row index into a regular `srcID` column, then
# append the fields parsed from each reference string as new columns.
df_refs = df_refs.reset_index().rename(columns={'index': 'srcID'})
df_refs = pd.concat(
    [
        df_refs,
        df_refs.apply(lambda row: pd.Series(ref_res(row.RefList)), axis=1),
    ],
    axis=1,
)


In [41]:
# Report the number of reference rows and the percentage whose `ref_title`
# is non-null, i.e. rows for which one of the parsing patterns matched.
print('total count of refs:',len(df_refs),
'\n\n% refs successfully parsed:', 100*(1-df_refs.ref_title.isnull().sum()/len(df_refs)))

total count of refs: 23418 

% refs successfully parsed: 88.59424374412845


In [42]:
# Discard references that could not be parsed (no authors extracted).
# Done in place: `df_refs` itself is modified, so re-running later cells sees the reduced table.
df_refs.dropna(subset=['ref_authors'], inplace=True)

In [43]:
def clean_auth(auth: str) -> str:
    """Reduce one 'Surname, I.J' style fragment to the bare surname.

    Everything after the first comma is dropped, then everything from the
    first capital-letter-plus-dot initial onwards. The result is always
    stripped of surrounding whitespace.
    """
    if ',' in auth:
        auth = auth.split(',')[0]
    if '.' in auth:
        # Raw string: '\.' is a regex escape, not a Python string escape.
        auth = re.split(r'[A-Z]\.', auth)[0]
    return auth.strip()


def shorten_ref_auths(auths: str) -> str:
    """Build a short citation label from a parsed reference author string.

    Parameters
    ----------
    auths : str
        Author portion of a reference, e.g. 'Thompson, R.N., Isham, V.,' or
        'De Boer,'.

    Returns
    -------
    str
        'Unknown' for empty/non-string input or the
        '[No author name available]' placeholder; otherwise the first surname,
        'A & B' for two authors, or 'A et al' for three or more.
    """
    if not isinstance(auths, str):
        return 'Unknown'
    auths = auths.strip()
    # The input is a string, so the placeholder must be compared against the
    # text itself (indexing [0] only yields its first character).
    if not auths or auths.startswith('[No author name available]'):
        return 'Unknown'

    if '.' in auths:
        # Authors with initials are separated by '.,' ("Adam, D., Smith, J.").
        fragments = auths.split('.,')
    else:
        fragments = auths.strip(', ').split(',')

    # A trailing separator leaves an empty fragment; it is not an author.
    fragments = [frag for frag in fragments if frag.strip(' ,')]
    if not fragments:
        return 'Unknown'

    first = clean_auth(fragments[0])
    if len(fragments) == 1:
        return first
    if len(fragments) == 2:
        return first + ' & ' + clean_auth(fragments[1])
    return first + ' et al'

In [44]:
# Citation-style label per reference row: shortened cited authors followed by the cited year.
df_refs['ref_abbrev'] = [
    shorten_ref_auths(cited_authors) + ' ({})'.format(cited_year)
    for cited_authors, cited_year in zip(df_refs['ref_authors'], df_refs['ref_year'])
]

In [45]:
# Render the parsed reference table as the cell's output.
df_refs

Unnamed: 0,srcID,Authors,Author(s) ID,Title,Year,Source title,DOI,References,Source,abbrev,RefList,ref_authors,ref_title,ref_year,ref_abbrev
0,2,"Zessin K.-H., Carpenter T.E.",35580327500;7101875070;,Benefit-cost analysis of an epidemiologic appr...,1985,Preventive Veterinary Medicine,10.1016/0167-5877(85)90010-8,"De Boer, Final project report on vaccination c...",Scopus,Zessin & Carpenter (1985),"De Boer, Final project report on vaccination c...","De Boer,",Final project report on vaccination campaign,1979,De Boer (1979)
1,2,"Zessin K.-H., Carpenter T.E.",35580327500;7101875070;,Benefit-cost analysis of an epidemiologic appr...,1985,Preventive Veterinary Medicine,10.1016/0167-5877(85)90010-8,"De Boer, Final project report on vaccination c...",Scopus,Zessin & Carpenter (1985),"El Nasri, Present status of diseases and dise...","El Nasri,",Present status of diseases and disease control,1966,El Nasri (1966)
2,2,"Zessin K.-H., Carpenter T.E.",35580327500;7101875070;,Benefit-cost analysis of an epidemiologic appr...,1985,Preventive Veterinary Medicine,10.1016/0167-5877(85)90010-8,"De Boer, Final project report on vaccination c...",Scopus,Zessin & Carpenter (1985),"Ministry of Agriculture, Forestry and Natural...","Ministry of Agriculture, Forestry and Natural...",Sudan national livestock census and natural r...,1976,Ministry of Agriculture et al (1976)
3,2,"Zessin K.-H., Carpenter T.E.",35580327500;7101875070;,Benefit-cost analysis of an epidemiologic appr...,1985,Preventive Veterinary Medicine,10.1016/0167-5877(85)90010-8,"De Boer, Final project report on vaccination c...",Scopus,Zessin & Carpenter (1985),"Piercy, Present methods of control of contagi...","Piercy,",Present methods of control of contagious bovi...,1958,Piercy (1958)
4,2,"Zessin K.-H., Carpenter T.E.",35580327500;7101875070;,Benefit-cost analysis of an epidemiologic appr...,1985,Preventive Veterinary Medicine,10.1016/0167-5877(85)90010-8,"De Boer, Final project report on vaccination c...",Scopus,Zessin & Carpenter (1985),"Project Development Unit (PDU), (1978) Season...",Project Development Unit (PDU),Seasonal and longterm changes in the marketed...,1978,Project Development Unit (PDU) (1978)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23413,938,"Renardy M., Eisenberg M., Kirschner D.",57201006568;15762032100;7006592090;,Predicting the second wave of COVID-19 in Wash...,2020,Journal of Theoretical Biology,10.1016/j.jtbi.2020.110461,"Adam, D., Special report: the simulations driv...",Scopus,Renardy et al (2020),"Thompson, R.N., Hollingsworth, T.D., Isham, V...","Thompson, R.N., Hollingsworth, T.D., Isham, V...",", Key questions for modelling COVID-19 exit st...",2020,Thompson et al (2020)
23414,938,"Renardy M., Eisenberg M., Kirschner D.",57201006568;15762032100;7006592090;,Predicting the second wave of COVID-19 in Wash...,2020,Journal of Theoretical Biology,10.1016/j.jtbi.2020.110461,"Adam, D., Special report: the simulations driv...",Scopus,Renardy et al (2020),"Thunström, L., Newbold, S.C., Finnoff, D., As...","Thunström, L., Newbold, S.C., Finnoff, D., As...",The benefits and costs of using social distan...,2020,Thunström et al (2020)
23415,938,"Renardy M., Eisenberg M., Kirschner D.",57201006568;15762032100;7006592090;,Predicting the second wave of COVID-19 in Wash...,2020,Journal of Theoretical Biology,10.1016/j.jtbi.2020.110461,"Adam, D., Special report: the simulations driv...",Scopus,Renardy et al (2020),"Wei, W.E., Li, Z., Chiew, C.J., Yong, S.E., T...","Wei, W.E., Li, Z., Chiew, C.J., Yong, S.E., T...",2020,2020,Wei et al (2020)
23416,938,"Renardy M., Eisenberg M., Kirschner D.",57201006568;15762032100;7006592090;,Predicting the second wave of COVID-19 in Wash...,2020,Journal of Theoretical Biology,10.1016/j.jtbi.2020.110461,"Adam, D., Special report: the simulations driv...",Scopus,Renardy et al (2020),"Wheaton, W., Cajka, J., Chasteen, B., Wagener...","Wheaton, W., Cajka, J., Chasteen, B., Wagener...",Synthesized population databases: a US geospa...,2009,Wheaton et al (2009)


In [46]:
# Persist both tables as CSV files in the working directory:
# the trimmed source table and the one-row-per-reference table.
df.to_csv('dcsns_scopus_sources.csv')
df_refs.to_csv('dcsns_scopus_refs.csv')