In [1]:
import json
import ads
import orcid
import bibtexparser
import tqdm
import lxml
import requests
import datetime
import pandas as pd
import numpy as np

In [326]:
def merge_orcids(paper_record):
    orcid_df = pd.DataFrame(columns=['orcid_user', 'orcid_other', 'orcid_pub'])
    for col in orcid_df.columns:
        if getattr(paper_record, col) is None:
            continue
        if len(paper_record.author)!= len(getattr(paper_record, col)): continue
        orcid_df[col] = getattr(paper_record, col)
    orcid_df = orcid_df.replace('-', np.nan)

    return orcid_df.orcid_other.combine_first(orcid_df.orcid_pub).combine_first(orcid_df.orcid_user)

def generate_author_df(paper_record):
    author_df = pd.DataFrame(columns=['author_name', 'orcid'])
    assert len(paper_record.author) == len(merge_orcids(paper_record).values)
    author_df['author_name'] = paper_record.author
#    author_df['affiliation'] = paper_record.aff
    author_df['orcid'] = merge_orcids(paper_record).values
    return author_df

def generate_coa_table_from_orcid(orcids, orcid_api):
    coa_df = pd.DataFrame(columns=['first_name', 'last_name', 'organization'])
    for orcid in orcids:
        orcid_record = orcid_api.read_record_public(orcid, 'record', token)

        orcid_activities = orcid_record['activities-summary']
        orcid_name = orcid_record['person']['name']
        family_name = orcid_name['family-name']['value']
        first_name = orcid_name['given-names']['value']
        try:
            org = orcid_activities['employments']['employment-summary'][0]['organization']['name']
        except IndexError:
            
            try:
                org = orcid_activities['educations']['education-summary'][0]['organization']['name']
            except IndexError:
                org = None
        
        organization = org
        coa_df = coa_df.append({'first_name':first_name, 'last_name':family_name, 'organization':organization}, ignore_index=True)
    return coa_df


## Generating Publication statistics ##

In [2]:
ads_key = json.load(open('keys.json'))['ads_key']
ads.config.token = ads_key

In [62]:
orcid_access = json.load(open('keys.json'))
orcid_key = orcid_access['orcid_client_id']
orcid_secret = orcid_access['orcid_client_secret']
api = orcid.PublicAPI(orcid_key, orcid_secret)
token = api.get_search_token_from_orcid()

In [167]:
cur_time = datetime.datetime.now()
past_time = cur_time - datetime.timedelta(days=48*31)
co_author_query = ('author:"kerzendorf" database:astronomy property:article '
                   'pubdate:[{past_time.year}-{past_time.month} TO {cur_time.year}-{cur_time.month}]').format(past_time=past_time, cur_time=cur_time)

In [168]:
kerzendorf_query = ads.SearchQuery(q=co_author_query, rows=1000)
papers = {}
for paper in kerzendorf_query:
    papers[paper.bibcode] = paper
print(kerzendorf_query.response.get_ratelimits())
print("Found {0} papers matching the search".format(len(papers)))

{'limit': '5000', 'remaining': '4810', 'reset': '1541779168'}
Found 30 papers matching the search


In [240]:
conflict_df = pd.DataFrame(columns=['author_name', 'orcid'])
for paper in papers.values():
    if len(paper.author)>1000:
        continue
    author_df = generate_author_df(paper)
    if 'kerzendorf' in paper.first_author.lower():
        conflict_df = conflict_df.append(author_df.iloc[1:], ignore_index=True, sort=False)
    elif 'collaboration' in paper.first_author.lower():
        pass
    else:
        conflict_df = conflict_df.append(author_df.iloc[0], ignore_index=True, sort=False)

In [241]:
conflict_df['author_last'] = conflict_df.author_name.str.split(', ').str[0]
conflict_df = conflict_df.sort_values(['author_last', 'orcid'])

In [242]:
conflict_df = conflict_df.drop_duplicates()

In [243]:
conflict_df = conflict_df.drop_duplicates('author_last', keep='first')

In [245]:
saved_orcids = pd.read_csv('conflict_orcids.csv', index_col=0)['orcid']

In [247]:
conflict_df.loc[saved_orcids.index, 'orcid'] = saved_orcids

In [327]:
final_coa = generate_coa_table_from_orcid(conflict_df.orcid.dropna().values, api)


In [337]:
no_orcid = conflict_df[conflict_df.orcid.isnull()]
no_orcid['last_name'] = no_orcid.author_name.str.split(', ').str[0]
no_orcid['first_name'] = no_orcid.author_name.str.split(', ').str[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [339]:
final_coa = final_coa.append(no_orcid[['first_name', 'last_name']], ignore_index=True)

In [342]:
final_coa.to_excel('final_coa.xls')

In [346]:
final_coa['concat_name'] = (final_coa.last_name+', '+final_coa.first_name).values