# Conference Connections
This notebook demonstrates how to build a map of the research published by the attendees of a conference.

In [None]:
import numpy as np
import os
import pandas as pd
import tqdm

In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Apply the author's custom style sheet when it is available.
# The path is specific to one machine, so fall back to matplotlib's defaults
# elsewhere instead of raising. (`os` is imported in the first cell.)
style_fp = '/Users/zhafen/repos/clean-bold/clean-bold.mplstyle'
if os.path.exists( style_fp ):
    matplotlib.style.use( style_fp )

In [None]:
import ads

In [None]:
import verdict

In [None]:
import cc.atlas as atlas
import cc.cartography as cartography

# Analysis Parameters

In [None]:
# CSV listing the conference attendees; read with pandas in the retrieval cell,
# which uses its 'Name' and 'Institution' columns.
scientists_csv_fp = './data/attendees.csv'
# HDF5 file the per-scientist data is written to and later loaded from.
scientists_fp = './data/scientists.hdf5'
# Directory passed to the Atlas; projection.h5 and selected.h5 are also
# looked up inside it.
atlas_dir = './data'

In [None]:
# Switch for the cells under "Retrieve and Process Data": they only do work
# when this is True. With False they are skipped.
retrieve_data = False

In [None]:
# Manual overrides for the author string sent to ADS, keyed by the attendee's
# name as it appears in the 'Name' column. Attendees not listed here are
# queried as '<last word of name>, <first letter of name>'.
# NOTE: the interactive selection loop also skips every name listed here
# (that check is marked DEBUG there).
formatted_names = {
    'Chris Howk': 'Howk, J',
    'Mark Voit': 'Voit, G',
}

In [None]:
# Per-attendee text that must appear in a paper's BibTeX 'author' field for
# the paper to be offered in choose_ind; papers without it are skipped.
# Presumably used to reject papers by other authors sharing the same
# last name and first initial.
# NOTE(review): 'Nao ' has a trailing space -- presumably deliberate so that
# longer given names starting with "Nao" do not match; confirm.
bibtex_names = {
    'Carlos Vargas': 'Carlos J.',
    'Charlotte Christensen': 'Charlotte',
    'Evan Schneider': 'Evan',
    'Kate Rubin': 'Kate',
    'Michelle Berg': 'Michelle',
    'Nao Suzuki': 'Nao ',
    'Ramona Augustin': 'Ramona',
    'Stephanie Ho': 'Stephanie',
    'Yong Zheng': 'Yong',
}

In [None]:
# Fields requested for every paper; passed as `fl` to each ads.SearchQuery.
ads_field_list = [
    'abstract',
    'citation',
    'reference',
    'date',
    'entry_date',
    'author',
    'volume',
    'page',
    'bibcode',
    'arxivid',
    'doi',
]

# Retrieve and Process Data

In [None]:
if retrieve_data:
    scientists_df = pd.read_csv( scientists_csv_fp )

    # Per-scientist results are collected in an HDF5-backed dictionary
    scientists = verdict.Dict.from_hdf5( scientists_fp, create_nonexistent=True )

    # Bibcodes of every paper found, accumulated over all attendees
    bibcodes = []
    for i in tqdm.tqdm( scientists_df.index ):

        row = scientists_df.loc[i]
        name = row['Name']

        # Author string for ADS: the manual override when one exists,
        # otherwise "<last word>, <first letter of the first word>"
        if name in formatted_names:
            formatted_name = formatted_names[name]
        else:
            name_parts = name.split( ' ' )
            formatted_name = '{}, {}'.format( name_parts[-1], name_parts[0][0] )

        # All papers listing this author
        author_query = ads.SearchQuery(
            author = formatted_name,
            fl = ads_field_list,
            max_pages = 50,
        )
        bibcodes_i = [ paper.bibcode for paper in author_query ]

        # Only the papers with this author in first position
        first_author_query = ads.SearchQuery(
            first_author = formatted_name,
            fl = ads_field_list,
            max_pages = 50,
        )
        fa_bibcodes_i = [ paper.bibcode for paper in first_author_query ]

        scientists[name] = {
            'bibcodes': bibcodes_i,
            'first-author bibcodes': fa_bibcodes_i,
            'institution': row['Institution'],
        }

        bibcodes.extend( bibcodes_i )

    # Persist the results so later runs can skip the ADS queries
    scientists.to_hdf5( scientists_fp )

In [None]:
if retrieve_data:
    # Format data into a useable class:
    # build an Atlas in atlas_dir from every bibcode collected above, then
    # call its ADS-retrieval and save methods.
    a = atlas.Atlas.from_bibcodes( atlas_dir, bibcodes )
    a.get_ads_data()
    a.save_data()

In [None]:
if retrieve_data:
    # Vectorization
    # a.vectorize() returns a dict whose entries are passed as the keyword
    # arguments of the Cartographer constructor.
    vp_dict = a.vectorize()
    c = cartography.Cartographer( **vp_dict )

In [None]:
if retrieve_data:
    # Correlate vectorized data with bibcodes.
    # The publication keys are the same for every scientist, so wrap them
    # in a Series once and reuse it.
    publication_keys = pd.Series( c.publications )

    for name, scientist in scientists.items():

        # Positions in c.publications of every paper by this scientist
        is_authored = publication_keys.isin( scientist['bibcodes'] )
        scientist['indices'] = is_authored.index[is_authored].values

        # Positions in c.publications of their first-author papers
        is_first_authored = publication_keys.isin( scientist['first-author bibcodes'] )
        scientist['first-author indices'] = is_first_authored.index[is_first_authored].values

    scientists.to_hdf5( scientists_fp )

# Load Processed Data

In [None]:
# Load the per-scientist data written by the retrieval cells (bibcodes,
# first-author bibcodes, institution, and indices into c.publications).
scientists = verdict.Dict.from_hdf5( scientists_fp )

In [None]:
# Open the atlas stored in atlas_dir.
# NOTE(review): load_bibtex=False presumably skips reading a BibTeX file and
# relies on the data saved earlier -- confirm against atlas.Atlas.
a = atlas.Atlas( atlas_dir, load_bibtex=False )

In [None]:
# Load the cartographer from projection.h5 inside atlas_dir.
# NOTE(review): nothing visible in this notebook writes projection.h5
# explicitly; presumably a.vectorize() produces it -- confirm.
c = cartography.Cartographer.from_hdf5( os.path.join( atlas_dir, 'projection.h5' ) )

# Identify Papers of Interest

In [None]:
# Calculate number of citations
citations_per_year = np.array([ a[_].citations_per_year() for _ in c.publications ])


def _n_citations( key ):
    """Return the length of a publication's citation list, or 0 when taking
    the length raises a TypeError (e.g. no citation list is stored)."""
    try:
        return len( a[key].citations )
    except TypeError:
        return 0


# One count per publication, in the same order as c.publications
citations = np.array([ _n_citations( key ) for key in c.publications ])

In [None]:
def print_info( key ):
    """Print a short summary of a publication and return its cleaned title.

    Prints, in order: the title with all braces removed, the BibTeX author
    field when the citation has one, the key together with the entry date,
    and the abstract.

    Args:
        key: Key of the publication in the atlas `a`.

    Returns:
        The title with every '{' and '}' removed.
    """

    publication = a[key]
    bibtex = publication.citation

    # Strip the braces BibTeX uses to protect capitalization
    title = bibtex['title']
    for brace in ( '{', '}' ):
        title = title.replace( brace, '' )

    print( '    {}'.format( title ) )
    if 'author' in bibtex:
        print( '    {}'.format( bibtex['author'] ) )
    print( '    {}    {}\n'.format( key, publication.entry_date ) )
    print( '    {}\n\n'.format( publication.abstract ) )

    return title

In [None]:
def choose_ind( sorted_inds, name ):
    """Interactively pick one publication from an ordered list of candidates.

    Candidates are shown one at a time, in the order given. Pressing Enter
    with no other input at the prompt accepts the displayed candidate; any
    other input moves on to the next one.

    Args:
        sorted_inds: Indices into c.publications, in order of preference.
        name: Attendee name, used to look up the author text the paper's
            BibTeX author field must contain (see bibtex_names).

    Returns:
        The first accepted index, or None when every candidate was
        filtered out or rejected (including when sorted_inds is empty).
    """

    # Substrings of the key that mark entries to skip outright
    excluded_tags = ( 'AAS', 'PhDT', 'IAU', 'prop' )

    for ind in sorted_inds:
        key = c.publications[ind]

        # Filters
        if any( tag in key for tag in excluded_tags ):
            continue

        # When an author text is registered for this attendee and the paper
        # has an author field, require the text to appear in it
        required_text = bibtex_names.get( name )
        if required_text is not None:
            bibtex = a[key].citation
            if 'author' in bibtex and required_text not in bibtex['author']:
                continue

        print_info( key )

        if input( 'Accept? ' ) == '':
            return ind

    return None

In [None]:
# Resume from previously saved selections when they exist, otherwise start
# from scratch.
selected_fp = os.path.join( atlas_dir, 'selected.h5' )
if os.path.exists( selected_fp ):
    selected = verdict.Dict.from_hdf5( selected_fp )
    # The index entries are saved as numpy arrays, which have no append();
    # convert everything to lists so the selection loop can keep appending.
    # NOTE(review): if the names come back from HDF5 as bytes, the
    # `name in names` check in the selection loop will not match -- confirm.
    names = list( selected['names'] )
    inds_recent = list( selected['most recent indices'] )
    fa_inds_recent = list( selected['most recent first-author indices'] )
    inds_lowcite = list( selected['least-cited indices'] )
else:
    names = []
    inds_recent = []
    fa_inds_recent = []
    inds_lowcite = []

In [None]:
# Interactively choose three papers per scientist: the most recent paper,
# the most recent first-author paper, and the least-cited (but cited)
# other first-author paper.
for name, scientist in tqdm.tqdm( scientists.items() ):

    # DEBUG
    # NOTE(review): this skips every attendee that has a manual ADS-name
    # override; it looks like a debugging leftover -- confirm before relying
    # on the results for those attendees.
    if name in formatted_names:
        continue

    # Don't duplicate progress
    if name in names:
        continue

    print( '{}:'.format( name ) )

    inds = scientist['indices']
    fa_inds = scientist['first-author indices']

    # Find the most recent paper
    print( '    MOST RECENT' )
    sorted_recent = np.argsort( c.entry_dates[inds] )[::-1]
    ind_most_recent = choose_ind( inds[sorted_recent], name )

    # Find the most recent first-author paper
    print( '    MOST RECENT FIRST-AUTHOR' )
    sorted_recent = np.argsort( c.entry_dates[fa_inds] )[::-1]
    fa_ind_most_recent = choose_ind( fa_inds[sorted_recent], name )

    # Filter out the paper we've already selected.
    # choose_ind returns None when nothing was accepted; then there is
    # nothing to remove. dtype=int keeps an empty result usable as an index
    # array (an empty array is float by default and cannot index).
    fa_inds_o = list( fa_inds )
    if fa_ind_most_recent is not None:
        fa_inds_o.remove( fa_ind_most_recent )
    fa_inds_o = np.array( fa_inds_o, dtype=int )

    # Find the lowest-cited paper (with more than zero citations)
    print( '    LOWEST CITATIONS' )
    nonzero_citations = citations[fa_inds_o] > 0
    sorted_lowcite = np.argsort( citations[fa_inds_o[nonzero_citations]] )
    ind_lowcite = choose_ind( fa_inds_o[nonzero_citations][sorted_lowcite], name )

    # Only record complete selections: the lists are later converted to
    # integer index arrays, which cannot hold None. A skipped scientist is
    # offered again the next time this cell runs.
    picks = ( ind_most_recent, fa_ind_most_recent, ind_lowcite )
    if any( pick is None for pick in picks ):
        print( '    Incomplete selection for {}; not recording.\n'.format( name ) )
        continue

    names.append( name )
    inds_recent.append( ind_most_recent )
    fa_inds_recent.append( fa_ind_most_recent )
    inds_lowcite.append( ind_lowcite )

In [None]:
# Convert each list of selections to an array once; the arrays are used both
# to look up the publication keys and as the stored indices.
recent_arr = np.array( inds_recent )
fa_recent_arr = np.array( fa_inds_recent )
lowcite_arr = np.array( inds_lowcite )

selected = verdict.Dict({
    'names': names,
    'most recent': c.publications[recent_arr],
    'most recent first-author': c.publications[fa_recent_arr],
    'least-cited': c.publications[lowcite_arr],
    'most recent indices': recent_arr,
    'most recent first-author indices': fa_recent_arr,
    'least-cited indices': lowcite_arr,
})

# Written to the same file the resume step looks for
selected.to_hdf5( os.path.join( atlas_dir, 'selected.h5' ) )

# Map

In [None]:
def new_c( inds ):
    """Make a new cartographer from a subset of a previous one.

    Args:
        inds: Indices into c.publications of the publications to keep.

    Returns:
        A cartography.Cartographer built from the selected publications of
        the module-level cartographer `c`.
    """

    # Attributes with one entry per publication; these are subset by inds.
    per_publication_attrs = [
        'vectors',
        'norms',
        'publications',
        'publication_dates',
        'entry_dates',
    ]
    vp_dict = {
        attr: getattr( c, attr )[inds]
        for attr in per_publication_attrs
    }

    # feature_names is passed through unchanged: selecting publications keeps
    # every component of the remaining vectors, so indexing the names with
    # publication indices would mislabel them.
    # NOTE(review): assumes feature_names has one entry per vector component
    # rather than per publication -- confirm against cartography.Cartographer.
    vp_dict['feature_names'] = c.feature_names

    return cartography.Cartographer( **vp_dict )

In [None]:
# Build a cartographer restricted to one scientist's first-author papers.
# NOTE: fa_inds is whatever an earlier cell left behind, i.e. the
# first-author indices of the last scientist a loop above assigned it for.
c_fa = new_c( fa_inds )

In [None]:
# Pick the publication to map: the last one in the subset.
i = -1
key = c_fa.publications[i]

In [None]:
# Compute the map data for the chosen publication; the result is handed to
# plot_map below.
# NOTE: this overwrites the module-level `inds` used by the loops above.
coords, inds, pairs = c_fa.map( key, max_links=6, )

In [None]:
fig = plt.figure()
ax = plt.gca()

# Plot the precomputed map onto the axes created above.
# The second return value of plot_map is discarded.
img, _ = c_fa.plot_map(
    key,
    data = ( coords, inds, pairs ),
    scatter = True,
#     links = True,
    histogram = False,
    ax = ax,
    labels = True,
)

# Cautions and Challenges

## Scientists with the same names

Some scientists have a very large number of works attributed to them, possibly because the ADS author search (last name plus first initial) also matches other scientists with the same name.

In [None]:
# Count the bibcodes retrieved for each scientist and list the scientists
# with more than 100 of them.
# NOTE(review): presumably inner_item/apply/array/keys_array act per
# scientist and preserve key order -- confirm against verdict.Dict.
n_papers = scientists.inner_item( 'bibcodes' ).apply( len )
lots_of_papers = n_papers.array() > 100
n_papers.keys_array()[lots_of_papers]

In [None]:
# Can potentially use orcid to identify publications actually associated with the scientist.

# orcids = {
#     'Stephanie Ho': '0000-0002-9607-7365',
#     'Evan Schneider': '0000-0001-9735-7484',
#     'Nao Suzuki': '0000-0001-7266-930X',
#     'Kate Rubin': '0000-0001-6248-1864',
#     'Alison Coil': '0000-0002-2583-5894',
#     'Benjamin Weiner': '0000-0001-6065-7483',
# }

## Unofficial Publications

The ADS results include many items that are not regular journal papers, e.g. PhD theses, proposals, and conference abstracts.

## Specifically Targeting Coauthor Works
E.g. if we wanted to find the works of each scientist's most-junior collaborator.

Unfortunately it takes a while to search ADS for this data.

In [None]:
# Build a list of coauthors:
# the distinct first authors of every paper a scientist is on but did not lead
unique_first_authors = set()
for name, scientist in tqdm.tqdm( scientists.items() ):

    inds = scientist['indices']
    fa_inds = scientist['first-author indices']

    # Co-author papers: those not among the scientist's first-author papers
    co_inds = inds[np.isin( inds, fa_inds, invert=True )]

    for co_ind in co_inds:
        key = c.publications[co_ind]
        try:
            authors = a[key].citation['author']
            first_author = authors.split( ' and ' )[0].replace( '{', '' ).replace( '}', '' )
        except ( ValueError, KeyError ):
            # No usable author information for this paper
            continue
        unique_first_authors.add( first_author )

coauthors = list( unique_first_authors )

In [None]:
# Average time, in seconds, to retrieve one author's data from ADS.
# Based on above search (presumably the timing of the ADS retrieval cell).
avg_seconds_per_author = 15.

In [None]:
# Estimated total query time: seconds first, then converted to hours.
time_to_complete = len( coauthors ) * avg_seconds_per_author
hours_to_complete = time_to_complete / 3600

In [None]:
# Report the estimate, with the hours rounded to a whole number.
print( 'It would take {:.0f} hrs to search ADS for data on all {} coauthors.'.format( hours_to_complete, len( coauthors ) ) )