In [None]:
import ast
import copy
import numpy as np
import os
import pandas as pd
import scipy
import scipy.sparse as ss

In [None]:
import cc.atlas as atlas
import cc.cartography as cartography
import cc.publication as publication
import cc.utils as utils
import cc.tex as tex

In [None]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use( '~/repos/clean-bold/clean-bold.mplstyle' )
import palettable

In [None]:
import trove
import verdict

# Literature Review

This notebook contains a record of how I performed the literature review for this work.

# Parameters

In [None]:
# Path to the trove config describing this literature review.
# NOTE(review): absolute, machine-specific path — the notebook only runs as-is where this file exists.
config_fp = '/Users/zhafen/paper_repos/cgm_modeling_challenge_paper/literature_review/literature_review.trove'
# Parser over the same config; used below to iterate over cp.variations.
cp = trove.config_parser.ConfigParser( config_fp )
# Parameters linked to the 'multicloud_modeling' variation of the config.
pm = trove.link_params_to_config(
    config_fp,
    variation = 'multicloud_modeling',
)

In [None]:
atlas_dir = pm['root_data_dir']

In [None]:
# For every variation in the config, collect the publications that define the
# topic and an optional free-text search string.
topics = {}
search_strs = {}
for variation in cp.variations:
    topics[variation] = ast.literal_eval( cp.get( variation, 'publications' ) )
    try:
        search_strs[variation] = ast.literal_eval( cp.get( variation, 'search_str' ) )
    except Exception:
        # A missing or unparsable search string falls back to an empty one.
        # `Exception` instead of a bare `except` so that KeyboardInterrupt and
        # SystemExit still propagate.
        search_strs[variation] = ''
    # A topic made of a single paper and no search string duplicates that paper.
    if ( len( topics[variation] ) == 1 ) and ( search_strs[variation] == '' ):
        raise ValueError( 'Topics that are just copies of papers will be tossed out. Use the paper itself instead or add a search string.' )

# Load Data

## Notes

In [None]:
# Review notes are kept in notes.json under the root data directory.
# create_nonexisting=True is passed — presumably so a missing file yields an empty Dict; confirm in verdict.
notes_fp = os.path.join( pm['root_data_dir'], 'notes.json' )
notes = verdict.Dict.from_json( notes_fp, create_nonexisting=True )

In [None]:
# Make sure the list of already-addressed keys exists; later cells append to it.
if 'addressed' not in notes:
    notes['addressed'] = []

## Zotero Atlas

In [None]:
# Zotero atlas: same atlas directory, but data read from pm['zotero_atlas_fp']
a_zotero = atlas.Atlas( atlas_dir, load_bibtex=False, data_fp=pm['zotero_atlas_fp'] )

In [None]:
# Vectorization
# The dict returned by vectorize() is passed as keyword arguments to the Cartographer.
vp_dict = a_zotero.vectorize( projection_fp=pm['zotero_projection_fp'] )
c_zotero = cartography.Cartographer( **vp_dict )

## Large Atlas

In [None]:
# Larger atlas
# Loaded from atlas_dir without an explicit data_fp (unlike the Zotero atlas).
a = atlas.Atlas( atlas_dir, load_bibtex=False, )

## Tex Draft

In [None]:
pub_doc = tex.Tex( filepath=pm['tex_fp'] )

In [None]:
abstract = pub_doc.string.split( '\\begin{abstract}' )[1].split( '\\end{abstract}' )[0]

In [None]:
print( abstract )

# Incorporate Topics and Abstract

## Paper Abstract

In [None]:
p = publication.UnofficialPublication( pm['citation_key'] )

In [None]:
p.process_abstract( abstract_str = abstract )

In [None]:
a.data[p.citation_key] = copy.deepcopy( p )

## Topics

In [None]:
# Represent each topic as a synthetic publication: its text is the topic's
# search string followed by the abstracts of the topic's publications.
for key in topics:

    # Start from a copy of the search string, then append each abstract in order
    abstract_str = copy.copy( search_strs[key] )
    for cite_key in topics[key]:
        abstract_str += a[cite_key].abstract_str()

    # Store the synthetic publication in the atlas under the topic's key
    p = publication.UnofficialPublication( key )
    p.process_abstract( abstract_str=abstract_str )
    a.data[key] = copy.deepcopy( p )

## Emphasis Vector

In [None]:
# Each emphasis entry is ( text, coefficient[, key] ). Every entry becomes a
# synthetic publication so that it receives a vector of its own.
emph_keys = []
emph_coeffs = []
for emph_i in pm['emphasis_vector']:

    # The optional third element names the entry; otherwise the text is the key
    key = emph_i[2] if len( emph_i ) > 2 else emph_i[0]
    emph_keys.append( key )
    emph_coeffs.append( emph_i[1] )

    # Register the entry in the atlas
    p = publication.UnofficialPublication( key )
    p.process_abstract( abstract_str=emph_i[0] )
    a.data[key] = copy.deepcopy( p )

## Mark as not for review

In [None]:
topics_and_center = [ pm['citation_key'], ] + list( topics.keys() )

In [None]:
# Synthetic entries (center, topics, emphasis terms) are never candidates for
# review, so flag them as addressed and drop any duplicate keys.
notes['addressed'].extend( topics_and_center )
notes['addressed'].extend( emph_keys )
notes['addressed'] = list( set( notes['addressed'] ) )

## Vectorize

In [None]:
# Vectorization
# overwrite=True is passed — presumably to recompute now that synthetic publications were added; confirm.
vp_dict = a.vectorize( overwrite=True )
c = cartography.Cartographer( **vp_dict )

In [None]:
inds_series = pd.Series( c.inds, c.publications )

### Edit Vectorization for Emphasis

In [None]:
# Create the emphasis vector
# Rows of c.vectors that belong to the emphasis entries.
emph_inds = inds_series.loc[emph_keys].values
# Combine those rows using the emphasis coefficients.
# NOTE(review): what `*` does here depends on the type of c.vectors (element-wise
# broadcast for dense arrays, matrix product for scipy sparse matrices) — confirm
# the result is the intended coefficient-weighted combination.
emph_vector = ( np.array( emph_coeffs ) * c.vectors[emph_inds] )
# Rescale in place by np.linalg.norm of the result.
emph_vector /= np.linalg.norm( emph_vector )

In [None]:
# Add the emphasis vector
# NOTE(review): the "tac" names suggest topics_and_center, but only the row for
# pm['citation_key'] is selected here — confirm the topics should be left out.
tac_inds = inds_series.loc[[pm['citation_key'],]]
tac_vectors = c.vectors[tac_inds]
# Shift by the emphasis vector, scaled by the row's current norm and the configured emphasis_scaling.
tac_vectors += emph_vector * c.norms[tac_inds] * pm['emphasis_scaling']
# Convert to CSR before writing the row(s) back into c.vectors.
tac_vectors = ss.csr_matrix( tac_vectors )
c.vectors[tac_inds] = tac_vectors

In [None]:
# Store new norms
# Row-wise sum of squared entries of the edited vectors...
tac_norm_squared_unformatted = tac_vectors.multiply( tac_vectors ).sum( axis=1 )
# ...flattened to 1-D and square-rooted, then written over the stale norms.
tac_norms = np.sqrt( np.array( tac_norm_squared_unformatted ).flatten() )
c.norms[tac_inds] = tac_norms

# Map

## Calculate Maps

In [None]:
# Compute the map for the paper's own key; the returned tuple is reused by every plot_map call below.
coords, inds, pairs = c.map(
    pm['citation_key'],
    max_links = 10,
)

## Colors

In [None]:
# Values to color by: similarity (cospsi against all publications) to the paper
# itself first, then to each topic variation from the config.
color_values = {
    target_key: c.cospsi( target_key, 'all' )
    for target_key in [ pm['citation_key'], ] + list( cp.variations )
}

In [None]:
# Add density
# Stored under the 'density' key next to the similarity-based values.
color_values['density'] = c.topography_metric( metric='density' )

In [None]:
# Add citations/year
# One value per entry of c.publications, in that order, looked up through the atlas.
citations_per_year = np.array([ a[_].citations_per_year() for _ in c.publications ])
color_values['citations_per_year'] = citations_per_year

In [None]:
logscale = topics_and_center

In [None]:
# Turn every set of values into RGBA colors for the mapped publications.
colors = {}
for key, cvs in color_values.items():

    # Upper limit is always the 99th percentile
    vmax = np.nanpercentile( cvs, 99 )

    # Lower limit is the 1st percentile; log-scaled keys only consider positive values
    if key in logscale:
        norm_fn = matplotlib.colors.LogNorm
        vmin = np.nanpercentile( cvs[cvs>0.], 1 )
    else:
        norm_fn = matplotlib.colors.Normalize
        vmin = np.nanpercentile( cvs, 1 )

    # Normalize the values of the mapped publications and apply the colormap
    normed_cvs = norm_fn( vmin=vmin, vmax=vmax )( cvs[inds] )
    colors[key] = palettable.cubehelix.classic_16.mpl_colormap( normed_cvs )

## Hatching

In [None]:
# Hatch the cells of publications already marked as addressed.
# Membership is checked against a set so each lookup is constant time rather
# than a scan of the full list for every mapped publication.
addressed_keys = set( notes['addressed'] )
hatching = [ '/' if _ in addressed_keys else None for _ in c.publications[inds] ]

## Plotting

### Overall View

In [None]:
# Overall view: histogram only (scatter, voronoi and labels are all switched off),
# drawn from the precomputed map data.
fig = plt.figure()
ax = plt.gca()

ax, (coords0, inds0, pairs0) = c.plot_map(
    pm['citation_key'],
    data = ( coords, inds, pairs ),
    ax = ax,
    # xlim = [ -5, 5 ],
    # ylim = [ -5, 5 ],
    clean_plot = False,
    scatter = False,
    histogram = True,
    voronoi = False,
    # voronoi_kwargs = { 'colors': colors, 'edgecolor': 'none' },
    labels = False,
    # labels_formatter = label_formatter,
    labels_kwargs = { 'fontsize': 6 },
)

In [None]:
# Voronoi overview: one figure per coloring in `colors`, all drawn from the
# same precomputed map data and over the same [-1, 1] window.
for key, colors_key in colors.items():

    fig = plt.figure()
    ax = plt.gca()

    ax, (coords0, inds0, pairs0) = c.plot_map(
        pm['citation_key'],
        data = ( coords, inds, pairs ),
        ax = ax,
        xlim = [ -1, 1 ],
        ylim = [ -1, 1 ],
        clean_plot = False,
        scatter = False,
        histogram = False,
        voronoi = True,
        voronoi_kwargs = { 'colors': colors_key, 'edgecolors': colors_key },
        labels = False,
        labels_kwargs = { 'fontsize': 6 },
    )

    # Name the coloring just above the top-left corner of the axes
    ax.annotate(
        text = key,
        xy = ( 0, 1 ),
        xycoords = 'axes fraction',
        xytext = ( 5, 5 ),
        textcoords = 'offset points',
        va = 'bottom',
        ha = 'left',
    )

    # No trailing bare `fig` here: inside a loop it is a no-op expression
    # statement (only the last top-level expression of a cell is displayed).

### Zoomed

In [None]:
# Coloring used for the zoomed map; must be one of the keys of `colors`.
# NOTE(review): hard-coded to the same string passed as `variation` in the Parameters cell.
center_key = 'multicloud_modeling'
colors_key = colors[center_key]

In [None]:
import re
def labels_formatter( i, m_i, c ):
    '''Build the map label for one publication.

    Args:
        i: Index of the publication in c.publications.
        m_i: Number shown for the publication on the map.
        c: Object exposing a `publications` sequence of citation keys.

    Returns:
        str: Just '<m_i>' for keys containing a '.'. Otherwise
        '<m_i>: <first three characters of the key><year>', where <year> is
        the last run of digits in the key with its first two characters
        dropped (e.g. '2019' -> '19'). <year> is empty when the key is a
        single token or contains no digits.
    '''
    key = c.publications[i]

    # Keys containing a '.' are labelled with the number only.
    if '.' in key:
        return '{}'.format( m_i )

    # Split into runs of letters and runs of digits.
    tokens = re.findall( r'[^\W\d_]+|\d+', key )
    digit_runs = re.findall( r'\d+', key )

    # Only a run of digits can be the year. Previously the second-to-last token
    # was used whenever the last one was alphabetic, so a key without digits in
    # those positions (e.g. 'multicloud_modeling') had letters sliced into the label.
    if ( len( tokens ) > 1 ) and digit_runs:
        year = digit_runs[-1][2:]
    else:
        year = ''

    return '{}: {}{}'.format( m_i, key[:3], year )

In [None]:
# Zoomed Voronoi view ([-0.5, 0.5] window), colored by `colors_key`, hatched
# with the `hatching` list and labelled through labels_formatter.
fig = plt.figure()
ax = plt.gca()

ax, (coords0, inds0, pairs0) = c.plot_map(
    pm['citation_key'],
    data = ( coords, inds, pairs ),
    ax = ax,
    xlim = [ -0.5, 0.5 ],
    ylim = [ -0.5, 0.5 ],
    clean_plot = False,
    scatter = False,
    histogram = False,
    voronoi = True,
    voronoi_kwargs = { 'colors': colors_key, 'hatching': hatching, 'default_edgecolor': 'none' },
    labels = True,
    labels_formatter = labels_formatter,
    labels_kwargs = { 'fontsize': 6 },
)

# Name the coloring just above the top-left corner of the axes
ax.annotate(
    text = center_key,
    xy = ( 0, 1 ),
    xycoords = 'axes fraction',
    xytext = ( 5, 5 ),
    textcoords = 'offset points',
    va = 'bottom',
    ha = 'left',
)

In [None]:
# Look up one publication by the number it was given on the map.
# m_i is edited by hand; 49 is simply the entry inspected last.
m_i = 49
# Map number -> index into c.publications -> citation key
i = inds[m_i]
key = c.publications[i]
print( a.data[key].citation )
print( a.data[key].abstract_str() )

# Create a Reading List

## Setup

In [None]:
# Per-variation lists of publications that were reviewed and deliberately not cited.
# NOTE(review): `topics_data` and `topics_fp` were used below without being
# defined anywhere in the notebook, so a fresh kernel raised NameError. The file
# name here is an assumption that mirrors how notes.json is loaded — point it at
# the real file before relying on the stored exclusions.
topics_fp = os.path.join( pm['root_data_dir'], 'topics.json' )
topics_data = verdict.Dict.from_json( topics_fp, create_nonexisting=True )

# Make sure both levels exist before later cells append to them
if 'not_included' not in topics_data:
    topics_data['not_included'] = {}
if pm['variation'] not in topics_data['not_included']:
    topics_data['not_included'][pm['variation']] = []

In [None]:
def sort_by_similarity( target_key, c ):
    '''Rank all publications by their similarity to one publication.

    Args:
        target_key: Key passed to c.cospsi as the reference publication.
        c: Object providing cospsi( key, 'all' ) and a `publications` array.

    Returns:
        tuple: ( similarities, publications ), both ordered from the highest
        similarity to the lowest.
    '''

    similarities = c.cospsi( target_key, 'all' )

    # argsort is ascending, so reverse it to put the best matches first
    descending = np.argsort( similarities )[::-1]

    return similarities[descending], c.publications[descending]

In [None]:
def print_sorted_publications(
    sorted_cospsi,
    sorted_publications,
    kernel_size,
    a,
    show_unread_only = False,
    do_not_show_included = True,
    do_not_show_not_included = True,
    central_publication = '',
):
    '''Print a reading list from publications ranked by similarity.

    Besides its arguments, this function reads three notebook-level names:
    `pub_doc` (its `string` is searched for each key), and `topics_data`
    together with `pm` (for the keys deliberately excluded for the current
    variation).

    Args:
        sorted_cospsi: Similarities matching sorted_publications. Not used in
            the body; kept so existing calls keep working.
        sorted_publications: Publication keys, most similar first.
        kernel_size: Highest rank considered; iteration stops at the first
            rank greater than this value.
        a: Atlas-like object supporting `key in a.data` and `a[key]`.
        show_unread_only: If True, skip publications whose notes hold a 'read'
            entry different from 'UNREAD'.
        do_not_show_included: If True, skip keys found in pub_doc.string.
        do_not_show_not_included: If True, skip keys listed under
            topics_data['not_included'][pm['variation']].
        central_publication: Printed after the rank of each entry.

    Returns:
        None. The list is written to stdout.
    '''

    n_shown = 0
    for i, key_i in enumerate( sorted_publications ):

        if i > kernel_size:
            break

        if key_i not in a.data:
            continue

        p_i = a[key_i]

        # Publications without a 'read' note count as unread
        if 'read' in p_i.notes:
            read_flag = p_i.notes['read']
        else:
            read_flag = 'UNREAD'
        if show_unread_only and read_flag != 'UNREAD':
            continue

        # "Included" means the key appears anywhere in the tex source
        included_flag = key_i in pub_doc.string
        deliberately_not_included = key_i in topics_data['not_included'][pm['variation']]

        if do_not_show_included and included_flag:
            continue
        if do_not_show_not_included and deliberately_not_included:
            continue

        # Synthetic publications are never printed. They used to increment
        # n_shown anyway, which left gaps in the numbering of printed entries.
        if isinstance( p_i, publication.UnofficialPublication ):
            continue

        print( '{} -- {}'.format( n_shown, key_i, ) )
        print( p_i.citation['title'] )
        print( p_i.citation['author'] )
        print( 'Related rank: {}.{}'.format( i, central_publication ) )
        print( 'Read: {}'.format( ''.join( read_flag ) ) )
        print( 'Included: {}'.format( included_flag ) )
        print( p_i.citation['ENTRYTYPE'] + '\n' )
        print( p_i.points_str() + '\n\n' )

        n_shown += 1



## Conservative Reading List
Based on the average abstract of the topic's publications, combined with any chosen search words.

### Create and add average vector

In [None]:
# Can add chosen words if so wished.
search_str = ''

In [None]:
# Append the key points of every publication of this variation, in listed order.
search_str += ''.join( a[key_i].points_str() for key_i in pm['publications'] )

In [None]:
if 'search_str' in pm:
    search_str += pm['search_str']

In [None]:
p = publication.UnofficialPublication( pm['variation'] )

In [None]:
p.process_abstract( abstract_str=search_str )

In [None]:
a.data[pm['variation']] = p

In [None]:
# Fold the Zotero atlas into the large one, then prune duplicates with the Zotero keys passed as preferred.
# NOTE(review): this mutates `a` in place, so the cell is not idempotent with respect to earlier cells that used `a`.
a.update( a_zotero )
a.prune_duplicates(preferred=list(a_zotero.data.keys()))

In [None]:
# Vectorize
# Replaces the earlier `c`, including the emphasis edits made to its vectors.
# NOTE(review): projection_fp='pass' looks like a sentinel understood by Atlas.vectorize — confirm its meaning.
vp_dict = a.vectorize( overwrite=True, projection_fp='pass', )
c = cartography.Cartographer( **vp_dict )

In [None]:
# Recompute the map with the merged atlas; map.h5 inside the atlas directory is passed as save_filepath.
map_fp = os.path.join( a.atlas_dir, 'map.h5' )
coords, inds, pairs = c.map( pm['citation_key'], save_filepath=map_fp, )

### Produce reading list

In [None]:
sorted_cospsi, sorted_publications = sort_by_similarity( pm['variation'], c )

In [None]:
pub_doc = tex.Tex( filepath=pm['tex_fp'] )

In [None]:
# Record a publication that was reviewed and deliberately left out.
# The key is edited by hand each time this cell is used.
topics_data['not_included'][pm['variation']].append( '2012ApJ...750...10S' )
# De-duplicate so re-running the cell does not grow the list, then persist.
topics_data['not_included'][pm['variation']] = list( set( topics_data['not_included'][pm['variation']] ) )
topics_data.to_json( topics_fp )

In [None]:
# Histogram of the recomputed map, default axis limits.
ax, _ = c.plot_map(
    pm['citation_key'],
    data = ( coords, inds, pairs ),
    scatter = False,
    histogram = True,
    clean_plot = False,
)

In [None]:
# Same histogram, restricted to the [-0.2, 0.2] window.
ax, _ = c.plot_map(
    pm['citation_key'],
    data = ( coords, inds, pairs ),
    scatter = False,
    histogram = True,
    clean_plot = False,
    xlim = [ -0.2, 0.2 ],
    ylim = [ -0.2, 0.2 ],
)

In [None]:
# Labelled scatter plus Voronoi view of the [-0.1, 0.1] window.
fig = plt.figure()
ax = plt.gca()

ax, _ = c.plot_map(
    pm['citation_key'],
    data = ( coords, inds, pairs ),
    ax = ax,
    scatter = True,
    xlim = [ -0.1, 0.1 ],
    ylim = [ -0.1, 0.1 ],
    clean_plot = False,
    labels = True,
    labels_kwargs = { 'fontsize': 5 },
    voronoi = True,
    voronoi_kwargs = { 'offset_magnitude': 0 },
)

In [None]:
# Conservative list: ranked against the averaged topic publication, looking at
# twice the configured kernel size and hiding entries already cited or excluded.
print_sorted_publications(
    sorted_cospsi,
    sorted_publications,
    pm['kernel_size'] * 2,
    a,
    do_not_show_included = True,
    do_not_show_not_included = True,
    central_publication = pm['variation'],
)

## Extensive Reading List
For each and every one of the publications.

In [None]:
# For every publication of the variation, build a small atlas holding that
# publication plus everything ranked against it, and keep the rankings.
a_pubs = []
sorted_cospsis = []
sorted_publications = []
for key_i in pm['publications']:
    a_i = atlas.Atlas(atlas_dir, load_bibtex=False, load_atlas_data=False )
    a_i.data[key_i] = a[key_i]

    # Rank every publication against this one
    sorted_cospsi_i, sorted_publications_i = sort_by_similarity( key_i, c )
    sorted_cospsis.append( sorted_cospsi_i )
    sorted_publications.append( sorted_publications_i )

    # Copy each ranked publication in, taking the Zotero entry when there is one
    for key in sorted_publications_i:
        for source_data in ( a_zotero.data, a.data ):
            if key in source_data:
                a_i.data[key] = source_data[key]
                break

    a_i.prune_duplicates(preferred=list(a_zotero.data.keys()))

    a_pubs.append( a_i )

In [None]:
pub_doc = tex.Tex( filepath=pm['tex_fp'] )

In [None]:
# Record a publication that was reviewed and deliberately left out.
# The key is edited by hand each time this cell is used.
topics_data['not_included'][pm['variation']].append( '2017MNRAS.466.3460V' )
# De-duplicate before saving, as the conservative-list cell does; otherwise
# every re-run of this cell adds another copy of the key to the stored list.
topics_data['not_included'][pm['variation']] = list( set( topics_data['not_included'][pm['variation']] ) )
topics_data.to_json( topics_fp )

In [None]:
# One reading list per publication of the variation, each printed from that
# publication's own ranking and its own small atlas.
for i in range( len( a_pubs ) ):

    print_sorted_publications(
        sorted_cospsis[i],
        sorted_publications[i],
        pm['kernel_size'],
        a_pubs[i],
        do_not_show_included = True,
        do_not_show_not_included = True,
        central_publication = pm['publications'][i],
    )