In [None]:
import ast
import copy
import numpy as np
import os
import pandas as pd
import scipy
import scipy.sparse as ss

In [None]:
import cc.atlas as atlas
import cc.cartography as cartography
import cc.publication as publication
import cc.utils as utils
import cc.tex as tex

In [None]:
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use( '~/repos/clean-bold/clean-bold.mplstyle' )
import palettable

In [None]:
import trove
import verdict

# Literature Review

This notebook contains a record of how I performed the literature review for this work.

# Parameters

In [None]:
# Location of the trove config file. Defaults to the author's local checkout;
# set the LITERATURE_REVIEW_CONFIG environment variable to point elsewhere
# without editing the notebook.
config_fp = os.environ.get(
    'LITERATURE_REVIEW_CONFIG',
    '/Users/zhafen/paper_repos/cgm_modeling_challenge_paper/literature_review/literature_review.trove',
)
# Parser used further down to iterate over the variations in the config.
cp = trove.config_parser.ConfigParser( config_fp )
# Parameters for the 'multicloud_modeling' variation, accessed below as pm[...].
pm = trove.link_params_to_config(
    config_fp,
    variation = 'multicloud_modeling',
)

In [None]:
# Directory handed to atlas.Atlas below; taken from the config's root data dir.
atlas_dir = pm['root_data_dir']

In [None]:
# For every variation in the config, collect its list of publications and an
# optional search string. Both are later combined into a pseudo-publication.
topics = {}
search_strs = {}
for variation in cp.variations:
    topics[variation] = ast.literal_eval( cp.get( variation, 'publications' ) )
    # The search string is optional, so fall back to an empty string when it
    # cannot be read or parsed. Catch Exception rather than using a bare
    # except so that KeyboardInterrupt/SystemExit still propagate.
    try:
        search_strs[variation] = ast.literal_eval( cp.get( variation, 'search_str' ) )
    except Exception:
        search_strs[variation] = ''
    # A single-publication topic with no search string adds nothing new.
    if ( len( topics[variation] ) == 1 ) and ( search_strs[variation] == '' ):
        raise ValueError( 'Topics that are just copies of papers will be tossed out. Use the paper itself instead or add a search string.' )

# Load Data

## Notes

In [None]:
# Notes live in a JSON file inside the root data directory.
notes_fp = os.path.join( pm['root_data_dir'], 'notes.json' )
# create_nonexisting=True presumably creates the file when it is missing -- TODO confirm against verdict.
notes = verdict.Dict.from_json( notes_fp, create_nonexisting=True )

In [None]:
# Make sure both lists used later in the notebook exist. 'to read' is read
# when building the hatching and in the save-progress section, so a notes file
# without that entry would otherwise raise a KeyError there.
for notes_key in [ 'addressed', 'to read' ]:
    if notes_key not in notes:
        notes[notes_key] = []

## Zotero Atlas

In [None]:
# Atlas loaded from the Zotero data file given in the config
a_zotero = atlas.Atlas( atlas_dir, load_bibtex=False, data_fp=pm['zotero_atlas_fp'] )

In [None]:
# Vectorization of the Zotero atlas, using the projection file from the config
vp_dict = a_zotero.vectorize( projection_fp=pm['zotero_projection_fp'] )
# The vectorization output is unpacked as keyword arguments for the Cartographer.
c_zotero = cartography.Cartographer( **vp_dict )

## Large Atlas

In [None]:
# Larger atlas, loaded from the same directory without an explicit data file
a = atlas.Atlas( atlas_dir, load_bibtex=False, )

In [None]:
# Fold the Zotero atlas into the larger atlas (presumably adds/overwrites its publications -- TODO confirm).
a.update( a_zotero )

## Tex Draft

In [None]:
# Load the LaTeX draft whose path is given in the config.
pub_doc = tex.Tex( filepath=pm['tex_fp'] )

In [None]:
# Text between the first \begin{abstract} and the \end{abstract} that follows it.
abstract = (
    pub_doc.string
    .split( '\\begin{abstract}', 2 )[1]
    .split( '\\end{abstract}', 1 )[0]
)

In [None]:
# Sanity check: show the extracted abstract.
print( abstract )

# Incorporate Topics and Abstract

## Paper Abstract

In [None]:
# Unofficial publication standing in for this paper, keyed by the config's citation key.
p = publication.UnofficialPublication( pm['citation_key'] )

In [None]:
# Process the abstract text extracted from the draft.
p.process_abstract( abstract_str = abstract )

In [None]:
# Store an independent deep copy of p in the atlas under its citation key.
a.data[p.citation_key] = copy.deepcopy( p )

## Topics

In [None]:
for key, publications in topics.items():

    # The topic's text is its search string followed by the abstracts of its
    # member publications, in order.
    member_abstracts = [ a[cite_key].abstract_str() for cite_key in publications ]
    abstract_str = ''.join( [ search_strs[key], ] + member_abstracts )

    # Register the topic in the atlas as an unofficial publication
    p = publication.UnofficialPublication( key )
    p.process_abstract( abstract_str=abstract_str )
    a.data[key] = copy.deepcopy( p )

## Emphasis Vector

In [None]:
# Each entry of pm['emphasis_vector'] is indexed as
# ( text, coefficient ) or ( text, coefficient, key ).
emph_keys = []
emph_coeffs = []
for emph_i in pm['emphasis_vector']:

    # Parse: use the explicit key when one is given, otherwise the text itself
    key = emph_i[2] if len( emph_i ) > 2 else emph_i[0]
    emph_keys.append( key )
    emph_coeffs.append( emph_i[1] )

    # Add the emphasis text to the atlas as an unofficial publication
    p = publication.UnofficialPublication( key )
    p.process_abstract( abstract_str=emph_i[0] )
    a.data[key] = copy.deepcopy( p )

## Mark as not for review

In [None]:
# The central paper's key followed by every topic key.
topics_and_center = [ pm['citation_key'], ] + list( topics.keys() )

In [None]:
# The central paper, the topics and the emphasis entries were all created in
# this notebook, so flag them as addressed. The set round-trip drops
# duplicate keys.
notes['addressed'].extend( topics_and_center )
notes['addressed'].extend( emph_keys )
notes['addressed'] = list( set( notes['addressed'] ) )

## Vectorize

In [None]:
# Vectorization of the full atlas, including the pseudo-publications added above.
# overwrite=True presumably forces a fresh vectorization instead of reusing a saved one -- TODO confirm.
vp_dict = a.vectorize( overwrite=True )
c = cartography.Cartographer( **vp_dict )

In [None]:
# Series with c.inds as values and the publication keys as index, for lookup by key.
inds_series = pd.Series( c.inds, c.publications )

### Edit Vectorization for Emphasis

In [None]:
# Create the emphasis vector
# Indices of the emphasis pseudo-publications, in the order of emph_keys.
emph_inds = inds_series.loc[emph_keys].values
# NOTE(review): if c.vectors is a scipy sparse *matrix*, `*` here is a matrix
# product (a coefficient-weighted sum of the selected rows), not an
# elementwise product -- confirm which is intended.
emph_vector = ( np.array( emph_coeffs ) * c.vectors[emph_inds] )
# Normalize to unit length.
emph_vector /= np.linalg.norm( emph_vector )

In [None]:
# Add the emphasis vector
# Entry for the central publication; selected with a list, so this is a Series.
tac_inds = inds_series.loc[[pm['citation_key'],]]
tac_vectors = c.vectors[tac_inds]
# Scale the unit emphasis vector by the publication's norm and the configured factor.
tac_vectors += emph_vector * c.norms[tac_inds] * pm['emphasis_scaling']
# Convert to CSR before writing the result back into c.vectors.
tac_vectors = ss.csr_matrix( tac_vectors )
c.vectors[tac_inds] = tac_vectors
# NOTE(review): c.vectors is modified in place, so re-running this cell without
# re-running the vectorization adds the emphasis a second time.

In [None]:
# Store new norms
# Row-wise sum of the squared entries of the updated vectors.
tac_norm_squared_unformatted = tac_vectors.multiply( tac_vectors ).sum( axis=1 )
# Flatten to 1-D and take the square root to obtain the norms.
tac_norms = np.sqrt( np.array( tac_norm_squared_unformatted ).flatten() )
c.norms[tac_inds] = tac_norms

# Map

## Calculate Maps

In [None]:
# Key the maps below are centered on; same string as the variation used when loading pm.
center_key = 'multicloud_modeling'

### Preserving Pairwise Distances

In [None]:
# Compute the map around center_key; the three outputs are reused by the plots below.
coords, inds, pairs = c.map(
    center_key,
    # max_links = 1000,
)

### Just similarity between two central publications

In [None]:
# x: cospsi of all publications relative to this paper; y: relative to the map center.
x = c.cospsi( pm['citation_key'], 'all' )
y = c.cospsi( center_key, 'all' )
# Pair the two up, one (x, y) row per publication (assuming x and y are 1-D -- TODO confirm).
coords_simple = np.array([ x, y ]).transpose()

## Colors

In [None]:
# Similarity of every publication to each topic (and to the central paper);
# these are the values the maps are colored by.
color_values = {
    key: c.cospsi( key, 'all' )
    for key in topics_and_center
}

In [None]:
# Add density
# color_values['density'] = c.topography_metric( metric='density' )

In [None]:
# Add citations/year
# citations_per_year = np.array([ a[_].citations_per_year() for _ in c.publications ])
# color_values['citations_per_year'] = citations_per_year

In [None]:
# Color limits passed as vlim to the plot_map calls below.
vlim = [ 0., 0.4 ]

## Plotting

### Overall View

#### Total

In [None]:
# Fresh figure and axes for the overall view.
fig = plt.figure()
ax = plt.gca()

# Whole map with scatter points off and the histogram on.
# NOTE(review): pm['citation_key'] is passed here while coords were computed
# for center_key -- confirm which key plot_map should receive.
ax, (coords0, inds0, pairs0) = c.plot_map(
    pm['citation_key'],
    coords = coords,
    ax = ax,
    cmap = 'viridis',
    scatter = False,
    histogram = True,
)

#### Colored

In [None]:
# One figure per key in topics_and_center, colored by the values stored for that key.
for key in topics_and_center:
    
    fig = plt.figure()
    ax = plt.gca()

    # Same map and fixed axis/color limits for every panel, so panels are comparable.
    ax, (coords0, inds0, pairs0) = c.plot_map(
        pm['citation_key'],
        coords = coords,
        ax = ax,
        colors = color_values[key],
        xlim = [ -3, 3 ],
        ylim = [ -3, 3 ],
        vlim = vlim,
        scatter = False,
        histogram = True,
    )
    
    
    # Label the panel with its key, just above the top-left corner of the axes.
    ax.annotate(
        text = key,
        xy = ( 0, 1 ),
        xycoords = 'axes fraction',
        xytext = ( 5, 5 ),
        textcoords = 'offset points',
        va = 'bottom',
        ha = 'left',
    )

### Zoomed

In [None]:
# Key whose similarity values color the zoomed map.
# It must be one of the keys of color_values (i.e. in topics_and_center), otherwise this raises a KeyError.
colors_key = 'Hafen2022b'
colors_used = color_values[colors_key]

In [None]:
import re
def labels_formatter( i, m_i, c ):
    '''Format the label drawn for a publication on the map.

    Args:
        i (int): Index of the publication in c.publications.
        m_i: Map index of the publication; always shown in the label.
        c: Object with a `publications` sequence of citation keys.

    Returns:
        str: Just the map index for keys containing a '.', otherwise
            '<m_i>: <first three characters of the key><year>', where the
            year is the last number in the key with its first two characters
            dropped (e.g. '2022' -> '22'). The year is empty when the key
            contains no number or consists of a single token.
    '''

    key = c.publications[i]

    # Keys containing a period are labelled by their map index alone
    if '.' in key:
        return '{}'.format( m_i )

    # Split the key into runs of letters and runs of digits,
    # e.g. 'Hafen2022b' -> [ 'Hafen', '2022', 'b' ]
    key_split = re.findall( r'[^\W\d_]+|\d+', key )

    # Take the year from the last numeric token only. Falling back to a
    # letter token would turn e.g. 'multicloud_modeling' into 'mullticloud'.
    numeric_tokens = [ token for token in key_split if token.isdigit() ]
    if len( key_split ) > 1 and len( numeric_tokens ) > 0:
        year = numeric_tokens[-1][2:]
    else:
        year = ''

    return '{}: {}{}'.format( m_i, key[:3], year )

In [None]:
# Hatch pattern per publication: '//' for addressed, '*' for to-read, None otherwise.
# Membership is tested against sets so the note lists are not rescanned for
# every publication, and a notes file without a 'to read' entry is treated as
# having nothing to read instead of raising a KeyError.
addressed_keys = set( notes['addressed'] )
to_read_keys = set( notes['to read'] ) if 'to read' in notes else set()
hatching = []
for key in c.publications:
    if key in addressed_keys:
        hatching.append( '//' )
    elif key in to_read_keys:
        hatching.append( '*' )
    else:
        hatching.append( None )
hatching = np.array( hatching )

In [None]:
# Fresh figure and axes for the zoomed view.
fig = plt.figure()
ax = plt.gca()

# Reuse the precomputed map (coords, inds, pairs) and zoom in tightly around
# the origin, with Voronoi cells, hatching by review status and per-publication labels.
ax, _ = c.plot_map(
    center_key,
    coords = coords,
    inds = inds,
    pairs = pairs,
    colors = colors_used,
    hatching = hatching,
    ax = ax,
    xlim = [ -0.05, 0.05 ],
    ylim = [ -0.05, 0.05 ],
    vlim = vlim,
    voronoi = True,
    labels = True,
    labels_formatter = labels_formatter,
    labels_kwargs = { 'fontsize': 6 },
)

# Show which key the colors refer to, just above the top-left corner of the axes.
ax.annotate(
    text = colors_key,
    xy = ( 0, 1 ),
    xycoords = 'axes fraction',
    xytext = ( 5, 5 ),
    textcoords = 'offset points',
    va = 'bottom',
    ha = 'left',
)
ax.grid( visible=True, color='0.4' )

In [None]:
# Look up a publication by its map index (presumably the number shown in the map labels).
m_i = 12
# inds converts the map index into an index into c.publications.
i = inds[m_i]
key = c.publications[i]
print( a.data[key].citation )
print( a.data[key].abstract_str() )

In [None]:
# Reverse lookup: report where a given publication sits on the map.
key = 'Hummels2013'
i = inds_series.loc[key]
# np.argsort( inds )[i] inverts inds (assuming inds is a permutation), giving the map index for i.
print( '{}: ( {:.3g}, {:.3g})'.format( np.argsort( inds )[i], coords[i][0], coords[i][1] ) )
print( a.data[key].citation )
print( a.data[key].abstract_str() )

# Save Progress

In [None]:
# NOTE(review): `key` is whatever an earlier cell last assigned, so the entry
# added here depends on which cells were run and in what order.
notes['addressed'].append( key )

In [None]:
# Manually mark a specific publication as addressed.
notes['addressed'].append( 'Lacki2010' )

In [None]:
# Display the current to-read list.
notes['to read']

In [None]:
# Write the notes back to the JSON file they were loaded from.
notes.to_json( notes_fp )