In [1]:
import ast
import copy
import numpy as np
import os

In [2]:
import cc.atlas as atlas
import cc.cartography as cartography
import cc.publication as publication
import cc.utils as utils
import cc.tex as tex

In [3]:
import trove
import verdict

# Literature Review

This notebook contains a record of how I performed the literature review for this work.

# Parameters

In [4]:
# Location of the trove config file. The default is the original local path;
# set the LITERATURE_REVIEW_CONFIG environment variable to point elsewhere
# (e.g. when running on a different machine) without editing the notebook.
config_fp = os.environ.get(
    'LITERATURE_REVIEW_CONFIG',
    '/Users/zhafen/paper_repos/multidimensional_imaging/literature_review/literature_review.trove',
)

# cp exposes every variation in the config; pm holds the parameters for the
# single variation (topic) this run of the notebook works on.
cp = trove.config_parser.ConfigParser( config_fp )
pm = trove.link_params_to_config(
    config_fp,
    variation = 'lyman_alpha_halos',
)

In [5]:
# Directory handed to every atlas.Atlas(...) call below.
atlas_dir = pm[ 'root_data_dir' ]

In [6]:
# Map each variation in the config to its parsed 'publications' entry.
# The config stores that entry as a Python literal, hence literal_eval.
topics = {
    variation: ast.literal_eval( cp.get( variation, 'publications' ) )
    for variation in cp.variations
}

# Load Data

## Topics Data

In [7]:
# Persistent record of review decisions, stored next to the atlas data.
topics_fp = os.path.join( pm['root_data_dir'], 'topics.json' )
topics_data = verdict.Dict.from_json( topics_fp, create_nonexisting=True )

# Make sure both top-level lists exist before anything reads from them.
for required_list in ( 'not_included', 'intended_to_include' ):
    if required_list not in topics_data:
        topics_data[required_list] = {}

## Zotero Atlas

In [8]:
# Atlas loaded from the Zotero data file rather than the default atlas data.
a_zotero = atlas.Atlas(
    atlas_dir,
    load_bibtex = False,
    data_fp = pm['zotero_atlas_fp'],
)

Loading saved atlas data.


0it [00:00, ?it/s]
100%|██████████| 2041/2041 [00:00<00:00, 16918.67it/s]


In [9]:
# Vectorize the Zotero atlas (using its own projection file) and build a
# cartographer from the result.
vp_dict = a_zotero.vectorize( projection_fp = pm['zotero_projection_fp'] )
c_zotero = cartography.Cartographer( **vp_dict )

Vectorizing text...
Using saved vectorized text...


  if hasattr( a[first_element_index][0], 'decode' ):


## Large Atlas

In [10]:
# The larger atlas, loaded from the saved atlas data in atlas_dir.
a = atlas.Atlas( atlas_dir, load_bibtex = False )

Loading saved atlas data.


0it [00:00, ?it/s]
100%|██████████| 31026/31026 [00:01<00:00, 24746.30it/s]


In [11]:
# Fold the Zotero atlas into the large one. The Zotero data can change during
# the review (papers get downloaded and looked over), so refresh on every run.
a.update(a_zotero)

100%|██████████| 2041/2041 [00:00<00:00, 79391.75it/s]
100%|██████████| 29012/29012 [00:00<00:00, 128923.97it/s]


In [12]:
# Vectorize the large atlas and build the cartographer used for similarity searches.
vp_dict = a.vectorize()
c = cartography.Cartographer(**vp_dict)

Vectorizing text...
Using saved vectorized text...


## Tex Draft

In [13]:
# The TeX draft; its text is searched later to see which keys are already cited.
pub_doc = tex.Tex(filepath=pm['tex_fp'])

# Create a Reading List

## Setup

In [14]:
# The 'not_included' record needs a list for the current variation and a
# 'global' list that applies regardless of variation.
for topic_name in ( pm['variation'], 'global' ):
    if topic_name not in topics_data['not_included']:
        topics_data['not_included'][topic_name] = []

In [15]:
# Same for 'intended_to_include': start an empty list for this variation if needed.
intended_by_topic = topics_data['intended_to_include']
if pm['variation'] not in intended_by_topic:
    intended_by_topic[pm['variation']] = []

In [16]:
def sort_by_similarity( target_key, c ):
    '''Rank all of a cartographer's publications by similarity to one publication.

    Args:
        target_key: Key passed to c.cospsi as the publication to compare against.
        c: Object providing cospsi( key, 'all' ) and a publications array
            indexable by an integer index array.

    Returns:
        tuple: ( similarities, publications ), both in order of decreasing
            similarity.
    '''

    similarity = c.cospsi( target_key, 'all' )

    # argsort is ascending, so reverse it to put the most similar first.
    descending = np.argsort( similarity )[::-1]

    return similarity[descending], c.publications[descending]

In [17]:
def print_sorted_publications(
    sorted_cospsi,
    sorted_publications,
    kernel_size,
    a,
    show_unread_only = False,
    do_not_show_included = True,
    do_not_show_not_included = True,
    do_not_show_intended_to_include = True,
    central_publication = '',
):
    '''Print a reading list drawn from the top-ranked publications.

    Walks sorted_publications in order, stops once the rank exceeds
    kernel_size, and prints the details of each entry that passes the filters.
    Reads the notebook-level variables pub_doc, topics_data, and pm.

    Args:
        sorted_cospsi: Similarities matching sorted_publications. Not used
            by this function.
        sorted_publications: Publication keys in the order to consider them.
        kernel_size (int): Highest rank (0-based index) that is considered.
        a: Atlas-like object. Keys missing from a.data are skipped.
        show_unread_only (bool): If True, skip publications whose 'read' note
            is anything other than missing or 'UNREAD'.
        do_not_show_included (bool): If True, skip keys that appear in the
            text of pub_doc.
        do_not_show_not_included (bool): If True, skip keys listed under
            'not_included' for the current variation or for 'global'.
        do_not_show_intended_to_include (bool): If True, skip keys listed
            under 'intended_to_include' for the current variation.
        central_publication (str): Label printed after the rank.
    '''

    n_shown = 0
    for rank, pub_key in enumerate( sorted_publications ):

        # Ranks 0 through kernel_size (inclusive) are considered.
        if rank > kernel_size:
            break

        if pub_key not in a.data:
            continue

        pub = a[pub_key]

        # A publication without a 'read' note counts as unread.
        read_flag = pub.notes['read'] if 'read' in pub.notes else 'UNREAD'
        if show_unread_only and read_flag != 'UNREAD':
            continue

        # Where this key stands relative to the draft and the saved lists.
        included_flag = pub_key in pub_doc.string
        not_included_by_topic = topics_data['not_included']
        deliberately_not_included = (
            pub_key in not_included_by_topic[pm['variation']]
            or pub_key in not_included_by_topic['global']
        )
        intended_to_include = (
            pub_key in topics_data['intended_to_include'][pm['variation']]
        )

        hidden = (
            ( do_not_show_included and included_flag )
            or ( do_not_show_not_included and deliberately_not_included )
            or ( do_not_show_intended_to_include and intended_to_include )
        )
        if hidden:
            continue

        # Unofficial publications are not printed, but they still advance
        # the counter below.
        if not isinstance( pub, publication.UnofficialPublication ):
            print( '{} -- {}'.format( n_shown, pub_key, ) )
            print( pub.citation['title'] )
            print( pub.citation['author'] )
            print( 'Related rank: {}.{}'.format( rank, central_publication ) )
            print( 'Read: {}'.format( ''.join( read_flag ) ) )
            print( 'Included: {}'.format( included_flag ) )
            print( pub.citation['ENTRYTYPE'] + '\n' )
            print( pub.points_str() + '\n\n' )

        n_shown += 1


## Conservative Reading List
A reading list of the publications most similar to the combined abstracts of the topic's publications, optionally supplemented with chosen search words.

### Create and add average vector

In [18]:
# Seed text for the search; put chosen words here if wanted.
search_str = ""

In [19]:
# Append the text of every publication that defines this topic.
for key_i in pm['publications']:
    search_str += a[key_i].points_str()

In [20]:
# Append the extra search words from the config, when it provides any.
if 'search_str' in pm:
    search_str = search_str + pm['search_str']

In [21]:
# Stand-in publication for the topic, keyed by the variation name.
p = publication.UnofficialPublication(pm['variation'])

In [22]:
# Use the combined search text as this publication's abstract.
p.process_abstract(abstract_str=search_str)

In [23]:
# Register the stand-in publication in the large atlas under the variation name.
a.data[ pm['variation'] ] = p

In [24]:
# Refresh from the Zotero atlas, then drop duplicates, keeping the Zotero
# entries whenever there is a choice.
a.update(a_zotero)
zotero_keys = list(a_zotero.data.keys())
a.prune_duplicates(preferred=zotero_keys)

100%|██████████| 2041/2041 [00:00<00:00, 116901.43it/s]
100%|██████████| 27147/27147 [00:00<00:00, 117585.65it/s]
100%|██████████| 1960/1960 [00:00<00:00, 119716.84it/s]
100%|██████████| 27147/27147 [00:00<00:00, 110187.36it/s]


In [25]:
# Recompute the vectorization (overwriting the saved one) now that the atlas
# contains the stand-in publication, and rebuild the cartographer from it.
vp_dict = a.vectorize( overwrite = True, projection_fp = 'pass' )
c = cartography.Cartographer(**vp_dict)

Vectorizing text...
    Retrieving publication data...


100%|██████████| 29107/29107 [00:00<00:00, 135283.79it/s]


    Calculating vectorization...


### Produce reading list

In [26]:
# Rank every publication by similarity to the topic's stand-in publication.
sorted_cospsi, sorted_publications = sort_by_similarity(
    target_key = pm['variation'],
    c = c,
)

In [27]:
# Reload the draft so the "included" check sees its current text.
pub_doc = tex.Tex(filepath=pm['tex_fp'])

In [28]:
def update_topics_data( key, list_name, topic=pm['variation'] ):
    '''Add a publication key to one of the saved topic lists and write the
    result to disk.

    The key is stored at most once. Existing entries keep their order and a
    new key goes at the end, so the saved JSON does not get reshuffled from
    one call to the next.

    Args:
        key (str): Publication key to record.
        list_name (str): Which top-level entry of topics_data to update,
            e.g. 'not_included' or 'intended_to_include'.
        topic (str): Topic the key is filed under. The default is the value
            pm['variation'] had when this function was defined.
    '''

    lists_by_topic = topics_data[list_name]
    existing = lists_by_topic[topic] if topic in lists_by_topic else []

    # dict.fromkeys removes duplicates like set() does, but keeps the
    # insertion order, which set() does not.
    lists_by_topic[topic] = list( dict.fromkeys( list( existing ) + [ key, ] ) )

    topics_data.to_json( topics_fp )

# Reviewed and deliberately left out for the current variation.
update_topics_data( key = '2018Natur.562..229W', list_name = 'not_included' )

In [32]:
# Deliberately left out under the 'global' topic.
update_topics_data(
    key = 'VanDeVoort2012',
    list_name = 'not_included',
    topic = 'global',
)

In [34]:
# Publications marked as intended for inclusion.
update_topics_data( key = '2009ApJ...694..314A', list_name = 'intended_to_include' )

update_topics_data(
    key = 'Wisotzki2018',
    list_name = 'intended_to_include',
    topic = 'lyman_alpha_halos',
)

In [35]:
# Conservative list: consider ranks up to 25 relative to the topic itself,
# hiding publications already cited or already decided on.
print_sorted_publications(
    sorted_cospsi = sorted_cospsi,
    sorted_publications = sorted_publications,
    kernel_size = 25,
    a = a,
    do_not_show_included = True,
    do_not_show_not_included = True,
    central_publication = pm['variation'],
)

1 -- 2012MNRAS.426.1073H
{Energetic galaxy-wide outflows in high-redshift ultraluminous infrared galaxies hosting AGN activity}
{Harrison}, C.~M. and {Alexander}, D.~M. and {Swinbank}, A.~M. and {Smail}, Ian and {Alaghband-Zadeh}, S. and {Bauer}, F.~E. and {Chapman}, S.~C. and {Del Moro}, A. and {Hickox}, R.~C. and {Ivison}, R.~J. and {Men{\'e}ndez-Delmestre}, Kar{\'\i}n. and {Mullaney}, J.~R. and {Nesvadba}, N.~P.~H.
Related rank: 6.lyman_alpha_halos
Read: UNREAD
Included: False
article

We present integral field spectroscopy observations, covering the [O III] λλ4959, 5007 emission-line doublet of eight high-redshift (z = 1.4-3.4) ultraluminous infrared galaxies (ULIRGs) that host active galactic nucleus (AGN) activity, including known submillimetre luminous galaxies. The targets have moderate radio luminosities that are typical of high-redshift ULIRGs (L<SUB>1.4 GHz</SUB> = 10<SUP>24</SUP>-10<SUP>25</SUP> W Hz<SUP>-1</SUP>) and therefore are not radio-loud AGNs. We decouple kinematic

## Extensive Reading List
A separate reading list for each and every one of the topic's publications.

In [None]:
# Parallel lists with one entry per central publication in pm['publications']:
# an atlas, the sorted similarities, and the sorted publication keys.
a_pubs = []
sorted_cospsis = []
sorted_publications = []
for key_i in pm['publications']:

    # Atlas created without loading saved data, seeded with the central publication.
    a_i = atlas.Atlas( atlas_dir, load_bibtex = False, load_atlas_data = False )
    a_i.data[key_i] = a[key_i]

    # Rank everything against this central publication.
    sorted_cospsi_i, sorted_publications_i = sort_by_similarity( key_i, c )
    sorted_cospsis.append( sorted_cospsi_i )
    sorted_publications.append( sorted_publications_i )

    # Copy the ranked publications into this atlas, taking the Zotero entry
    # when there is one and the large-atlas entry otherwise. Keys found in
    # neither are left out.
    for key in sorted_publications_i:
        if key in a_zotero.data:
            source_data = a_zotero.data
        elif key in a.data:
            source_data = a.data
        else:
            continue
        a_i.data[key] = source_data[key]

    zotero_keys = list( a_zotero.data.keys() )
    a_i.prune_duplicates( preferred = zotero_keys )

    a_pubs.append( a_i )

In [None]:
# Reload the draft so the "included" check sees its current text.
pub_doc = tex.Tex(filepath=pm['tex_fp'])

In [None]:
# Record the key through the shared helper rather than appending directly:
# the helper stores each key only once (so re-running this cell does not add
# a duplicate), creates the topic list if it is missing, and saves to disk.
update_topics_data( '2017MNRAS.466.3460V', 'not_included', pm['variation'] )

In [None]:
# Extensive list: one reading list per central publication, each printed from
# that publication's own atlas and ranking.
for pub_index in range( len( a_pubs ) ):

    print_sorted_publications(
        sorted_cospsi = sorted_cospsis[pub_index],
        sorted_publications = sorted_publications[pub_index],
        kernel_size = pm['kernel_size'],
        a = a_pubs[pub_index],
        do_not_show_included = True,
        do_not_show_not_included = True,
        central_publication = pm['publications'][pub_index],
    )