# 3. CREATE DFR-BROWSER

### This notebook will create a dfr-browser using model data generated in notebook 2.

## SETTINGS

In [None]:
import csv
import os

## project directory: the kernel's current working directory.
## os.getcwd() is what the %pwd magic returns, but keeps this line plain Python.
project_dir = os.getcwd()
print(project_dir)

## import global project settings from the project's `settings` module
## NOTE(review): later cells use `metadata_dir` and `metadata_file_reorder`, which
## are not defined in this notebook -- presumably they come from this star import.
from settings import *

## CREATE METADATA: Create DFR csv metadata from json files

In [None]:
%%time 

## CREATE METADATA FROM JSON FILES

import json

## Delete old metadata files
!rm -fr {metadata_dir}
!mkdir -p {metadata_dir}

json_directory = 'caches/json/'

## DEFINE METADATA STRINGCLEANER

import string
import unidecode

## MAP FIELDS FROM JSON TO DFRB METADATA

## id, publication, pubdate, title, articlebody, author, docUrl, wordcount

## idx       ->  id
## title     ->  title
##           ->  author
## pub       ->  publication
##           ->  docUrl
## length    ->  wordcount
## pub_date  ->  pubdate

## content   ->  articlebody

csv.field_size_limit(100000000)

metadata_csv_file = 'caches/metadata/metadata-dfrb.csv'

# ## infieldnames provides names for the original column order
# infieldnames = 'id', 'publication', 'pubdate', 'title', 'articlebody', 'pagerange', 'author', 'docUrl', 'wordcount'
# ## outfieldnames re-orders that name list into a new column order
# outfieldnames = 'id', 'title', 'author', 'publication', 'docUrl', 'wordcount', 'pubdate', 'pagerange'


with open(metadata_csv_file, 'w') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
#   csvwriter.writerow(['id'] + ['publication'] + ['pubdate'] + ['title'] + ['articlebody'] + ['author'] + ['docUrl'] + ['wordcount'])
#   csvwriter.writerow(['id'] + ['title'] + ['author'] + ['publication'] + ['docUrl'] + ['wordcount'] + ['pubdate'] + ['pagerange'])
    csvwriter.writerow(['id'] + ['title'] + ['author'] + ['journaltitle'] + ['volume'] + ['issue'] + ['pubdate'] + ['pagerange'])

    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))
    
    idx=0
    for filename in sorted_json:

        # log: preview the first and last files only to prevent log overflow
        if(idx<5 or idx > len(sorted_json)-5):
            print(idx, ':', filename, '\n')
        if(idx==5 and len(sorted_json)>10):
            print('...\n')
            
        with open(os.path.join(json_directory, filename)) as f:
            j = json.loads(f.read())
            if not 'pagerange' in j:
                j['pagerange'] = 'no-pg'
            if not 'author' in j:
                j['author'] = 'unknown'
            if not 'volume'in j:
                j['volume'] = 'no-vol'
            if not 'issue' in j:
                j['issue'] = 'no-issue'
            if not 'pub_date' in j:
                j['pub_date'] = 'none'

            # write article metadata to csv
            # csvwriter.writerow([idx] + [j['title']] + [] + [j['pub']] + [] + [j['length']] + [j['pub_date']])
            csvwriter.writerow(['json/' + filename] + [j['title']] + [j['author']] + [j['pub']] + [j['volume']] + [j['issue']] + [j['pub_date']] + [j['length']])
        
        idx = idx+1
        
print('\n\n----------Time----------')

Check the metadata files before creating the browser

In [None]:
# Sanity check: list the metadata directory and show the top of one metadata file.
# NOTE(review): `metadata_dir` and `metadata_file_reorder` are not defined in this
# notebook; presumably they come from the settings import -- confirm.
!echo CHECK METADATA
!echo
# every file currently in the metadata directory, one name per line
!echo {metadata_dir} :
!ls -1 {metadata_dir}
!echo
# first five lines of the file named by metadata_file_reorder
!echo {metadata_file_reorder} :
!head -n 5 {metadata_file_reorder}
!echo

## CREATE BROWSER: Create files needed for dfr-browser

In [None]:
import csv

browser_meta_file_temp = 'caches/metadata/meta.temp.csv'
browser_meta_file = 'caches/metadata/meta.csv'

#write csv dfr-browser needs
with open(metadata_csv_file, 'r') as csv_in:
    csvreader = csv.reader(csv_in, delimiter=',')
    next(csvreader)  # skip header row
    with open(browser_meta_file_temp, 'w') as csv_out:
        # enforce quoted fields
        csvwriter = csv.writer(csv_out, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in csvreader:
            csvwriter.writerow(row)
            
with open(browser_meta_file_temp, 'r') as fin:
    with open(browser_meta_file, 'w') as fout:
        for line in fin:
            fout.write(line.replace(',"",', ',NA,'))

!rm {browser_meta_file_temp}
!rm -r browser
# copy dfrbrowser template from scripts to project browser folder
!cp -r scripts/dfrbrowser-full/ browser/
# move and rename customized minimized js file
!mv browser/js/dfb.min.js.custom browser/js/dfb.min.js
!mkdir browser/data/

# create and move files for dfr-browser
!scripts/dfrbrowser-full/bin/prepare-data convert-state caches/model/topic-state.gz --tw browser/data/tw.json --dt browser/data/dt.json.zip
!scripts/dfrbrowser-full/bin/prepare-data info-stub -o browser/data/info.json
!cp caches/model/topic_scaled.csv browser/data


### Expected output: 
`rm: cannot remove 'browser': No such file or directory` (if running cell for the first time)    
`beta value, not saved in a file: 0.01529451469574406`    
`Wrote topic-words information to browser/data/tw.json`    
`Wrote sparse doc-topics to browser/data/dt.json.zip`        
`Created stub file in browser/data/info.json`    

Copy meta.csv to browser/data, zip it as meta.csv.zip, and delete the unzipped copy

In [None]:
# Package the browser metadata as browser/data/meta.csv.zip.
# remove any archive left over from a previous run (errors on the first run)
!rm browser/data/meta.csv.zip
# `browser_meta_file` is set in the cell that creates the browser files
!cp {browser_meta_file} browser/data/
# -j stores the file without its directory path, i.e. as plain meta.csv
!zip -j browser/data/meta.csv.zip browser/data/meta.csv
# only the zipped copy stays in browser/data
!rm browser/data/meta.csv

### Expected output:
`rm: cannot remove 'browser/data/meta.csv.zip': No such file or directory` (if running cell for the first time)    
 `adding: meta.csv (deflated x%)`   

Copy json cache into local browser for links

In [None]:
# Refresh the browser's copy of the json cache.
# drop any copy left from an earlier run
!if [ -d browser/json ]; then rm -rf browser/json; fi
# recreate browser/json and copy caches/json into browser/
!mkdir -p browser/json && cp -rf caches/json browser/
# set mode 755 on every copied file (-print0 / -0 keeps odd filenames intact)
!find browser/json -type f -print0 | xargs -0 chmod 755

Tweak default index.html to link to JSON, not JSTOR

In [None]:
fpath_html = "browser/index.html"

# Load the page and swap the link label in one step ...
with open(fpath_html, 'r') as html_in:
    filedata = html_in.read().replace('on JSTOR', 'JSON')

# ... then overwrite the page with the patched text.
with open(fpath_html, 'w') as html_out:
    html_out.write(filedata)

Print a link for viewing the browser live on the server, derived from the current working directory.

In [None]:
import os

# Build the live-browser URL from the part of the project path that follows
# "/write/". `project_dir` is set in the SETTINGS cell.
project_name = os.path.basename(project_dir)

# partition() keeps everything after the first "/write/" and, unlike
# split(...)[1], does not raise when the path has no such segment.
_, write_marker, project_reldir = project_dir.partition("/write/")

if write_marker:
    print(project_reldir)
    project_link = "http://harbor.english.ucsb.edu:10001/" + project_reldir + "/browser/"
    ## Can't get HTML display to work, so hack:
    print("To view the browser live: " + project_link)
else:
    # Without a "/write/" segment there is nothing to build the link from.
    project_reldir = None
    project_link = None
    print("Cannot build a live browser link: '/write/' not found in " + project_dir)

## ZIP BROWSER: Create a zipped copy of the browser for export

In [None]:
!zip browser.zip -r browser

Run the following cell for instructions about how to download and run your local dfrbrowser copy.

In [None]:
from IPython.display import display, HTML

# Step-by-step download instructions, rendered as HTML in the cell output.
msg = '''
<h2>Download</h2>
<p>To download and view the browser through a webserver hosted on your local machine:</p>
<ol>
    <li><a href="browser.zip" target='new'>Download browser.zip</a></li>
    <li>Unzip browser.zip</li>
    <li>Open a shell/terminal, and navigate to the browser directory you just downloaded</li>
    <li>On Linux / OSX, launch local webserver by running:<br><code>./bin/server</code></li>
    <li>View from your local webserver: <a href='http://localhost:8888/' target='_blank'>http://localhost:8888/</a></li>
    </ol>
'''
display(HTML(msg))


## NEXT NOTEBOOKS

In [None]:
from IPython.display import display, HTML

# Links to the follow-up notebooks on the notebook server. `project_dir` is set
# in the SETTINGS cell; the '/home/jovyan/' prefix is stripped to form the URL path.
write_project_dir = project_dir.replace('/home/jovyan/', '')
next_link = 'http://harbor.english.ucsb.edu:10000/notebooks/' + write_project_dir

# (notebook file, link text, what the notebook is for)
next_notebooks = [
    ('4_customize_dfrbrowser.ipynb', 'Notebook 4', 'to customize your dfrbrowser'),
    ('5_browser_pyldavis.ipynb', 'Notebook 5', 'to make a PyLDAVis visualization'),
    ('6_browser_topic_bubbles.ipynb', 'Notebook 6', 'to make a topic bubbles visualization'),
]

# <strong> is closed inside <a>, and the list is not wrapped in <p>, so the
# markup is properly nested.
next_items = ''.join(
    '<li>Go to <a href="' + next_link + '/' + notebook_file + '" target="_blank">'
    + '<strong>' + link_text + '</strong></a> ' + purpose + '.</li>'
    for notebook_file, link_text, purpose in next_notebooks
)

next_link_html = HTML('<h2>Next:</h2><ul>' + next_items + '</ul>')
display(next_link_html)
       
