# TOPIC MODELING NOTEBOOK
## Run top cell, then click its "RUN ALL" button

In [None]:
%%HTML
<button id="do_run_all" style="font-size:40;">RUN ALL</button>
<script>
// Click handler for the RUN ALL button: publishes this page's port and the
// notebook URL bases for ports 9999 and 10000 to the Python kernel as the
// variables `port`, `url_9999` and `url_10000`, then clicks the element with
// id "run_all_cells" (presumably the notebook's Run All menu item -- confirm).
$("#do_run_all").click(
    function () {
        // assign port to Python variable
        // NOTE(review): location.port is an empty string on a default port, which
        // would make this the incomplete Python statement "port = " -- confirm the
        // notebook is only ever served on an explicit port.
        var port_command = "port = " + location.port + "";
        IPython.notebook.kernel.execute(port_command);
        // write notebook url bases for target ports
        // (an <a> element is used as a URL parser; the href is the current URL
        // with everything from the final "/" onwards removed)
        var url_parser = document.createElement("a");
        url_parser.href = location.href.substring(0, location.href.lastIndexOf("/"));
        url_parser.port = "9999";
        var url_9999_command = "url_9999 = '" + url_parser.href + "'";
        url_parser.port = "10000";
        var url_10000_command = "url_10000 = '" + url_parser.href + "'";
        // assign to Python variables
        IPython.notebook.kernel.execute(url_9999_command);
        IPython.notebook.kernel.execute(url_10000_command);
        // in %%javascript cell only:
        // element.html(port_command + '<br>' + url_9999_command + '<br>' + url_10000_command);
        $("#run_all_cells").click();
    });
</script>

In [None]:
## IMPORT

import csv
import glob
import os
import shutil

## SETTINGS

## project directory: the notebook's current working directory (IPython %pwd magic)
project_dir = %pwd
print(project_dir)

## import global project settings from the settings module
## NOTE(review): the wildcard import hides which names it provides. Later cells use
## names that are never assigned in this notebook (e.g. metadata_dir,
## text_files_clean_dir, model_dir, model_file) -- presumably they are defined in
## settings; confirm.
from settings import *

## Detect port / path / url environment

In [None]:
%%javascript

// Manual fallback for when the RUN ALL button was not used: publish the page's
// port and the notebook URL bases for ports 9999 and 10000 to the Python kernel.

// port the notebook is currently served on
var notebook_port = location.port;

// current URL with everything from the final "/" onwards removed
var base_url = location.href.substring(0, location.href.lastIndexOf("/"));

// an <a> element doubles as a URL parser whose port can be swapped
var anchor = document.createElement("a");
anchor.href = base_url;

// Python assignment statements, in the order they are sent to the kernel
var commands = ["port = " + notebook_port];
["9999", "10000"].forEach(function (target_port) {
    anchor.port = target_port;
    commands.push("url_" + target_port + " = '" + anchor.href + "'");
});

// assign to Python variables
commands.forEach(function (command) {
    IPython.notebook.kernel.execute(command);
});

// display results
element.html(commands.join('<br>'));

In [None]:
## Confirm that the JavaScript cells published port / url_9999 / url_10000
## to the kernel; report and re-raise if any of them is still undefined.
try:
    print(port)
    print(url_9999)
    print(url_10000)
except NameError:
    print("Not defined.")
    raise

In [None]:
# filesystem (non-url) root: the port-9999 server gets the extra "/work" level
jupter_root = "/home/jovyan" + ("/work" if port == 9999 else "")
print('jupter_root =', jupter_root)

In [None]:
%%javascript
// Offer a link to this same notebook on the other server port (9999 <-> 10000).
var url_parser = document.createElement("a");
url_parser.href = location.href;

// toggle the port: 10000 becomes 9999, anything else becomes 10000
if (url_parser.port == "10000") {
    url_parser.port = "9999";
} else {
    url_parser.port = "10000";
}

// The href attribute is closed with a single quote character (the original emitted
// href="..."" which is malformed markup), and the paragraph is now closed.
element.html('<p>If you wish, <strong>save first</strong> and then <a href="' + url_parser.href + '"><strong>switch to this notebook on ' + url_parser.port + '</strong></a>.</p>');

## BROWSE: search zip filenames for keywords

Choose search_text to filter available data files.

In [None]:
## substring a zip filename must contain to be listed below (case-sensitive `in` test)
search_text='mexico'

Run the cell and review the results.

In [None]:
import os

## Walk the data directory and print every matching zip, formatted as a
## datafile_list literal that can be pasted into the LIST cell.
filespath = jupter_root + '/data/'
print("datafile_list = [")
for (walk_dir, _subdirs, walk_files) in os.walk(filespath):
    ## path of the walked directory relative to filespath
    relative_dir = walk_dir.split(filespath)[1]
    for candidate in walk_files:
        if not candidate.endswith('.zip'):
            continue
        if search_text not in candidate:
            continue
        filepath = os.path.join(relative_dir, candidate)
        print("    '{}',".format(filepath))
print("                 ]")

## LIST: define which zips will be used to import JSON files

Optionally cut-paste the entire cell output and replace the datafile_list array in the following cell.

In [None]:
## directory holding the source zip archives
jsondatadir = jupter_root + '/data/data-new/'

## zip archives (relative to jsondatadir) whose JSON files will be imported
datafile_list = [
    '164282_deseretmorningnewssaltlakecity_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
    '6742_thenewyorktimes_bodypluralhumanitiesorhleadpluralhumanities_1980-01-01_1980-12-31.zip',
    '164282_deseretmorningnews_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
    '300814_theforward_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
    '438278_thefreepressfernie_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
]


## IMPORT: copy JSON from zip files to cache

JSON files will be stored in the project's caches/json directory. The original zip source data remains untouched.

In [None]:
%%time 

!rm -r caches/json
!mkdir -p caches/json

for datafile in datafile_list:
    datapath = jsondatadir + datafile
    !unzip -j -o -u "{datapath}" -d caches/json > /dev/null

!ls caches/json | wc -l
    
print('\n\n----------Time----------')

## FILTER: delete non-matching JSON

If you want to filter out any articles that do not contain a required keyword or phrase -- e.g. 'humanities' -- then enter it here (it is matched case-insensitively as a regular expression):

In [None]:
## pattern searched for (case-insensitive regular expression) in each article's
## 'content'; leave empty to skip filtering
required_phrase = ''

Run the filter to delete JSON files that do not match. If no filter is defined, this step will be skipped.

In [None]:
%%time

import os, re, json

## Delete every cached article whose 'content' does not match required_phrase.
## required_phrase is applied as a case-insensitive regular expression.
## Articles that have no 'content' key (e.g. removed by a previous scrub run)
## cannot be checked; they are kept and counted instead of aborting the run.
if required_phrase:

    json_directory = 'caches/json/'
    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))

    del_count = 0
    skip_count = 0
    for filename in sorted_json:
        fpath = os.path.join(json_directory, filename)
        with open(fpath) as f:
            json_decoded = json.load(f)
        ## the file is closed at this point: removing a file that is still
        ## open raises an error on Windows
        if 'content' not in json_decoded:
            skip_count += 1
            continue
        if not re.search(required_phrase, json_decoded['content'], re.IGNORECASE):
            os.remove(fpath)
            del_count += 1
            ## progress indicator
            if(del_count%10==0):
                print('. ', end='')
    ## documents that were checked and kept (only the .json files are counted)
    new_num_docs = len(sorted_json) - del_count - skip_count
    print('Number of documents deleted: ' + str(del_count))
    if skip_count:
        print("Number of documents skipped (no 'content' key): " + str(skip_count))
    print('Number of documents containing "' + required_phrase + '": ' + str(new_num_docs))
else:
    print('No required phrase, no documents deleted.')


print('\n\n----------Time----------')

## SCRUB: add scrubbed content to JSON

Scrubbing is performed on each article JSON file, and the results are stored in a new key in the JSON file.

-  To perform scrubbing, set `do_scrub` to True.
-  If an article is already scrubbed it will be skipped unless `do_scrub_rescrub` is True.
-  To reduce the JSON cache size, set `do_scrub_delete_original_content` to True. If the original content is deleted, scrubbing cannot be repeated without re-importing the JSON from the zip files above.

In [None]:
## run the scrub step at all
do_scrub = True
## also re-scrub articles that already have a 'content_scrubbed' key
do_scrub_rescrub = False
## once 'content_scrubbed' exists, remove the original 'content' key from the JSON
do_scrub_delete_original_content = True 

Run to scrub.

In [None]:
%%time

import json
from scripts.scrub.scrub import scrub

## Add a 'content_scrubbed' key to each cached article and, if requested,
## drop the original 'content'. Only files that actually changed are rewritten.
if not do_scrub:
    print('Skipping scrub.')
else:
    json_directory = 'caches/json/'
    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))

    scrub_count = 0
    for filename in sorted_json:
        fpath = os.path.join(json_directory, filename)
        with open(fpath) as f:
            json_decoded = json.load(f)

        ## scrub when there is content and it is new (or a rescrub is forced)
        needs_scrub = 'content' in json_decoded and (
            'content_scrubbed' not in json_decoded or do_scrub_rescrub)
        if needs_scrub:
            json_decoded['content_scrubbed'] = scrub(json_decoded['content'])

        ## evaluated after scrubbing, so a freshly scrubbed article qualifies too
        drop_original = (do_scrub_delete_original_content
                         and 'content_scrubbed' in json_decoded
                         and 'content' in json_decoded)
        if drop_original:
            json_decoded.pop('content', None)

        scrub_changed = bool(needs_scrub or drop_original)
        if not scrub_changed:
            continue

        with open(fpath, 'w') as json_file:
            json.dump(json_decoded, json_file)
        scrub_count += 1
        ## progress indicator
        if scrub_count % 100 == 0:
            print('. ', end='')
    print('Scrubbed ' + str(scrub_count) + ' files.')

print('\n\n----------Time----------')

## DE-DUPLICATE

**Deduplication is currently disabled, as it does not have an interface for large collections of JSON files.**

In [None]:
## switch for both de-duplication cells below (detection and deletion)
do_dedupe = False

In [None]:
## DE-DUPLICATE
## When do_dedupe is True, runs the script {dedup_dir}/{dedup} over the text files
## in text_files_clean_dir, writing a CSV report and a log into dedup_dir.
## NOTE(review): in a top-to-bottom run the text files are only written by the
## EXPORT cell further down (which also empties text_files_clean_dir first), so
## this cell would see whatever an earlier run left behind -- confirm ordering.

## For help on script options:
## %run scripts/deduplicate/corpus_compare.py -h 

if do_dedupe:

    print(project_dir)
    print(dedup_dir)
    print(dedup_name)
    
    ## delete previous results
    ## NOTE(review): these lines remove files named after dedup_output, while the
    ## run below writes files named after dedup_name -- confirm both names are
    ## defined in settings and whether they are meant to be the same.
    !rm -f {dedup_dir}/{dedup_output}.csv
    !rm -f {dedup_dir}/{dedup_output}.log
    !rm -f {dedup_output}.log

    ## -i input directory, -o output csv, -l log file
    !mkdir -p {text_files_clean_dir}
    %run {dedup_dir}/{dedup} -i {text_files_clean_dir}/ -o {dedup_dir}/{dedup_name}.csv -l {dedup_dir}/{dedup_name}.log

## --------------
## FOR DockerFile
## --------------
## relies on sklearn
## need to pip install or pip2 install or conda install scikit-learn?

else:
    print('Skipping de-deuplicate')



Delete duplicates

In [None]:
## DELETE DUPLICATES
## Reads the de-duplication CSV report and removes each file named in column 5.
import os
import csv

csv.field_size_limit(100000000)

if not do_dedupe:
    print('Skipping de-deuplicate')
else:
    ## report location relative to the project directory
    dedup_report = dedup_dir + '/' + dedup_name + '.csv'
    with open(project_dir + '/' + dedup_report, 'r') as fin:
        cfin = csv.reader(fin)
        next(cfin) # skip header
        for row in cfin:
            duplicate_path = row[5]
            if os.path.isfile(duplicate_path):
                print('Deleting: ' + duplicate_path)
                os.remove(duplicate_path)
            else:
                print('Missing:  '+ duplicate_path)
    print('\n-----\nDuplicates deleted from:', dedup_report)

## EXPORT: MALLET text files and DFR csv metadata

In [None]:
%%time 

## CREATE METADATA FROM JSON FILES

import json

## Delete old metadata files
## (rm -fr followed by mkdir -p leaves an empty directory, whether or not it existed)
!rm -fr {metadata_dir}
!mkdir -p {metadata_dir}

## Delete old text files
!rm -fr {text_files_clean_dir}
!mkdir -p {text_files_clean_dir}

## directory of cached article JSON files that the export below reads from
json_directory = 'caches/json/'

## DEFINE METADATA STRINGCLEANER

import string
import unidecode

def string_cleaner(unistr):
    """Transliterate ``unistr`` with unidecode and keep only the characters
    found in ``string.printable``; returns the resulting string."""
    return ''.join(
        char for char in unidecode.unidecode(unistr) if char in string.printable
    )

## MAP FIELDS FROM JSON TO DFRB METADATA

## id, publication, pubdate, title, articlebody, author, docUrl, wordcount

## idx       ->  id
## title     ->  title
##           ->  author
## pub       ->  publication
##           ->  docUrl
## length    ->  wordcount
## pub_date  ->  pubdate

## content   ->  articlebody


csv.field_size_limit(100000000)

## NOTE(review): this path is hard-coded, while the directory this cell empties and
## recreates is metadata_dir -- confirm metadata_dir is 'caches/metadata'.
metadata_csv_file = 'caches/metadata/metadata-dfrb.csv'

## placeholder values for optional fields that an article's JSON may lack
metadata_defaults = (
    ('pagerange', 'no-pg'),
    ('author', 'unknown'),
    ('volume', 'no-vol'),
    ('issue', 'no-issue'),
)

## Write one metadata row and one plain-text body file per cached article.
## newline='' hands line-ending control to the csv module, as its documentation
## requires; without it, platforms that translate '\n' get '\r\r\n' row endings.
with open(metadata_csv_file, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    csvwriter.writerow(['id', 'title', 'author', 'journaltitle', 'volume', 'issue', 'pubdate', 'pagerange'])

    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))

    ## body-file ids are zero-padded to a common width, computed once
    num_docs = len(sorted_json)
    id_width = len(str(num_docs))

    for idx, filename in enumerate(sorted_json):

        # log: preview the first and last files only to prevent log overflow
        if(idx<5 or idx > num_docs-5):
            print(idx, ':', filename, '\n')
        if(idx==5 and num_docs>10):
            print('...\n')

        with open(os.path.join(json_directory, filename)) as f:
            j = json.load(f)
        for field, placeholder in metadata_defaults:
            j.setdefault(field, placeholder)

        # write article metadata to csv
        ## NOTE(review): the last header column is 'pagerange' but the value written
        ## is j['length']; the 'pagerange' placeholder is never used -- confirm intended.
        csvwriter.writerow(['json/' + filename, j['title'], j['author'], j['pub'],
                            j['volume'], j['issue'], j['pub_date'], j['length']])

        # name article body file
        padded_id = str(idx).zfill(id_width)

        # write article body file to txt, preferring the scrubbed text when present
        with open(project_dir+'/' + text_files_clean_dir + '/'+ padded_id + '_.txt', 'w') as outfile:
            if 'content_scrubbed' in j:
                outfile.write(string_cleaner(j['content_scrubbed']))
            else:
                outfile.write(string_cleaner(j['content']))

print('\n\n----------Time----------')

In [None]:
## CHECK METADATA
## Sanity check of the export: list the metadata directory, preview the first
## lines of the metadata file, and show the head and tail of the text-file listing.
## NOTE(review): metadata_file_reorder is not assigned anywhere in this notebook --
## presumably it comes from settings; confirm it points at the CSV written above.

!echo CHECK METADATA
!echo
!echo {metadata_dir} :
!ls -1 {metadata_dir}
!echo
!echo {metadata_file_reorder} :
!head -n 5 {metadata_file_reorder}
!echo
!echo CHECK TEXT FILES
!echo
!echo {text_files_clean_dir} :
!ls -1 {text_files_clean_dir} | head
!echo ...
!ls -1 {text_files_clean_dir} | tail


## MODEL: build mallet topic model

In [None]:
## make sure the model directory exists before mallet writes its output there
!mkdir -p {model_dir}

In [None]:
%%time 

## 1. run mallet -- import

## build the mallet import command string
mallet_import_args = '--input ' + project_dir + '/' + text_files_clean_dir + '/ ' \
  + '--output ' + project_dir + '/' + model_dir + '/' + model_file + ' ' \
  + '--keep-sequence ' \
  + '--remove-stopwords ' \
  + '--extra-stopwords ' + project_dir + '/' + stopwords_dir + '/' + stopwords_file + ' '
mallet_import_command = 'mallet import-dir ' + mallet_import_args
print(mallet_import_command+'\n')

## run mallet; capture and display output
mout = !mallet import-dir {mallet_import_args}
print('\n'.join(mout)+'\n')

print(os.listdir(project_dir + '/' + model_dir))

print('\n-----\nModel import done.')

print('\n\n----------Time----------')

In [None]:
%%time

## 2. run mallet -- train

## only generate diagnostics if feature available -- running on port 10000
if(port==10000):
    generate_diagnostics = True
else:
    generate_diagnostics = False
    
## build the mallet training command string
mallet_train_args = '--input ' + project_dir + '/' + model_dir + '/' + model_file + ' ' \
  + '--num-topics ' + model_num_topics + ' ' \
  + '--optimize-interval 10 ' \
  + '--output-state ' + project_dir + '/' + model_dir + '/' + model_state + ' ' \
  + '--output-topic-keys ' + project_dir + '/' + model_dir + '/' + model_keys + ' ' \
  + '--output-doc-topics ' + project_dir + '/' + model_dir + '/' + model_composition + ' ' \
  + '--word-topic-counts-file ' + project_dir + '/' + model_dir + '/' + model_counts
if use_random_seed == True:
  mallet_train_args += ' --random-seed ' + model_random_seed
if generate_diagnostics == True:
  mallet_train_args += ' --diagnostics-file ' + project_dir + '/' + model_dir + '/diagnostics.xml'
    
mallet_train_command = 'mallet train-topics ' + mallet_train_args
print(mallet_train_command+'\n')

print('\nRunning:\n')

## run mallet
!mallet train-topics {mallet_train_args}
    
print(os.listdir(project_dir + '/' + model_dir))

print('\n-----\nModel training done.')

if generate_diagnostics == True:
    print('A diagnostics web page will be generated soon. This feature is not yet active. In the meantime, you can view the diagnostics.xml file in your model directory.')

print('\n\n----------Time----------')

In [None]:
## Show a link that opens diagnostics.xml in the notebook server's editor view
## (only when the training cell produced diagnostics).
if not generate_diagnostics:
    print('No diagnostics generated when run on 9999.')
else:
    print('View diagnostics.xml in Edit mode:')
    ## swap the '/notebooks/' URL segment for '/edit/' to get the editor view
    edit_base_url = url_10000.replace('/notebooks/', '/edit/')
    diagnostics_edit_view = edit_base_url + '/caches/model/diagnostics.xml'
    from IPython.display import display, HTML
    link_markup = '<p><a href="' + diagnostics_edit_view + '" target="_blank"><strong>diagnostics.xml</strong></a></p>'
    browser_link_html = HTML(link_markup)
    display(browser_link_html)

In [None]:
## NEXT
## Show links to the two follow-up notebooks: the dfrbrowser builder on the
## port-9999 server and the pyLDAvis browser on the port-10000 server.

from IPython.display import display, HTML

## fixed markup interleaved with the two notebook URL bases
link_fragments = [
    '<p>The topic model is built.</p><h2>Make a <a href="',
    url_9999,
    '/4_make_topic_browser.ipynb" target="_blank"><strong>dfrbrowser</strong> topic browser</a> (on 9999)</h2><p>...or...</p><h2>Make a <a href="',
    url_10000,
    '/6_browser_pyldavis.ipynb" target="_blank"><strong>pyLDAvis</strong> topic browser</a> (on 10000)</h2>',
]
browser_link_html = HTML(''.join(link_fragments))
display(browser_link_html)


----------