# TOPIC MODELING NOTEBOOK
## Run top cell, then click its "RUN ALL" button

In [None]:
%%HTML
<button id="do_run_all" style="font-size:32px;">RUN ALL</button>
<script>
// Clicking the button sends three assignments to the Python kernel (port,
// url_9999, url_10000) and then clicks the element with id "run_all_cells".
$("#do_run_all").click(
    function () {
        // assign port to Python variable
        // NOTE(review): if location.port is an empty string (default http/https port)
        // this sends an incomplete assignment to the kernel -- confirm the notebook
        // is always served on an explicit port.
        var port_command = "port = " + location.port + "";
        Jupyter.notebook.kernel.execute(port_command);
        // write notebook url bases for target ports
        // (an <a> element is used as a URL parser; the href is the current
        // location with everything after the last "/" removed)
        var url_parser = document.createElement("a");
        url_parser.href = location.href.substring(0, location.href.lastIndexOf("/"));
        url_parser.port = "9999";
        var url_9999_command = "url_9999 = '" + url_parser.href + "'";
        url_parser.port = "10000";
        var url_10000_command = "url_10000 = '" + url_parser.href + "'";
        // assign to Python variables
        Jupyter.notebook.kernel.execute(url_9999_command);
        Jupyter.notebook.kernel.execute(url_10000_command);
        // in %%javascript cell only:
        // element.html(port_command + '<br>' + url_9999_command + '<br>' + url_10000_command);
        // trigger the notebook's run-all action
        $("#run_all_cells").click();
    });
</script>

In [None]:
## IMPORT

import csv
import glob
import os
import shutil

## SETTINGS

## project directory (current working directory of the kernel, via the %pwd magic)
project_dir = %pwd
print(project_dir)

## import global project settings: star-import of every public name in the `settings` module
## NOTE(review): this comment previously referred to config.py, but the module imported is `settings`
from settings import *

## Detect port / path / url environment

In [None]:
%%javascript

// for manual running if not using run-all button

// detect port
var port = location.port;
// assign to Python variable
var port_command = "port = " + port + "";

// write notebook url bases for target ports
var url_parser = document.createElement("a");
url_parser.href = location.href.substring(0, location.href.lastIndexOf("/"));
url_parser.port = "9999";
var url_9999_command = "url_9999 = '" + url_parser.href + "'";
url_parser.port = "10000";
var url_10000_command = "url_10000 = '" + url_parser.href + "'";

// assign to Python variables
Jupyter.notebook.kernel.execute(port_command);
Jupyter.notebook.kernel.execute(url_9999_command);
Jupyter.notebook.kernel.execute(url_10000_command);

// display results
element.html(port_command + '<br>' + url_9999_command + '<br>' + url_10000_command);

In [None]:
# Sanity check: the three names below are assigned in the kernel by the
# JavaScript cells above. Print each in turn; if any is missing, say so
# and re-raise so that a Run All stops here.
try:
    print(port)
    print(url_9999)
    print(url_10000)
except NameError:
    print("Not defined.")
    raise

In [None]:
# Filesystem root (not a URL) for data paths; a "/work" suffix is added
# only when the notebook is running on port 9999.
jupter_root = "/home/jovyan" + ("/work" if port == 9999 else "")
print('jupter_root =', jupter_root)

In [None]:
%%javascript
var url_parser = document.createElement("a");
url_parser.href = location.href;
if(url_parser.port == "10000"){
    url_parser.port = "9999";
} else {
    url_parser.port = "10000";
}
element.html('<p>If you wish, <strong>save first</strong> and then <a href="' + url_parser.href + '""><strong>switch to this notebook on ' + url_parser.port + '</strong></a>.')

// // assign to Python variable
// IPython.notebook.kernel.execute(url_switch_command);

## BROWSE: search zip filenames for keywords

Choose search_text to filter available data files.

In [None]:
# Substring that a zip filename must contain (case-sensitive `in` test) to be listed by the browse cell below.
search_text='mexico'

Run the cell and review the results.

In [None]:
import os
# Walk the data directory and print, in copy-pasteable list syntax, every
# .zip file whose name contains search_text. Paths are printed relative
# to the data directory.
filespath = jupter_root + '/data/'
print("datafile_list = [")
for (dirname, _dirs, files) in os.walk(filespath):
    for name in files:
        is_match = name.endswith('.zip') and search_text in name
        if not is_match:
            continue
        # strip the data-root prefix from the directory part
        relative_dir = dirname.split(filespath)[1]
        print("    '" + os.path.join(relative_dir, name) + "',")
print("                 ]")

## LIST: define which zips will be used to import JSON files

Optionally, copy the entire output of the cell above and paste it over the `datafile_list` assignment in the following cell.

In [None]:
# Directory that holds the source zip files; each datafile_list entry is appended to it by the import cell below.
jsondatadir = jupter_root + '/data/data-new/'
# Zip archives to import, given as paths relative to jsondatadir.
datafile_list = ['164282_deseretmorningnewssaltlakecity_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
'6742_thenewyorktimes_bodypluralhumanitiesorhleadpluralhumanities_1980-01-01_1980-12-31.zip',
'164282_deseretmorningnews_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
'300814_theforward_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip',
'438278_thefreepressfernie_bodypluralhumanitiesorhleadpluralhumanities_2017-01-01_2017-12-31.zip']


## IMPORT: copy JSON from zip files to cache

JSON files will be extracted into the `caches/json` directory of the project. The original zip source data remains untouched.

In [None]:
%%time 

!rm -r caches/json
!mkdir -p caches/json

for datafile in datafile_list:
    datapath = jsondatadir + datafile
    !unzip -j -o -u "{datapath}" -d caches/json > /dev/null

!ls caches/json | wc -l
    
print('\n\n----------Time----------')

## FILTER: delete non-matching JSON

If you want to delete any articles that do not contain a required keyword or phrase -- e.g. 'humanities' -- enter it here. The phrase is applied as a case-insensitive regular expression.

In [None]:
# Pattern every article must match to be kept; an empty string disables the filter cell below.
required_phrase = ''

Run the filter to delete JSON files that do not match. If no filter is defined, this step will be skipped.

In [None]:
%%time

import os, re, json

if required_phrase:
    
    json_directory = 'caches/json/'
    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))

    del_count = 0
    for filename in sorted_json:
        fpath = os.path.join(json_directory, filename)
        scrub_changed = False
        with open(fpath) as f:
            # json_decoded = json.load(json_file)
            json_decoded = json.loads(f.read())
            json_content = json_decoded['content']
            if not re.search(required_phrase, json_content, re.IGNORECASE):
                os.remove(os.path.join(json_directory, filename))
                del_count += 1
                if(del_count%10==0):
                    print('. ', end='')
    new_num_docs = len(os.listdir(json_directory))
    print('Number of documents deleted: ' + str(del_count))
    print('Number of documents containing "' + required_phrase + '": ' + str(new_num_docs))
else:
    print('No required phrase, no documents deleted.')


print('\n\n----------Time----------')

## SCRUB: add scrubbed content to JSON

Scrubbing is performed on each article JSON file, and the results are stored in a new key in the JSON file.

-  To perform, set this step to True.
-  If an article is already scrubbed it will be skipped unless rescrub is True.
-  To reduce the JSON cache size, set `do_scrub_delete_original_content` to True. If the original content is deleted, scrubbing cannot be repeated without re-importing the JSON from the zip files above.

In [None]:
# Switches read by the scrub cell below.
do_scrub = True  # run the scrub step at all
do_scrub_rescrub = False  # re-scrub articles that already have 'content_scrubbed'
do_scrub_delete_original_content = True  # drop 'content' once 'content_scrubbed' exists

Run to scrub.

In [None]:
%%time

import json
from scripts.scrub.scrub import scrub

if do_scrub:

    json_directory = 'caches/json/'
    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))

    scrub_count = 0
    for filename in sorted_json:
        fpath = os.path.join(json_directory, filename)
        scrub_changed = False
        with open(fpath) as f:
            # json_decoded = json.load(json_file)
            json_decoded = json.loads(f.read())
            if 'content' in json_decoded and (not 'content_scrubbed' in json_decoded or do_scrub_rescrub):
                json_decoded['content_scrubbed'] = scrub(json_decoded['content'])
                scrub_changed = True
            if do_scrub_delete_original_content and 'content_scrubbed' in json_decoded and 'content' in json_decoded:
                json_decoded.pop('content', None)
                scrub_changed = True
        if scrub_changed:
            with open(fpath, 'w') as json_file:
                json.dump(json_decoded, json_file)
            scrub_count += 1
            ## progress indicator
            if(scrub_count%100==0):
                print('. ', end='')
    print('Scrubbed ' + str(scrub_count) + ' files.')
else:
    print('Skipping scrub.')

print('\n\n----------Time----------')

## DE-DUPLICATE

**Deduplication is currently disabled, as it does not have an interface for large collections of JSON files.**

In [None]:
# Enables the two de-duplication cells below.
# NOTE(review): the note above this cell says deduplication is disabled, yet this flag is True -- confirm the intended default.
do_dedupe = True

In [None]:
## DE-DUPLICATE
## Runs the de-duplication script over caches/json/ and writes a CSV report
## and a log file into dedup_dir.
## project_dir comes from the settings cell; dedup_dir, dedup, dedup_name,
## dedup_output and text_files_clean_dir are presumably provided by the
## settings star-import -- TODO confirm.

## For help on script options:
## %run scripts/deduplicate/corpus_compare.py -h

if do_dedupe:

    print(project_dir)
    print(dedup_dir)
    print(dedup_name)
    
    ## delete previous results
    ## NOTE(review): these removals use dedup_output, while the run below
    ## writes to dedup_name -- verify both names point at the same files.
    !rm -f {dedup_dir}/{dedup_output}.csv
    !rm -f {dedup_dir}/{dedup_output}.log
    !rm -f {dedup_output}.log

    !mkdir -p {text_files_clean_dir}
    ## compare all *.json files in the cache with a threshold of 0.8
    %run {dedup_dir}/{dedup} -i caches/json/ -f *.json --threshold 0.8 -o {dedup_dir}/{dedup_name}.csv -l {dedup_dir}/{dedup_name}.log

## --------------
## FOR DockerFile
## --------------
## relies on sklearn
## need to pip install or pip2 install or conda install scikit-learn?

else:
    print('Skipping de-deuplicate')



Delete duplicates

In [None]:
## DELETE DUPLICATES
## Read the CSV report written by the de-duplication cell and delete every
## file named in its sixth column (row[5]).
import os
import csv

csv.field_size_limit(100000000)

if do_dedupe:
    dedup_csv_path = project_dir + '/' + dedup_dir + '/' + dedup_name + '.csv'
    if not os.path.isfile(dedup_csv_path):
        # the de-duplication run produced no report; nothing to delete
        print('No de-duplication results found at: ' + dedup_csv_path)
    else:
        # newline='' is what the csv module expects for files it reads
        with open(dedup_csv_path, 'r', newline='') as fin:
            cfin = csv.reader(fin)
            next(cfin, None) # skip header (tolerates an empty file)
            for row in cfin:
                if len(row) < 6:
                    # malformed / short row: no path column to act on
                    continue
                if os.path.isfile(row[5]):
                    print('Deleting: ' + row[5])
                    os.remove(row[5])
                else:
                    print('Missing:  '+ row[5])
        print('\n-----\nDuplicates deleted from:', dedup_dir + '/' + dedup_name + '.csv')

else:
    print('Skipping de-deuplicate')

## EXPORT: MALLET text files and DFR csv metadata

In [None]:
%%time 

## CREATE METADATA FROM JSON FILES
## This first part of the cell empties and recreates the two output directories.

import json

## Delete old metadata files
## NOTE(review): the CSV further down is written to the literal path
## caches/metadata/...; presumably metadata_dir is that same directory -- confirm.
!rm -fr {metadata_dir}
!mkdir -p {metadata_dir}

## Delete old text files
!rm -fr {text_files_clean_dir}
!mkdir -p {text_files_clean_dir}

## source directory of the cached JSON articles
json_directory = 'caches/json/'

## DEFINE METADATA STRINGCLEANER

import string
import unidecode

def string_cleaner(unistr):
    """Transliterate `unistr` with unidecode and drop non-printable characters.

    Only characters found in `string.printable` survive; everything else
    is removed from the transliterated text.
    """
    transliterated = unidecode.unidecode(unistr)
    return ''.join(ch for ch in transliterated if ch in string.printable)

## MAP FIELDS FROM JSON TO DFRB METADATA

## id, publication, pubdate, title, articlebody, author, docUrl, wordcount

## idx       ->  id
## title     ->  title
##           ->  author
## pub       ->  publication
##           ->  docUrl
## length    ->  wordcount
## pub_date  ->  pubdate

## content   ->  articlebody


csv.field_size_limit(100000000)

metadata_csv_file = 'caches/metadata/metadata-dfrb.csv'

# ## infieldnames provides names for the original column order
# infieldnames = 'id', 'publication', 'pubdate', 'title', 'articlebody', 'pagerange', 'author', 'docUrl', 'wordcount'
# ## outfieldnames re-orders that name list into a new column order
# outfieldnames = 'id', 'title', 'author', 'publication', 'docUrl', 'wordcount', 'pubdate', 'pagerange'

## For every cached JSON article, write one metadata row to the CSV and one
## plain-text file (scrubbed content when available) for MALLET.
## newline='' is what the csv module expects for files it writes.
with open(metadata_csv_file, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    csvwriter.writerow(['id', 'title', 'author', 'journaltitle', 'volume', 'issue', 'pubdate', 'pagerange'])

    sorted_json = sorted(f for f in os.listdir(json_directory) if f.endswith(".json"))
    num_docs = len(sorted_json)
    # zero-pad text file ids to the width of the document count
    id_width = len(str(num_docs))

    idx = 0
    for filename in sorted_json:

        # log: preview the first five and last five files only to prevent log overflow
        if idx < 5 or idx >= num_docs - 5:
            print(idx, ':', filename, '\n')
        if idx == 5 and num_docs > 10:
            print('...\n')

        with open(os.path.join(json_directory, filename)) as f:
            j = json.load(f)

        # placeholders for fields the article does not provide
        j.setdefault('pagerange', 'no-pg')
        j.setdefault('author', 'unknown')
        j.setdefault('volume', 'no-vol')
        j.setdefault('issue', 'no-issue')

        # write article metadata to csv
        # NOTE(review): the last column is headed 'pagerange' but carries j['length'];
        # kept as-is -- confirm this is what the browser expects.
        csvwriter.writerow(['json/' + filename, j['title'], j['author'], j['pub'], j['volume'], j['issue'], j['pub_date'], j['length']])

        # name article body file
        padded_id = str(idx).zfill(id_width)

        # write article body file to txt, preferring the scrubbed text
        body_key = 'content_scrubbed' if 'content_scrubbed' in j else 'content'
        with open(project_dir+'/' + text_files_clean_dir + '/'+ padded_id + '_.txt', 'w') as outfile:
            outfile.write(string_cleaner(j[body_key]))

        idx = idx+1

print('\n\n----------Time----------')

In [None]:
## CHECK METADATA
## Shell-only cell: lists the metadata directory, shows the first five lines
## of the metadata file, and shows the head and tail of the text file listing.
## metadata_file_reorder is presumably defined by the settings import -- TODO confirm.

!echo CHECK METADATA
!echo
!echo {metadata_dir} :
!ls -1 {metadata_dir}
!echo
!echo {metadata_file_reorder} :
!head -n 5 {metadata_file_reorder}
!echo
!echo CHECK TEXT FILES
!echo
!echo {text_files_clean_dir} :
!ls -1 {text_files_clean_dir} | head
!echo ...
!ls -1 {text_files_clean_dir} | tail


## MODEL: build mallet topic model

In [None]:
## make sure the model output directory exists (-p: no error if it already does)
!mkdir -p {model_dir}

In [None]:
%%time 

## 1. run mallet -- import

## build the mallet import command string
mallet_import_args = '--input ' + project_dir + '/' + text_files_clean_dir + '/ ' \
  + '--output ' + project_dir + '/' + model_dir + '/' + model_file + ' ' \
  + '--keep-sequence ' \
  + '--remove-stopwords ' \
  + '--extra-stopwords ' + project_dir + '/' + stopwords_dir + '/' + stopwords_file + ' '
mallet_import_command = 'mallet import-dir ' + mallet_import_args
print(mallet_import_command+'\n')

## run mallet; capture and display output
mout = !mallet import-dir {mallet_import_args}
print('\n'.join(mout)+'\n')

print(os.listdir(project_dir + '/' + model_dir))

print('\n-----\nModel import done.')

print('\n\n----------Time----------')

In [None]:
%%time

## 2. run mallet -- train

## only generate diagnostics if feature available -- running on port 10000
if(port==10000):
    generate_diagnostics = True
else:
    generate_diagnostics = False
    
## build the mallet training command string
mallet_train_args = '--input ' + project_dir + '/' + model_dir + '/' + model_file + ' ' \
  + '--num-topics ' + model_num_topics + ' ' \
  + '--optimize-interval 10 ' \
  + '--output-state ' + project_dir + '/' + model_dir + '/' + model_state + ' ' \
  + '--output-topic-keys ' + project_dir + '/' + model_dir + '/' + model_keys + ' ' \
  + '--output-doc-topics ' + project_dir + '/' + model_dir + '/' + model_composition + ' ' \
  + '--word-topic-counts-file ' + project_dir + '/' + model_dir + '/' + model_counts
if use_random_seed == True:
  mallet_train_args += ' --random-seed ' + model_random_seed
if generate_diagnostics == True:
  mallet_train_args += ' --diagnostics-file ' + project_dir + '/' + model_dir + '/diagnostics.xml'
    
mallet_train_command = 'mallet train-topics ' + mallet_train_args
print(mallet_train_command+'\n')

print('\nRunning:\n')

## run mallet
!mallet train-topics {mallet_train_args}
    
print(os.listdir(project_dir + '/' + model_dir))

print('\n-----\nModel training done.')

if generate_diagnostics == True:
    print('A diagnostics web page will be generated soon. This feature is not yet active. In the meantime, you can view the diagnostics.xml file in your model directory.')

print('\n\n----------Time----------')

In [None]:
## Show a link that opens diagnostics.xml in the Jupyter text editor,
## or explain why there is none.
if not generate_diagnostics:
    print('No diagnostics generated when run on 9999.')
else:
    print('View diagnostics.xml in Edit mode:')
    # swap the notebook view for the editor view in the port-10000 base URL
    diagnostics_edit_view = url_10000.replace('/notebooks/', '/edit/') + '/caches/model/diagnostics.xml'
    from IPython.display import display, HTML
    link_markup = '<p><a href="' + diagnostics_edit_view + '" target="_blank"><strong>diagnostics.xml</strong></a></p>'
    browser_link_html = HTML(link_markup)
    display(browser_link_html)

In [None]:
## NEXT
## Generate a link to the next notebook in the workflow

from IPython.display import display, HTML

# Two follow-up notebooks: the dfrbrowser builder is linked on the
# port-9999 base URL, the pyLDAvis one on the port-10000 base URL.
next_steps_markup = (
    '<p>The topic model is built.</p>'
    '<h2>Make a <a href="' + url_9999 + '/4_make_topic_browser.ipynb" target="_blank">'
    '<strong>dfrbrowser</strong> topic browser</a> (on 9999)</h2>'
    '<p>...or...</p>'
    '<h2>Make a <a href="' + url_10000 + '/6_browser_pyldavis.ipynb" target="_blank">'
    '<strong>pyLDAvis</strong> topic browser</a> (on 10000)</h2>'
)
browser_link_html = HTML(next_steps_markup)
display(browser_link_html)

## PREPARE DATA FOR DFR-BROWSER (python prepare-data) IN DEVELOPMENT

In [None]:
# Enables the dfr-browser preparation steps in the cells below that test this flag.
prepare_dfr = True

In [None]:
import csv

if prepare_dfr:

    browser_meta_file_temp = 'caches/metadata/meta.temp.csv'
    browser_meta_file = 'caches/metadata/meta.csv'

    with open(metadata_csv_file, 'r') as csv_in:
        csvreader = csv.reader(csv_in, delimiter=',')
        next(csvreader)  # skip header row
        with open(browser_meta_file_temp, 'w') as csv_out:
            # enforce quoted fields
            csvwriter = csv.writer(csv_out, delimiter=',', quoting=csv.QUOTE_ALL)
            for row in csvreader:
                csvwriter.writerow(row)
            
    with open(browser_meta_file_temp, 'r') as fin:
        with open(browser_meta_file, 'w') as fout:
            for line in fin:
                fout.write(line.replace(',"",', ',NA,'))

    !rm {browser_meta_file_temp}
    !rm -r browser
    # copy dfrbrowser template from scripts to project browser folder
    !cp -r scripts/dfrbrowser-full/ browser/
    !mv browser/js/dfb.min.js.custom browser/js/dfb.min.js
    #rename customized minimized js file
    !mv browser/js/dfb.min.js.custom browser/js/dfb.min.js
    !mkdir browser/data/
    
    #dfbOutputDir = "browser/data/"
    #!rm -r {dfbOutputDir}
    #!mkdir -p {dfbOutputDir}
    !scripts/dfrbrowser-full/bin/prepare-data convert-state caches/model/topic-state.gz --tw browser/data/tw.json --dt browser/data/dt.json.zip
    !scripts/dfrbrowser-full/bin/prepare-data info-stub -o browser/data/info.json

## Generate Topic Scaling Metadata

In [None]:
"""scale_topics.py.

Create a topic_scaled.csv file from the Mallet state file.

Combines code by Jeri E. Wieringa (https://github.com/jerielizabeth/Gospel-of-Health-Notebooks/blob/master/blogPosts/pyLDAvis_and_Mallet.ipynb)
to transform Mallet data for use with pyLDAvis and uses code derived
from pyLDAvis to calculate topic coordinates using MDS.

Configure the paths to the mallet state and topic_scaled files below.
"""

# pylint: disable=E1101
# pylint: disable=W1201


# Python imports
import gzip
import logging
import os
import numpy as np
import pandas as pd
import sklearn.preprocessing
# Set fallback for MDS scaling
try:
    from sklearn.manifold import MDS, TSNE
    sklearn_present = True
except ImportError:
    sklearn_present = False
from past.builtins import basestring
from scipy.stats import entropy
from scipy.spatial.distance import pdist, squareform


# Configure the input and output file paths:
# output_state_file is the gzipped Mallet state file that is read,
# topic_scaled_file is the CSV of scaled topic coordinates that is written.
output_state_file = os.path.join(project_dir + '/caches/model', 'topic-state.gz')
topic_scaled_file = os.path.join(project_dir + '/browser/data', 'topic_scaled.csv')


def __num_dist_rows__(array, ndigits=2):
    """Count the rows of `array` whose sum is at least 0.999.

    `ndigits` is accepted but not used.
    """
    row_totals = pd.DataFrame(array).sum(axis=1)
    short_rows = int((row_totals < 0.999).sum())
    return array.shape[0] - short_rows


class ValidationError(ValueError):
    """Raised when the topic-model inputs fail validation."""


def _input_check(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency):
    """Check that the model inputs have mutually consistent shapes.

    Returns a list of human-readable error messages when at least one
    check fails, and None when every check passes.
    """
    ttds = topic_term_dists.shape
    dtds = doc_topic_dists.shape

    # (failed?, message) pairs, evaluated in a fixed order
    checks = [
        (dtds[1] != ttds[0],
         'Number of rows of topic_term_dists does not match number of columns of doc_topic_dists; both should be equal to the number of topics in the model.'),
        (len(doc_lengths) != dtds[0],
         'Length of doc_lengths not equal to the number of rows in doc_topic_dists; both should be equal to the number of documents in the data.'),
        (ttds[1] != len(vocab),
         'Number of terms in vocabulary does not match the number of columns of topic_term_dists (where each row of topic_term_dists is a probability distribution of terms for a given topic).'),
        (len(term_frequency) != len(vocab),
         'Length of term_frequency not equal to the number of terms in the vocabulary (len of vocab).'),
        (__num_dist_rows__(topic_term_dists) != ttds[0],
         'Not all rows (distributions) in topic_term_dists sum to 1.'),
        (__num_dist_rows__(doc_topic_dists) != dtds[0],
         'Not all rows (distributions) in doc_topic_dists sum to 1.'),
    ]

    errors = [message for failed, message in checks if failed]
    return errors if errors else None


def _input_validate(*args):
    """Run `_input_check` on `args`; raise ValidationError listing any problems found."""
    problems = _input_check(*args)
    if not problems:
        return
    bullet_list = '\n'.join(' * ' + s for s in problems)
    raise ValidationError('\n' + bullet_list)


def _jensen_shannon(_P, _Q):
    """Jensen-Shannon divergence of `_P` and `_Q`.

    Averages the two relative entropies (scipy `entropy` with two
    arguments) of `_P` and `_Q` against their midpoint.
    """
    midpoint = (_P + _Q) * 0.5
    divergence_p = entropy(_P, midpoint)
    divergence_q = entropy(_Q, midpoint)
    return 0.5 * (divergence_p + divergence_q)


def _pcoa(pair_dists, n_components=2):
    """Principal Coordinate Analysis.

    AKA Classical Multidimensional Scaling
    code referenced from skbio.stats.ordination.pcoa
    https://github.com/biocore/scikit-bio/blob/0.5.0/skbio/stats/ordination/_principal_coordinate_analysis.py

    Parameters
    ----------
    pair_dists : array-like, shape (`n`, `n`)
        Pairwise distance matrix, assumed symmetric.
    n_components : int
        Number of coordinates to return per point.

    Returns
    -------
    array, shape (`n`, `n_components`)
        Real-valued coordinates. The sign of each coordinate axis is
        arbitrary.
    """
    # pairwise distance matrix is assumed symmetric
    pair_dists = np.asarray(pair_dists, np.float64)

    # double-centre the squared distance matrix
    n = pair_dists.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n
    B = - H.dot(pair_dists ** 2).dot(H) / 2

    # B is symmetric up to round-off. Averaging it with its transpose and
    # using the symmetric solver guarantees real eigenvalues/eigenvectors;
    # the general solver can return complex results, which then end up
    # in the output coordinates.
    B = (B + B.T) / 2
    eigvals, eigvecs = np.linalg.eigh(B)

    # Take first n_components of eigenvalues and eigenvectors
    # sorted in decreasing order
    ix = eigvals.argsort()[::-1][:n_components]
    eigvals = eigvals[ix]
    eigvecs = eigvecs[:, ix]

    # replace any remaining negative eigenvalues and associated eigenvectors with zeroes
    # at least 1 eigenvalue must be zero
    eigvals[np.isclose(eigvals, 0)] = 0
    negative = eigvals < 0
    if np.any(negative):
        eigvals[negative] = 0
        eigvecs[:, negative] = 0

    return np.sqrt(eigvals) * eigvecs


def js_PCoA(distributions):
    """Reduce distributions to two dimensions.

    Pairwise Jensen-Shannon divergences are computed between the rows
    and passed to Principal Coordinate Analysis (aka Classical
    Multidimensional Scaling).

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities.

    Returns
    -------
    pcoa : array, shape (`n_dists`, 2)

    """
    condensed = pdist(distributions, metric=_jensen_shannon)
    return _pcoa(squareform(condensed))


def js_MMDS(distributions, **kwargs):
    """Reduce distributions to two dimensions.

    Pairwise Jensen-Shannon divergences are computed between the rows
    and passed to Metric Multidimensional Scaling as a precomputed
    dissimilarity matrix.

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities.

    **kwargs : Keyword argument to be passed to `sklearn.manifold.MDS()`

    Returns
    -------
    mmds : array, shape (`n_dists`, 2)

    """
    condensed = pdist(distributions, metric=_jensen_shannon)
    scaler = MDS(n_components=2, random_state=0, dissimilarity='precomputed', **kwargs)
    return scaler.fit_transform(squareform(condensed))


def js_TSNE(distributions, **kwargs):
    """Perform dimension reduction.

    Works via Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Embedding

    Parameters
    ----------
    distributions : array-like, shape (`n_dists`, `k`)
        Matrix of distributions probabilities.

    **kwargs : Keyword argument to be passed to `sklearn.manifold.TSNE()`.
        `init` defaults to 'random' here unless the caller supplies it.

    Returns
    -------
    tsne : array, shape (`n_dists`, 2)

    """
    dist_matrix = squareform(pdist(distributions, metric=_jensen_shannon))
    # A precomputed distance matrix cannot be combined with PCA
    # initialisation, which is the default `init` in current scikit-learn
    # releases. Request random initialisation explicitly unless the caller
    # chose otherwise.
    kwargs.setdefault('init', 'random')
    model = TSNE(n_components=2, random_state=0, metric='precomputed', **kwargs)
    return model.fit_transform(dist_matrix)


def _df_with_names(data, index_name, columns_name):
    """Return `data` as a DataFrame with default numeric labels and named axes.

    When `data` is already a DataFrame only its values are kept, so the
    result always has a numbered index and numbered columns.
    """
    raw_values = data.values if isinstance(data, pd.DataFrame) else data
    df = pd.DataFrame(raw_values)
    df.index.name = index_name
    df.columns.name = columns_name
    return df


def _series_with_name(data, name):
    """Return `data` as a Series called `name` with a default numeric index.

    A Series passed in is renamed in place before its index is reset.
    """
    if not isinstance(data, pd.Series):
        return pd.Series(data, name=name)
    data.name = name
    # ensures a numeric index
    return data.reset_index()[name]


def _topic_coordinates(mds, topic_term_dists, topic_proportion):
    """Build the table of topic coordinates.

    Calls `mds` on `topic_term_dists`, which must yield one (x, y) pair
    per topic, and combines the result with 1-based topic numbers, a
    constant cluster id of 1 and `topic_proportion` as a percentage.
    """
    n_topics = topic_term_dists.shape[0]
    coords = mds(topic_term_dists)
    assert coords.shape == (n_topics, 2)
    # note: cluster (should?) be deprecated soon. See: https://github.com/cpsievert/LDAvis/issues/26
    columns = {
        'x': coords[:, 0],
        'y': coords[:, 1],
        'topics': range(1, n_topics + 1),
        'cluster': 1,
        'Freq': topic_proportion * 100,
    }
    return pd.DataFrame(columns)


def get_topic_coordinates(topic_term_dists, doc_topic_dists, doc_lengths, \
            vocab, term_frequency, mds=js_PCoA, sort_topics=True):
    """Compute scaled 2-D coordinates for the topics of a model.

    The inputs are normalised to pandas objects, validated for mutually
    consistent shapes, and the topic-term distributions are then passed
    to the chosen scaling function.

    Parameters
    ----------
    topic_term_dists : array-like, shape (`n_topics`, `n_terms`)
        Matrix of topic-term probabilities, where `n_terms` is
        `len(vocab)`.
    doc_topic_dists : array-like, shape (`n_docs`, `n_topics`)
        Matrix of document-topic probabilities.
    doc_lengths : array-like, shape `n_docs`
        Number of words in each document, in the same document order as
        `doc_topic_dists`.
    vocab : array-like, shape `n_terms`
        The words of the corpus used to train the model.
    term_frequency : array-like, shape `n_terms`
        Corpus-wide count of each term, in the same order as `vocab`
        and the columns of `topic_term_dists`.
    mds : function or string
        Function that takes `topic_term_dists` and returns an
        `n_topics` by 2 array. The strings `pcoa`, `mmds` and `tsne`
        (any case) select the matching function; `mmds` and `tsne` fall
        back to PCoA with a warning when sklearn is not present, as
        does any unrecognised string.
    sort_topics : bool
        Sort topics by topic proportion, largest first. Set to False to
        keep the original topic order.

    Returns
    -------
    topic_coordinates : pandas DataFrame with the scaled x and y
        coordinates.

    Raises
    ------
    ValidationError
        If the inputs fail the consistency checks.

    """
    # resolve a string `mds` to the corresponding function
    if isinstance(mds, basestring):
        requested = mds.lower()
        if requested in ('mmds', 'tsne'):
            if sklearn_present:
                mds = js_MMDS if requested == 'mmds' else js_TSNE
            else:
                logging.warning('sklearn not present, switch to PCoA')
                mds = js_PCoA
        else:
            if requested != 'pcoa':
                logging.warning('Unknown mds `%s`, switch to PCoA' % requested)
            mds = js_PCoA

    # normalise every input to a pandas object with numeric labels
    topic_term_dists = _df_with_names(topic_term_dists, 'topic', 'term')
    doc_topic_dists = _df_with_names(doc_topic_dists, 'doc', 'topic')
    term_frequency = _series_with_name(term_frequency, 'term_frequency')
    doc_lengths = _series_with_name(doc_lengths, 'doc_length')
    vocab = _series_with_name(vocab, 'vocab')
    _input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency)

    # weight each document's topic distribution by the document length
    weighted_doc_topics = (doc_topic_dists.T * doc_lengths).T
    topic_freq = weighted_doc_topics.sum()
    topic_proportion = topic_freq / topic_freq.sum()
    if sort_topics:
        topic_proportion = topic_proportion.sort_values(ascending=False)

    # put the topic-term rows in the same order as the proportions
    ordered_topic_terms = topic_term_dists.iloc[topic_proportion.index]

    return _topic_coordinates(mds, ordered_topic_terms, topic_proportion)


def extract_params(statefile):
    """Extract the alpha and beta values from the statefile.

    Only the first three lines of the gzipped file are read: the header
    line is skipped, the second line supplies alpha and the third beta.

    Args:
        statefile (str): Path to statefile produced by MALLET.
    Returns:
        tuple: (alpha, beta). alpha is the list of strings obtained by
        splitting the text after the first ":" of the alpha line on
        single spaces (so it can start with an empty string); beta is
        a float.
    Raises:
        IndexError: if an expected parameter line is missing or has no ":".

    """
    with gzip.open(statefile, 'r') as state:
        # Read line by line instead of readlines(), so the rest of the
        # state file is never decompressed into memory.
        state.readline()  # header line
        alpha_line = state.readline().decode('utf8').strip()
        beta_line = state.readline().decode('utf8').strip()
    alpha = alpha_line.split(":")[1].split(" ")
    beta = float(beta_line.split(":")[1])
    return (alpha, beta)


def state_to_df(statefile):
    """Transform state file into pandas dataframe.

    The file is read as gzipped, space-separated text. Its first line
    is used as the column header; the two lines after it (the alpha and
    beta hyperparameters) are skipped.

    NA detection is switched off so that tokens which happen to spell a
    missing-value marker (e.g. "null", "nan", "NA") stay ordinary
    strings instead of being converted to NaN.

    Args:
        statefile (str): Path to statefile produced by MALLET.
    Returns:
        dataframe: topic assignment for each token in each document of the model

    """
    return pd.read_csv(statefile,
                       compression='gzip',
                       sep=' ',
                       skiprows=[1, 2],
                       na_filter=False
                       )


def pivot_and_smooth(df, smooth_value, rows_variable, cols_variable, values_variable):
    """Pivot a long-format dataframe into a smoothed, row-normalised matrix.

    Args:
        df (dataframe): aggregated dataframe
        smooth_value: value added to every cell of the pivoted matrix to account for the priors
        rows_variable (str): name of dataframe column to use as the rows in the matrix
        cols_variable (str): name of dataframe column to use as the columns in the matrix
        values_variable(str): name of the dataframe column to use as the values in the matrix
    Returns:
        dataframe: the matrix with each row L1-normalised; row and
        column labels are reset to default integers.

    """
    pivoted = df.pivot(index=rows_variable, columns=cols_variable, values=values_variable)
    # combinations absent from df count as zero before smoothing
    smoothed = pivoted.fillna(value=0).values + smooth_value
    return pd.DataFrame(sklearn.preprocessing.normalize(smoothed, norm='l1', axis=1))


def convert_mallet_data(state_file):
    """Convert Mallet data to a structure compatible with pyLDAvis.

    Args:
        state_file (string): Mallet state file

    Returns:
        data: dict with the keys 'topic_term_dists', 'doc_topic_dists',
        'doc_lengths', 'vocab' and 'term_frequency'; the first two are
        pandas dataframes, the rest are lists.

    """
    params = extract_params(state_file)
    # the first element of the alpha list is dropped before conversion to float
    alpha = [float(x) for x in params[0][1:]]
    beta = params[1]
    df = state_to_df(state_file)
    # Ensure that NaN is a string
    df['type'] = df.type.astype(str)
    # Get document lengths from statefile
    docs = df.groupby('#doc')['type'].count().reset_index(name='doc_length')
    # Get vocab and term frequencies from statefile
    vocab = df['type'].value_counts().reset_index()
    vocab.columns = ['type', 'term_freq']
    vocab = vocab.sort_values(by='type', ascending=True)
    # topic-term matrix: token counts per (topic, type), smoothed with beta
    phi_df = df.groupby(['topic', 'type'])['type'].count().reset_index(name='token_count')
    phi_df = phi_df.sort_values(by='type', ascending=True)
    phi = pivot_and_smooth(phi_df, beta, 'topic', 'type', 'token_count')
    # document-topic matrix: token counts per (document, topic), smoothed with alpha
    theta_df = df.groupby(['#doc', 'topic'])['topic'].count().reset_index(name='topic_count')
    theta = pivot_and_smooth(theta_df, alpha, '#doc', 'topic', 'topic_count')
    data = {'topic_term_dists': phi,
            'doc_topic_dists': theta,
            'doc_lengths': list(docs['doc_length']),
            'vocab': list(vocab['type']),
            'term_frequency': list(vocab['term_freq'])
        }
    return data

# Convert the Mallet data and export the topic_scaled.csv file
# (written without an index column and without a header row)
converted_data = convert_mallet_data(output_state_file)
topic_coordinates = get_topic_coordinates(**converted_data)
topic_coordinates.to_csv(topic_scaled_file, index=False, header=False)

In [None]:
## move metadata-dfrb to browser/data, zip up and rename, delete meta.csv copy
!rm browser/data/meta.csv.zip
!cp {browser_meta_file} browser/data/
!zip -j browser/data/meta.csv.zip browser/data/meta.csv
!rm browser/data/meta.csv

In [None]:
## copy json cache into local browser for links
## (replaces any existing browser/json folder, then sets mode 755 on every copied file)

# R code:
# system(paste0("if [ -d \"browser/json\" ]; then rm -rf \"browser/json\"; fi"))
# system(paste0("mkdir -p browser/json && cp -rf caches/json browser/"))
# system(paste0("chmod"," ","755"," ","browser/json/*.json"))

if prepare_dfr:
    !if [ -d browser/json ]; then rm -rf browser/json; fi
    !mkdir -p browser/json && cp -rf caches/json browser/
    # find | xargs is used in place of the chmod on a *.json glob shown below
    !find browser/json -type f -print0 | xargs -0 chmod 755
    #!chmod 755 browser/json/*.json

In [None]:
## tweak default index.html to link to JSON, not JSTOR

# R code:
# tx  <- readLines("browser/index.html")
# tx2  <- gsub(pattern = "on JSTOR", replace = "JSON", x = tx)
# writeLines(tx2, con="browser/index.html")
if prepare_dfr:
    fpath_html = "browser/index.html"
    # read the page, swap the label text, write it back
    with open(fpath_html, 'r') as html_in:
        original_markup = html_in.read()
    with open(fpath_html, 'w') as html_out:
        html_out.write(original_markup.replace('on JSTOR', 'JSON'))

In [None]:
## Print the address of the live browser, derived from the part of the
## project directory that follows "/write/".

# R code:
# project_name <- basename(getwd())
# project_reldir <- strsplit( getwd(), "/write/" )[[1]][2]

# IRdisplay::display_html(data=paste(
    # "<h2>Live</h2>",
    # "<p>To view the browser live:</p>",
    # "  <ul>",
    # paste("    <li><a href='http://harbor.english.ucsb.edu:10001/", project_reldir, "/", dfbOutputDir, "/' target='_blank'>Browser LIVE</a></li>", sep = ""),
    #"  </ul>"))
import os

if prepare_dfr:
    project_name = os.path.basename(project_dir)
    # partition never raises: the separator comes back empty when
    # "/write/" does not occur in the project directory
    _before, write_marker, project_reldir = project_dir.partition("/write/")
    if write_marker:
        project_link = "http://harbor.english.ucsb.edu:10001/" + project_reldir + "/browser/"
        ## Can't get HTML display to work, so hack:
        print("To view the browser live: " + project_link)
    else:
        print("Cannot build a live browser link: the project directory is not under a '/write/' path: " + project_dir)

## Zip export

In [None]:
## create a zipped copy of the browser for export
## (Python port of the R cell that stood here, which could not run in this notebook's kernel)

import shutil
from IPython.display import display, HTML

zip_export = False

if zip_export:
    # NOTE(review): the R version used dfbZipFile / dfbOutputDir, which are
    # not defined in this notebook. 'browser.zip' and the 'browser' folder
    # follow the link text below and the folder built by the cells above --
    # confirm these are the intended names.
    dfb_zip_file = 'browser.zip'
    shutil.make_archive('browser', 'zip', root_dir='.', base_dir='browser')
    display(HTML(
        "<h2>Download</h2>"
        "<p>To download and view the browser through a webserver hosted on your local machine:</p>"
        "  <ol>"
        "    <li><a href='" + dfb_zip_file + "' target='new'>Download browser.zip</a></li>"
        "    <li>Unzip browser.zip</li>"
        "    <li>Open a shell/terminal, and navigate to the browser directory</li>"
        "    <li>On Linux / OSX, launch local webserver by running:<br><code>./bin/server</code></li>"
        "    <li>View from your local webserver: <a href='http://localhost:8888/' target='_blank'>http://localhost:8888/</a></li>"
        "  </ol>"))
else:
    display(HTML("<p>Zip export disabled.</p>"))

## TIME elapsed if Run All

In [None]:
## Report when the run stopped and, if a start time was recorded, how long it took.
## (Python port of the R cell that stood here.)
from datetime import datetime

stop_time = datetime.now()

try:
    # NOTE(review): no cell in this notebook assigns start_time; this assumes
    # it is a datetime recorded at the start of the run -- confirm.
    print("start: ", start_time)
except NameError:
    print("start: (start_time was not recorded)")
    print("stop:  ", stop_time)
else:
    print("stop:  ", stop_time)
    elapsed_time = stop_time - start_time
    print(elapsed_time)


----------