## Text Analysis - Topic Modelling
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [2]:
# Setup
# Notebook bootstrap: enables module autoreload, puts a local checkout of
# text_analytic_tools first on sys.path, imports third-party and project
# modules, silences two warning categories, and configures logging plus
# pandas / matplotlib / bokeh output for the cells that follow.
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile

# Prepend the local source checkout so `import text_analytic_tools` resolves to
# it before any other entry on the path. This must run before the project imports below.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
sys.path = [ '/home/roger/source/text_analytic_tools' ] + sys.path

# NOTE(review): `typing.re` is a deprecated pseudo-submodule that was removed in
# Python 3.13 — confirm this import is still needed.
import re, typing.re
import warnings
import nltk, textacy, spacy 
import pickle
import pandas as pd
# NOTE(review): the `widgets` alias bound here is rebound further down by
# `import text_analytic_tools.utility.widgets as widgets`, so after this cell
# `widgets` refers to the project module, not to ipywidgets.
import ipywidgets as widgets
import bokeh, bokeh.plotting, bokeh.models, matplotlib.pyplot as plt

import text_analytic_tools

import text_analytic_tools.utility.utils as utility
import text_analytic_tools.utility.widgets as widgets
import text_analytic_tools.common.text_corpus as text_corpus
import text_analytic_tools.common.textacy_utility as textacy_utility
import text_analytic_tools.text_analysis.topic_model as topic_model
import text_analytic_tools.text_analysis.derived_data_compiler as derived_data_compiler

from beakerx.object import beakerx
# NOTE(review): wildcard import — which names it exports is not visible here,
# and it may shadow names imported above.
from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Suppress every DeprecationWarning and UserWarning for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 
# Logger obtained from the project helper; the except-handlers in later cells call `logger.error`.
logger = utility.getLogger('corpus_text_analysis')

# Project helper that receives the pandas module; presumably applies default
# display options — see text_analytic_tools.utility.utils to confirm.
utility.setup_default_pd_display(pd)

from text_analytic_tools.config import get_current_domain

# Object returned by get_current_domain(); later cells read DATA_FOLDER and
# SUBSTITUTION_FILENAME from it and call compile_documents() / get_tagset() on it.
domain_logic = get_current_domain()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# set_matplotlib_formats('svg')
# Direct Bokeh plot output to the notebook.
bokeh.plotting.output_notebook()

# Zero-argument accessors used by later cells. Each one looks its target up at
# call time, so it always reflects whatever the underlying container holds at
# the moment of the call.

def current_corpus_container():
    """Return the result of `textacy_utility.CorpusContainer.container()`."""
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return the result of `textacy_utility.CorpusContainer.corpus()`."""
    return textacy_utility.CorpusContainer.corpus()


def current_state():
    """Return the result of `compiled_data.TopicModelContainer.singleton()`.

    NOTE(review): no import visible in this notebook explicitly binds the name
    `compiled_data`; unless it is defined elsewhere (e.g. by a wildcard import),
    calling this raises NameError — confirm the intended module.
    """
    return compiled_data.TopicModelContainer.singleton()


def current_data():
    """Return the `data` attribute of the object returned by `current_state()`."""
    return current_state().data


def current_topic_model():
    """Return the `topic_model` attribute of the object returned by `current_state()`."""
    return current_state().topic_model


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
%%bash
#mkdir ./tmp
#ln -s /home/roger/source/STTM ./lib

In [None]:
### Convert corpus to sparse matrix

In [None]:

# Fit a Latent Dirichlet Allocation model on a sparse matrix built from `df_corpus`.
#
# NOTE(review): `df_corpus` is not defined in any cell visible above this one;
# it must already exist in the kernel. It is read through the attributes
# `tf`, `document_id` and `token_id` (presumably DataFrame columns — confirm).

# Import the submodule explicitly so `scipy.sparse` does not depend on another
# package having loaded it; this also binds the top-level `scipy` name.
import scipy.sparse
from sklearn.decomposition import LatentDirichletAllocation

# Fixed seed: LDA initialisation is random, so without it every run of this
# cell produces a different model.
RANDOM_SEED = 42
n_topics = 50

# COO triplets: value `tf` placed at (row=document_id, col=token_id).
v_corpus = scipy.sparse.coo_matrix((df_corpus.tf, (df_corpus.document_id, df_corpus.token_id)))

lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='batch',
    random_state=RANDOM_SEED,
)
lda.fit(v_corpus)


## <span style='color: green;'>MODEL</span> Compute Topic Model Based on Raw Source Text Corpus<span style='color: red; float: right'>ALTERNATIVE #1</span>

#### <span style='color: green'>PREPARE</span> Load (or Create) The Corpus <span style='float: right; color: red'>OPTIONAL</span>
Set up a new corpus from the raw source text files that reside in a zip archive. This step uses the spaCy and textacy frameworks for PoS tagging, which can take several minutes for large text files. If the same processing and filtering rules are applied repeatedly, it is recommended to prepare the corpus once and for all using "1_extract_corpus_text" (also see next step).


In [17]:
import text_analytic_tools.notebooks_gui.load_corpus_gui as load_corpus_gui

# Display the corpus loading GUI, passing it the current corpus container and
# the domain's data folder. No document index is supplied (document_index=None).
try:
    container = current_corpus_container()
    load_corpus_gui.display_corpus_load_gui(domain_logic.DATA_FOLDER, document_index=None, container=container)
except Exception as ex:
    # Log before re-raising: with the bare `raise` placed first, the logging
    # call could never execute. The exception still propagates so the failure
    # remains visible in the cell output.
    logger.error(ex)
    raise

IndexError: list index out of range

#### <span style='color: green;'>MODEL</span> Compute the Topic Model<span style='color: red; float: right'>OPTIONAL</span>


In [None]:
import topic_model_gui

# Build the topic-model GUI from the current state, a document index compiled
# from the current corpus, and the domain's tagset / substitution file, then
# display it for the current corpus.
try:
    gui = topic_model_gui.TextacyCorpusUserInterface(
        data_folder=domain_logic.DATA_FOLDER,
        state=current_state(),
        document_index=domain_logic.compile_documents(current_corpus()),
        tagset=domain_logic.get_tagset(),
        substitution_filename=domain_logic.SUBSTITUTION_FILENAME
    )
    gui.display(current_corpus())

except Exception as ex:
    # Log before re-raising: with the bare `raise` placed first, the logging
    # call could never execute. The exception still propagates so the failure
    # remains visible in the cell output.
    logger.error(ex)
    raise