<a href="https://colab.research.google.com/github/todnewman/coe_training/blob/master/nlp_templates/template_analyze_results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Analysis Notebook
Performs various analyses of the text.
## Functions:
1. process_text(): Takes a text file name and the Knowledge Graph DataFrame and creates an instance of the GenText class. Then offers various options within that class (LDA, Named Entity, Document Summarization, Hierarchical Clustering).
2. print_kg():
3. inference_kg()
4. most_central_nodes()

In [None]:
# IS_MASTER only exists when this notebook is being run by the MASTER notebook.
# If the name is not defined we are running standalone, so default it to False.
# Only NameError is caught so that unrelated failures are not silently hidden.
try:
    IS_MASTER
except NameError:
    IS_MASTER = False

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re
import os
import pandas as pd
import numpy as np
import bs4
import requests
import glob
import networkx as nx
from networkx.convert_matrix import from_numpy_array
import matplotlib.pyplot as plt
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
%matplotlib inline

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
if not IS_MASTER:
    #
    # Set params for standalone mode.  When MASTER drives this notebook it is
    # expected to have defined all of these names already.
    #
    import sys

    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = "/content/drive/My Drive/SCR-Analytics/angie/pima/housing_reports/output/"
    OUTPUT_DIR = ROOT_PATH
    # Later cells save topic plots/tables under TOPIC_DIR; in standalone mode
    # keep them alongside the other outputs.
    TOPIC_DIR = OUTPUT_DIR
    SYMBOL = "ukraine"
    os.chdir(ROOT_PATH)

    # Make the project libraries importable; skip the insert on re-runs so
    # sys.path does not accumulate duplicates.
    LIBRARY_PATH = '/content/drive/My Drive/Libraries'
    if LIBRARY_PATH not in sys.path:
        sys.path.insert(0, LIBRARY_PATH)
    import gentext
    import gengraph
    import know_graph

    SUMMARIZE_DOC = True
    PROCESS_LDA = False
    NE_FLAG = True
    NUM_TOPICS_TO_SHOW = 5
    HIER_CLUSTERS = False
    # The main cell reads SEP_DOCS; it only influences file selection in
    # MASTER mode, but it must exist or a standalone run raises NameError.
    SEP_DOCS = False

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Begin Processing Text from saved text file outputs

* Create instance of gentext class
* Process LDA algorithm to reveal Topics.  Right now I have 8 topics defined, but a closer look may need to be taken on individual docs to see if that prior fits.
* Summarize the document by finding most central sentences (30 right now).  Save these most central sentences to a text file for later processing.
* Find named entities.  In the future one could go into the graph and figure out how each named entity contributes to the knowledge graph.
* Hierarchical Clustering: Along with LDA, this can help the analyst identify key themes and topics.
* Build knowledge graph from most central sentences.  Since we're limiting to 30 sentences, this might provide a cliff notes of the document?

In [None]:


def process_text(fnm, kg_df):
    '''
    Function: process_text()

    Description:
        Creates a GenText instance for one text file and runs whichever
        analyses are switched on by the notebook-level flags SUMMARIZE_DOC,
        NE_FLAG, HIER_CLUSTERS and PROCESS_LDA.

    Args:
        fnm: Path of the text file to analyse.  Also used to derive the
            names of the output files.
        kg_df: Knowledge Graph DataFrame.  Not used by this function; kept
            so existing callers continue to work.

    Returns:
        A 6-tuple:
        text_processing: The instance of the GenText class
        topics: as returned by GenText.process_LDA_gensim()
        opt_mod_topics: as returned by GenText.process_LDA_gensim()
        dominant_topic: as returned by GenText.process_LDA_gensim()
        corpus: as returned by GenText.process_LDA_gensim()
        top_sent: as returned by GenText.summarize_with_vectors()

        The four LDA values are None when PROCESS_LDA is off, and top_sent
        is an empty list when SUMMARIZE_DOC is off.
    '''
    print(fnm)
    label = f'Evaluation of {fnm}'
    ignore_words = ['http', 'www', 'org', 'com', 'pdf', 'http:']
    out_file = f"{fnm}_txtout.txt"
    # NOTE(review): only the first five characters of fnm are used here, so
    # for a path with a directory prefix this is a slice of the directory
    # name rather than of the file name -- confirm this is intended.
    top_sent_file = f"{fnm[0:5]}_top_sents_centrality.txt"

    # Defaults for every optional result, so the return statement is valid
    # no matter which analyses are enabled.
    topics = None
    opt_mod_topics = None
    dominant_topic = None
    corpus = None
    top_sent = []

    text_processing = gentext.GenText(
                                    fname = fnm,
                                    ignore_words=ignore_words,
                                    chunk_size = 100000,
                                    outfile=out_file,
                                    bigrams=False,
                                    label=label,
                                    verbose=False)

    if SUMMARIZE_DOC:
        #
        #  This is a nice summary method using sentence centrality.  We pass the
        #  GenText member function (summarize_with_vectors) the number of sentences we
        #  want to rank.  This is also a way to generate a list of "interesting" sentences
        #  to use later as input to a different algorithm.
        #
        nx_graph, sentences_clean, ranked_sent, top_sent = text_processing.summarize_with_vectors(500)

        # Preview the first 20 sentences only; the full list goes to the file.
        for i, s in enumerate(top_sent[0:20]):
            print(f"{i}: {s}")

        with open(top_sent_file, 'w', encoding='utf-8') as f:
            for item in top_sent:
                f.write("%s\n" % item)

    if NE_FLAG:
        #
        #  Named entities can be interesting.  Future work could build graphs of
        #  NE's and their neighbors to do something like stakeholder evaluation?
        #
        named_entities = text_processing.process_NE()
        print(f"\nNamed Entities: {named_entities}")

    if HIER_CLUSTERS:
        #
        # These are mildly interesting.  Sometimes evaluation of the hierarchical
        # clusters along with LDA topics yields unique insight.
        #
        hier_cluster_words = text_processing.hier_clustering(8,15)
        for w in hier_cluster_words:
            print (w)

    if PROCESS_LDA:
        topics, sents, opt_mod_topics, dominant_topic, corpus = text_processing.process_LDA_gensim(num_topics=8, num_words=20)

    return (text_processing, topics, opt_mod_topics, dominant_topic, corpus, top_sent)

def most_central_nodes(G, num, kg_top_sents, fnm, verbose=False):
    """Pick the most central graph nodes and the sentences they appear in.

    Args:
        G: Graph whose ``nodes(data=True)`` yields ``(node, attrs)`` pairs in
            which ``attrs['betweenness']`` holds the ranking score.
        num: Maximum number of nodes to return.  Zero or a negative value
            yields no nodes.
        kg_top_sents: DataFrame with a 'Subject' column (matched against node
            names) and a 'sentno' column (used as an index into the
            sentence-tokenized text file).
        fnm: Suffix of the text file; the file read is
            ``{OUTPUT_DIR}{SYMBOL}{fnm}.txt``.
        verbose: When True, print each node and its betweenness as it is
            considered.

    Returns:
        (cent_arr, sent_out): the selected node names in descending
        betweenness order, and the matching sentences in ascending sentence
        number order with duplicates removed.
    """
    cent_arr = []

    #
    # Build an array of "significant" central nodes.  Names of 3 characters or
    # fewer are usually oddities like numbers, and names of 25 or more are
    # usually web addresses, so both are skipped.
    #
    ranked_nodes = sorted(G.nodes(data=True), key=lambda x: x[1]['betweenness'], reverse=True)
    for a, data in ranked_nodes:
        if len(cent_arr) >= num: # Limit to desired array length
            break
        if verbose:
            print('{a} {w}'.format(a=a,  w=data['betweenness']))
        if 3 < len(a) < 25:
            cent_arr.append(a)

    text_file = f'{OUTPUT_DIR}{SYMBOL}{fnm}.txt'
    #
    # One thought on the below is to filter the top sentences by some keyword
    # so we don't always get complex sentences that don't address questions
    #
    with open(text_file, "r") as txt_vals:
        sents = nltk.sent_tokenize(txt_vals.read())

    # Gather every sentence number whose Subject is one of the central nodes.
    # Sorting makes the output order deterministic (document order) instead of
    # depending on set iteration order.
    is_central = kg_top_sents['Subject'].isin(cent_arr)
    sent_nos = sorted(set(kg_top_sents.loc[is_central, 'sentno']))
    sent_out = [sents[s] for s in sent_nos]

    return cent_arr, sent_out


def plot_top_topics(top_topics, num_topics_to_print, fnm):
    """Draw and save one bar chart per topic, for at most N topics.

    Args:
        top_topics: Iterable of topics.  Each topic ``t`` is indexed as
            ``t[0]`` (a sequence of pairs whose first item is the bar height
            and whose second item is the tick label) and ``t[1]`` (the value
            shown as the topic's coherence in the title).
        num_topics_to_print: Maximum number of topics to plot.
        fnm: Label embedded in the saved image name
            ``{TOPIC_DIR}LDA_{fnm}_Topic_{i}.png``.
    """
    for i, t in enumerate(top_topics):
        # Check the limit before plotting so that exactly
        # num_topics_to_print figures are produced, not one more.
        if i >= num_topics_to_print:
            break
        top_topics_data = t[0]
        top_topics_coherence = t[1]
        positions = range(len(top_topics_data))

        fig = plt.figure(figsize=(15,10))
        plt.bar(positions, [val[0] for val in top_topics_data], align='center')
        plt.title(f"Topic {i}, Coherence: {top_topics_coherence}", fontsize=18)
        plt.xticks(positions, [val[1] for val in top_topics_data], rotation=70, fontsize=16)
        plt.ylabel('Probability of Topic Inclusion', fontsize = 16)
        plt.xlabel(f'Words Describing Topic {i}', fontsize = 16)
        plt.tight_layout()
        plt.savefig(f'{TOPIC_DIR}LDA_{fnm}_Topic_{i}.png',dpi=300, bbox_inches = "tight")
        plt.show()
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)

def most_common_words(text, num):
    """Return the `num` most frequent non-stopword words in `text`.

    The text is lower-cased and tokenized, words are restricted to runs of
    two or more ASCII letters, English stopwords are dropped, and the result
    is a list of (word, count) pairs as produced by Counter.most_common().
    """
    # Tokenize the lower-cased text, then re-join it so the regexp tokenizer
    # can pull out purely alphabetic words.
    lowered_tokens = word_tokenize(text.lower())
    alpha_words = RegexpTokenizer(r'[A-Za-z]{2,}').tokenize(' '.join(lowered_tokens))

    # Drop English stopwords while counting.
    english_stops = set(stopwords.words('english'))
    frequencies = Counter(w for w in alpha_words if w not in english_stops)

    return frequencies.most_common(num)


## Main Function

In [None]:
#
#  Main Function below.
#
#os.chdir(OUTPUT_DIR)
#
# We want to deserialize the /content/tmp/*pkl files here instead of opening files
# Maybe this presents problems for going file by file though...  perhaps need
# to serialize by filename too?
# The sentences*txt files didn't get serialized though, so maybe this is OK
#

# SEP_DOCS selects which glob pattern MASTER mode uses below.
all_files_rollup = SEP_DOCS # Only perform LDA on the rolled up set of files, not individually

if IS_MASTER:
    if all_files_rollup:
        filenames = glob.glob(f"{OUTPUT_DIR}{SYMBOL}_*.txt")
    else:
        filenames = glob.glob(f"{OUTPUT_DIR}{SYMBOL}_all_files.txt")
else:
    # Standalone mode: match relative to the current working directory.
    filenames = glob.glob(f"{SYMBOL}_all_files*.txt")

files = list(filenames)
docs_arr = []
nodeval_arr = []
debug = False
top_sent_LDA = False # True if we want to run LDA on the most central sentence data.



#
# Simple function to Flatten an array of arrays
#
def flatten(t):
    """Return a single list holding the items of every sub-iterable in `t`, in order."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Number of topics plotted for the optional "most central sentences" LDA pass.
TOP_SENT_TOPICS_TO_SHOW = 4


def _report_lda(opt_model, corpus, dominant_topic, label, num_topics, csv_dir):
    """Plot the top LDA topics, then save and display the dominant-topic table."""
    #
    # Based off our Gensim LDA algorithm, plot the number of topics desired
    #
    top_topics = opt_model.top_topics(corpus=corpus)
    plot_top_topics(top_topics, num_topics, label)

    #
    # Save off and show the dominant topic per sentence
    #
    dominant_topic.to_csv(f"{csv_dir}{SYMBOL}_dom_topic_{label}.csv")
    print("\nDominant Topic Table")
    display(dominant_topic)


for fnm in files:
    #
    # UPDATE the BELOW for different headers
    #
    print(f'Opening file: {fnm}')
    # Strip the extension and then the directory.  Splitting the whole path on
    # '.' would cut it off at the first dot in any directory name.
    fnm_split = os.path.splitext(fnm)[0]
    actual_fnm = os.path.basename(fnm_split) # Gets rid of header info and extension
    print(f"ACTUAL_FNM: {actual_fnm}")
    print(f"FILENAME WITH PATH: {fnm}")
    #
    # Open up the Knowledge Graph Dataframe
    #
    if not IS_MASTER:
        kg_fnm = f'{SYMBOL}_kg_df{actual_fnm}.csv'
        kg_df = pd.read_csv(kg_fnm)
    else:
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        kg_df = pd.read_pickle(PROCESSED_KGDF_FILE)

    print("Knowledge Graph DF info:", kg_df.columns, len(kg_df))
    #
    #  Call function that creates the text-processing instance and runs the
    #  enabled analyses.
    #
    text_processing, topics, opt_model, dominant_topic, corpus, top_sents = process_text(fnm,
                                                                                         kg_df)
    #
    # Save off the top sentences by centrality to the output folder.  This can function
    # as a summary of the document.  The same file name is reused for every
    # input file, so it always holds the most recently processed document.
    #
    with open(f'{OUTPUT_DIR}{SYMBOL}_top_sentences.txt', "w") as txt_file:
        for line in top_sents:
            txt_file.write("".join(line) + "\n")

    # The LDA outputs only exist when the LDA step is enabled.
    if PROCESS_LDA:
        _report_lda(opt_model, corpus, dominant_topic, actual_fnm,
                    NUM_TOPICS_TO_SHOW, TOPIC_DIR)

        if top_sent_LDA:
            #
            #  Repeat the analysis on the most central sentences saved above.
            #  A separate topic count is passed in rather than overwriting
            #  NUM_TOPICS_TO_SHOW, which would change later loop iterations.
            #
            top_sent_fnm = f'{OUTPUT_DIR}{SYMBOL}_top_sentences.txt'
            text_processing, topics, opt_model, dominant_topic, corpus, top_sents = process_text(top_sent_fnm, kg_df)
            _report_lda(opt_model, corpus, dominant_topic, "Most_Central_Sentences",
                        TOP_SENT_TOPICS_TO_SHOW, OUTPUT_DIR)

    print("Most Common Words from the Top Sentences")
    common_vals = most_common_words(' '.join(top_sents), 20)
    df_cv = pd.DataFrame(common_vals, columns=['word', 'frequency'])
    display(df_cv)


NameError: ignored

In [None]:
# Debug aid: show the kernel's current working directory.
!pwd

/content/drive/.shortcut-targets-by-id/1s4-Bt5nCUDG1Gqt7O9pQtSK4IuDREsVO/Reports for Tod
