# Text Analytics Exploration

## Import Data

In [None]:
report_url = "http://hansardpublic.parliament.sa.gov.au/Pages/HansardResult.aspx#/docid/HANSARD-10-25756"
report_text = "When I tried to get a copy of the commissioner's report after being tabled, why was I basically told that there was a very limited— The PRESIDENT: This is a matter of personal explanation in a supplementary. Just please, the Hon. Mr Wortley, ask your supplementary. The Hon. R.P. WORTLEY: Why weren't all members of parliament given a copy of the royal commission's report? The Hon. D.W. Ridgway: But you told us before you never read reports. The Hon. R.I. LUCAS (Treasurer) (15:26): Mr President, I won't go down that particular path, as delicious as that interjection might have been in relation to the Hon. Mr Wortley saying he couldn't trust himself to read his own reports. I don't know why the Hon. Mr Wortley was unable to get a copy of the royal commission report. It was certainly publicly available. If it pleases the member, I will see whether there is not a spare copy somewhere. If we do find a spare copy and give it to him, I will be asking questions afterwards of the honourable member just to make sure he did read it. The Hon. D.W. Ridgway: Do you want it delivered to Scuzzi or something more convenient for you? The PRESIDENT: Are you finished, the Hon. Mr Ridgway? The Hon. R.P. WORTLEY: You just worry about our trade exports, mate, for the state. The PRESIDENT: The Hon. Mr Wortley, I am waiting patiently here to give you the call for your question. Have you finished your private conversation with the Hon. Mr Ridgway? Yes? The Hon. Mr Wortley."
report_title = "Murray-Darling Basin Royal Commission"

In [None]:
# Import data from spreadsheet
from pathlib import Path

import pandas as pd
import numpy as np

# Build the path from its parts so the notebook runs on any operating system;
# the previous "..\\data\\..." literal only resolved on Windows.
data_file = Path("..") / "data" / "HANSARDfullDataset.xlsx"

data = pd.read_excel(data_file, sheet_name="text")
df = pd.DataFrame(data, columns=['hansardID', 'text'])

# Drop rows with a missing id or text BEFORE casting: astype(str) converts a
# missing cell into the literal string "nan", which would otherwise be joined
# into the documents and counted/summarised as a real word.
df = df.dropna(subset=['hansardID', 'text'])
df = df.astype({"hansardID": 'str', "text": 'str'})

#df.dtypes
df.head(2)

In [None]:
#hansardFilesInfo = pd.read_excel ("..\\data\\HANSARDfullDataset.xlsx", sheet_name="HANSARDFilesInfo")
#hansardFilesInfo = pd.DataFrame(hansardFilesInfo, columns= ['FileName','URL'])
#hansardFilesInfo = hansardFilesInfo.astype({"FileName":'str', "URL":'str'}) 
#hansardFilesInfo.head(2)

In [None]:
#header = pd.read_excel ("..\\data\\HANSARDfullDataset.xlsx", sheet_name="header")
#header = pd.DataFrame(header)
#header.head(2)

In [None]:
#bill = pd.read_excel ("..\\data\\HANSARDfullDataset.xlsx", sheet_name="bill")
#bill = pd.DataFrame(bill, columns= ['question','bname'])
#bill.head(2)

In [None]:
# Merge all text fragments that share a hansardID into one document,
# separating the fragments with ". "
def _join_fragments(fragments):
    return '. '.join(fragments)

grouped_text = df.groupby('hansardID')['text'].agg(_join_fragments)

# Same data as a one-column DataFrame so further columns can be added later
grouped_text_df = grouped_text.to_frame(name='text')
grouped_text_df.head(5)

In [None]:
grouped_text.iloc[4].replace("..",".")

## Basic Text Analytics

### Word Count

In [None]:
import re

# A "word" is any run of \w characters; count them per document
word_pattern = re.compile(r'\w+')
grouped_text_df['WordCount'] = grouped_text_df.apply(
    lambda row: len(word_pattern.findall(row.text)),
    axis=1,
)
grouped_text_df.head(2)

### Length of Sentences

In [None]:
from statistics import median

# Join every text fragment into one string. Fragments are separated with
# ". " so the end of a fragment is treated as a sentence boundary below.
all_text = ". ".join(df['text'])

# Number of words in each sentence. A sentence ends at ?, !, . or : followed
# by a space; fragments containing only whitespace are ignored.
# The earlier version wrapped all_text in ' '.join(...), which inserted a
# space between every character of the string, so the figures below were
# character counts rather than word counts.
parts = [len(l.split()) for l in re.split(r'[?!.:] ', all_text) if l.strip()]

if parts:
    # Average number of words in a sentence
    print("Average = ", sum(parts)/len(parts))

    # Median number of words in a sentence
    print("Median = ", median(parts))
    print("Min = ", min(parts))
    print("Max = ", max(parts))
    print("Q1 = ", np.percentile(parts,25))
    print("Q3 = ", np.percentile(parts,75))
else:
    # Every statistic above fails on an empty list, so report it explicitly
    print("No sentences found")

## Document Summarisation

Articles and libraries to look into further: 
* https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
* https://stackabuse.com/text-summarization-with-nltk-in-python/
* https://github.com/alanbuxton/PyTeaserPython3
* https://github.com/abisee/pointer-generator
* https://github.com/DerwenAI/pytextrank
* https://github.com/tensorflow/models/tree/master/research/textsum
* https://radimrehurek.com/gensim/models/lsimodel.html
* https://towardsdatascience.com/text-summarization-in-python-76c0a41f0dc4 (additional links to articles at the end)

### Feature-Based Text Summarisation

In [None]:
# The feature-based model extracts features from each sentence and then evaluates the sentence's importance
# Feature-based text summarization with TextTeaser
#from pyteaser import SummarizeUrl
#url = 'http://www.huffingtonpost.com/2013/11/22/twitter-forward-secrecy_n_4326599.html'
#summaries = SummarizeUrl(url)
#print summaries

In [None]:
# TextTeaser - an automatic summarization algorithm that combines the power of natural language processing and machine learning
#from textteaser import TextTeaser
#tt = TextTeaser()
#tt.summarize(title, text)

### Topic Model Summary

In [None]:
# Topic Model summarisation
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel
model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus]
#print(vectorized_corpus)
#model.print_topics(1)

### Truncated Sentences

In [None]:
def smart_truncate(content, length=200, suffix='...'):
    """Shorten ``content`` to about ``length`` characters on a word boundary.

    Args:
        content: Text to shorten.
        length: Maximum number of characters kept from ``content``; the
            suffix is added on top of this.
        suffix: Marker appended when the text was actually shortened.

    Returns:
        ``content`` unchanged when it already fits. Otherwise the part of
        ``content`` before the last space found within the first
        ``length + 1`` characters, followed by ``suffix``.
    """
    if len(content) <= length:
        return content

    # Look one character past the limit: when that character is a space, the
    # word ending exactly at the limit is complete and is kept.
    window = content[:length + 1]
    head, _, _ = window.rpartition(' ')

    if not head:
        # No usable word boundary (for example one very long token). Cut
        # at the limit instead of returning nothing but the suffix.
        head = content[:length]

    return head + suffix

smart_truncate(report_text)

In [None]:
grouped_text_df['Truncated'] = grouped_text_df.apply(lambda x: smart_truncate(x.text), axis=1)
grouped_text_df.head(5)

### Extract Keywords

In [None]:
# https://rare-technologies.com/text-summarization-with-gensim/
#
from gensim.summarization import keywords

def get_keywords(text):
    """Return up to ten keywords of ``text`` as one comma-separated string.

    The keywords come from gensim's ``keywords`` function, called with
    ``split=True``, ``lemmatize=True`` and ``deacc=True``; only the first ten
    entries of the returned list are kept.
    """
    ranked_keywords = keywords(text, split=True, lemmatize=True, deacc=True)
    top_ten = ranked_keywords[:10]
    return ', '.join(top_ten)

grouped_text_df['KeyWords'] = grouped_text_df.apply(lambda x: 
                                                    get_keywords(x.text), 
                                                    axis=1)
grouped_text_df.head(5)

In [None]:
get_keywords(report_text)

### PyTextRank

In [None]:
# https://github.com/DerwenAI/pytextrank
# https://github.com/DerwenAI/pytextrank/blob/master/example.ipynb
# Requires JSON input

### TextRank Summary

In [None]:
from gensim.summarization.summarizer import summarize


def _tidy_summary(row, **summarize_options):
    """Summarise one row's text and flatten the result onto a single line."""
    raw_summary = summarize(row.text, **summarize_options)
    return raw_summary.replace("\n"," ").replace("..",".")


#  TextRank summarization with default parameters
grouped_text_df['TextRank'] = grouped_text_df.apply(_tidy_summary, axis=1)

#  TextRank summarization with no more than 50 words for the summary
grouped_text_df['TextRank50'] = grouped_text_df.apply(_tidy_summary, axis=1, word_count=50)
grouped_text_df.head(5)

In [None]:
print(summarize(report_text)) #  TextRank summarization

In [None]:
print(summarize(report_text, word_count = 50)) #  TextRank summarization - no more than 50 words for summary

In [None]:
print(summarize(report_text, ratio = 0.2)) #  TextRank summarization - use no more than 20% of original text for summary

In [None]:
summarize(grouped_text.iloc[4], word_count = 50)

In [None]:
summarize(grouped_text.iloc[2], word_count = 50)

In [None]:
def generate_text_rank_summary(text):
    """Summarise ``text`` with gensim's ``summarize`` after light cleaning.

    Args:
        text: The document to summarise.

    Returns:
        ``text`` unchanged when it is 200 characters or shorter, or when
        fewer than two usable sentences remain after cleaning. Otherwise the
        summary of the cleaned sentences, on one line, with ".." collapsed
        to ".".
    """
    if len(text) <= 200:
        return text

    sentences = []
    for sentence in re.split(r'[?!.] ', text):
        # str.replace() matches its argument literally, so the former
        # replace("[^a-zA-Z]", " ") never changed anything. re.sub applies
        # the pattern as a regex; split/join then collapses the runs of
        # spaces left behind by the removed characters.
        processed = ' '.join(re.sub(r'[^a-zA-Z]', ' ', sentence).split())
        if len(processed.split()) > 1:  # Include sentences with more than one word
            sentences.append(processed)

    # NOTE(review): gensim 3.x summarize() raises ValueError when it is given a
    # single sentence - confirm against the installed version. Returning the
    # text keeps one short document from aborting a whole DataFrame.apply run.
    if len(sentences) < 2:
        return text

    summary = summarize('. '.join(sentences))
    return summary.replace("\n"," ").replace("..",".")


In [None]:
#  TextRank summarisation of the pre-processed text (default summary length; no word limit is applied here)
grouped_text_df['TextRankProcessed'] = grouped_text_df.apply(lambda x: generate_text_rank_summary(x.text), axis=1)
grouped_text_df.head(5)

In [None]:
generate_text_rank_summary(grouped_text.iloc[4])

In [None]:
generate_text_rank_summary(grouped_text.iloc[2])

In [None]:
generate_text_rank_summary(report_text)

In [None]:
# https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70
# Approach uses TextRank algorithm
# TextRank does not rely on any previous training data and can work with any arbitrary piece of text. 
# TextRank is a general purpose graph-based ranking algorithm for NLP

# Import all necessary libraries
import os
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import re 

# Generate clean sentences
def read_article(file_name, output=True):
    """Read a text file and return its sentences as lists of words.

    Args:
        file_name: Path of the text file to read.
        output: Passed through to ``create_clean_sentences``.

    Returns:
        Whatever ``create_clean_sentences`` returns for the sentences found
        in the file.
    """
    # The context manager closes the file even when reading fails.
    with open(file_name, "r") as file:
        # Use the whole file rather than only its first line, and treat line
        # breaks as ordinary spaces. An empty file yields an empty string
        # instead of an IndexError.
        file_data = " ".join(line.strip() for line in file)

    sentences = re.split(r'[?!.] ', file_data)
    return create_clean_sentences(sentences, output)

def read_text(text, output=True):
    """Split ``text`` into sentences and return them as lists of words.

    A sentence ends at "?", "!" or "." followed by a space. ``output`` is
    passed through to ``create_clean_sentences``.
    """
    sentence_boundary = r'[?!.] '
    raw_sentences = re.split(sentence_boundary, text)
    return create_clean_sentences(raw_sentences, output)

def create_clean_sentences(article, output=True):
    """Turn raw sentences into lists of letter-only words.

    Args:
        article: Iterable of sentence strings.
        output: When true, print every sentence with its word count.

    Returns:
        A list with one list of words for each sentence that has more than
        four words after cleaning. Shorter sentences are dropped.
    """
    sentences = []

    for sentence in article:
        # str.replace() matches its argument literally, so the former
        # replace("[^a-zA-Z]", " ") never changed anything. re.sub applies
        # the pattern as a regex and blanks out every non-letter.
        processed = re.sub(r'[^a-zA-Z]', ' ', sentence)
        # split() without an argument splits on any run of whitespace, so the
        # gaps left by removed characters do not become empty-string tokens.
        words = processed.split()
        word_count = len(words)
        if word_count > 4: # Include sentences with more than four words
            sentences.append(words)
        if output:
            print(sentence, ": words = ", word_count)

    return sentences

# Similarity matrix
def sentence_similarity(sent1, sent2, stopwords=None):
    """Return the cosine similarity of two tokenised sentences.

    Words are compared case-insensitively and words listed in ``stopwords``
    are ignored.

    Args:
        sent1: First sentence as a sequence of word strings.
        sent2: Second sentence as a sequence of word strings.
        stopwords: Optional collection of words to ignore.

    Returns:
        A float: 1.0 for identical word counts, 0.0 when the sentences share
        no counted word. Also 0.0 when either sentence has no counted words
        at all; the vector-based version divided by zero in that case and
        produced NaN.
    """
    # Local imports keep this block self-contained within the notebook cell.
    from collections import Counter
    from math import sqrt

    # A set gives constant-time membership tests.
    ignored = set(stopwords) if stopwords is not None else set()

    # Word -> occurrence count, the sparse equivalent of the count vectors.
    counts1 = Counter(w for w in (t.lower() for t in sent1) if w not in ignored)
    counts2 = Counter(w for w in (t.lower() for t in sent2) if w not in ignored)

    norm1 = sqrt(sum(c * c for c in counts1.values()))
    norm2 = sqrt(sum(c * c for c in counts2.values()))
    if norm1 == 0 or norm2 == 0:
        # A sentence with nothing left to compare is similar to nothing.
        return 0.0

    # Counter returns 0 for missing words, so unshared words add nothing.
    dot = sum(count * counts2[word] for word, count in counts1.items())
    return dot / (norm1 * norm2)
 
def build_similarity_matrix(sentences, stop_words):
    """Build the pairwise similarity matrix for ``sentences``.

    Cell ``[i][j]`` holds ``sentence_similarity(sentences[i], sentences[j],
    stop_words)``; the diagonal is left at zero.
    """
    size = len(sentences)
    matrix = np.zeros((size, size))

    for row, first in enumerate(sentences):
        for col, second in enumerate(sentences):
            # A sentence is never compared with itself
            if row != col:
                matrix[row][col] = sentence_similarity(first, second, stop_words)

    return matrix

# Generate Summary Method
def generate_summary(file_name, text, top_n=5, output=True):
    """Build an extractive summary by ranking sentences with PageRank.

    Args:
        file_name: Path of a text file to summarise, or None to use ``text``.
        text: Text to summarise when ``file_name`` is None.
        top_n: Maximum number of sentences in the summary.
        output: When true, print intermediate information.

    Returns:
        ``text`` itself when no file is given and the text is 200 characters
        or shorter; an empty string when PageRank raises NetworkXError or
        PowerIterationFailedConvergence; otherwise the highest ranked
        sentences joined with ". ".
    """
    stop_words = stopwords.words('english')

    # Step 1 - Get the tokenised sentences, from the file when one is given
    if file_name is not None:
        sentences = read_article(file_name, output)
    else:
        if len(text) <= 200:
            # Short text is returned as is instead of being summarised
            return text
        sentences = read_text(text, output)

    # Step 2 - Generate the similarity matrix across sentences
    similarity = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank the sentences using the similarity matrix as a graph
    graph = nx.from_numpy_array(similarity)
    try:
        scores = nx.pagerank(graph, max_iter=100)
    except (nx.NetworkXError, nx.PowerIterationFailedConvergence):
        return ""

    # Step 4 - Sort by score, best first, and keep the top sentences
    ranked_sentence = sorted(
        ((scores[position], words) for position, words in enumerate(sentences)),
        reverse=True,
    )
    if output:
        print("Indexes of top ranked_sentence order are ", ranked_sentence)

    # The slice stops at the end of the list when fewer than top_n sentences
    # exist; max() keeps a negative top_n selecting nothing.
    chosen = [" ".join(words) for _, words in ranked_sentence[:max(top_n, 0)]]

    # Step 5 - output the summary text
    return '. '.join(chosen).replace("..",".")


In [None]:
# let's begin
# os.path.join picks the right separator for the current operating system;
# the hard-coded "\\" only formed a valid path on Windows.
generate_summary(os.path.join(os.getcwd(), "test.txt"), None, 3, False)

In [None]:
generate_summary(None, grouped_text.iloc[4], 3, False)

In [None]:
generate_summary(None, grouped_text.iloc[2], 3, True)

In [None]:
generate_summary(None, report_text, 3, False)

In [None]:
# Iterate over all text in data frame and add summary
grouped_text_df['TextRankCustom'] = grouped_text_df.apply(lambda x: generate_summary(None, x.text, 3, False), axis=1)

In [None]:
grouped_text_df.head(5)

### Excel Output of Document Summaries

In [None]:
# A bare file name is written to the current working directory on every
# operating system; the former '.\\' prefix is only a directory reference on
# Windows and becomes part of the file name elsewhere.
grouped_text_df.to_excel('DocumentSummary.xlsx', sheet_name='TextRank', index=True)