In [38]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from pathlib import Path

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
from gensim.models import TfidfModel


# spacy for lemmatization
import spacy

from nltk.corpus import stopwords
import nltk

import json

# Plotting tools
import matplotlib.pyplot as plt

# Variable initialization
mallet_path = '../mallet-2.0.8/bin/mallet'
num_topics = 20
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
nlp = spacy.load('en', disable=['parser', 'ner'])
bookPaths = Path('../data/books_json')
books = [book for book in bookPaths.iterdir() if 'book_' in book.name]

with open('../data/chapters/chapter_numbers.json') as chapter_numerated:
    chapter_numbers = json.load(chapter_numerated)



Define all kind of functions that are used during this notebook:
----------------------------------------------------------------

In [102]:
def compute_coherence_values(modelType, dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        if modelType=='nmf':
            model = Nmf(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        else:
            model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values


def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()
    sent_dominant_df = pd.DataFrame()
    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row_sorted = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row_sorted):
            if j==0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_dominant_df = sent_dominant_df.append(pd.Series([i, int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
        for j, (topic_num, prop_topic) in enumerate(row):
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            sent_topics_df = sent_topics_df.append(pd.Series([2000+i, int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
    sent_topics_df.columns = ['Chapter_No', 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    sent_dominant_df.columns = ['Chapter_No', 'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    return sent_topics_df, sent_dominant_df


def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
        
        
def get_preprocessed_text(bookpath):
    preprocessed_sents = []
    preprocessed_sents_per_chapter = {}
    with open(bookpath) as book_json:
        chapter_par_dict = json.load(book_json)
        
        # for chapter, paragraphs in book
        for chapter, pars in chapter_par_dict.items():
            preprocessed_sents_per_chapter[chapter] = []
            
            # For actual paragraph in chapter
            for par in pars:
                sents = nltk.sent_tokenize(par)
                for sentence in sents:
                    sent = re.sub(r'\s+', ' ', sentence)  # remove newline chars
                    sent = re.sub(r'\"', '', sent)  # remove single quotes
                    sent = re.sub(r"\'", '', sent)  # remove single quotes
                    sent = re.sub(r"\*", '', sent)  # remove * in text
                    # Remove words that are smaller than given threshold (like J K R O W L I N G)
                    preprocessed_sents.append(sent)
                    preprocessed_sents_per_chapter[chapter].append(sent)
                    
    return preprocessed_sents, preprocessed_sents_per_chapter


def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


def remove_shortwords(texts, wordlen_thresh=2):
    return [[word for word in doc if len(word.strip()) >= wordlen_thresh] for doc in texts]


def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]


def make_trigrams(texts, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatization(nlp, texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def preprocess_book(book):
    print(f'==> Start Book {book.name}')
    book_compl, book_chapters = get_preprocessed_text(str(book))
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(book_chapters.values(), min_count=1, threshold=65) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[book_chapters.values()],min_count=1, threshold=65)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_lemmatized = lemmatization(nlp, book_chapters.values())

    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_lemmatized)
    data_wo_shortwords = remove_shortwords(data_words_nostops)
    
    # Form Bigrams
    data_words_bigrams = make_bigrams(data_wo_shortwords, bigram_mod)
    data_words_trigrams = make_trigrams(data_words_bigrams, trigram_mod)
    # Create Dictionary
    id2word = corpora.Dictionary(data_words_trigrams)

    # Create Corpus
    texts = data_words_trigrams

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus, id2word=id2word)
    
    low_value = 0.0075
    low_value_words = []
    for bow in corpus:
        low_value_words += [id for id, value in tfidf[bow] if value < low_value]
    id2word.filter_tokens(bad_ids=low_value_words)
    corpus = [id2word.doc2bow(text) for text in texts]

    return corpus, texts, id2word
    

Compute LDA_Based csv files for topic modelling that contain topic information by book
======================================================================================

LDA Topic Modelling
-------------------

Still, seems to work better than NMF model.
--> Try to include NER tags to make Borgin_and_Burks to one word for example

In [95]:
for book in books:
    corpus, texts, id2word = preprocess_book(book)
    
    # Continue from 16 on LDA website
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
    
    # Show Topics
    df_topic_sents_keywords, df_dominant = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=texts)

    # Show
    df_topic_sents_keywords.to_csv(f'../data/topics/{book.name[:-5]}_gensim_lda.csv')
    df_dominant.to_csv(f'../data/topics/{book.name[:-5]}_gensim_lda_dom.csv')

==> Start Book book_2_coref.json
==> Start Book book_7_coref.json
==> Start Book book_3_coref.json
==> Start Book book_6_coref.json
==> Start Book book_4_coref.json
==> Start Book book_5_coref.json
==> Start Book book_1_coref.json


Coherence Computation for LDA
-----------------------------

In [106]:
for book in books:
    corpus, texts, id2word = preprocess_book(book)
    start = 6
    limit = 26
    step = 2
    model_list, coherence_values = compute_coherence_values('lda',dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)
    x = range(start, limit, step)
    for m, cv in zip(x, coherence_values):
        print(" ---> Num Topics =", m, " has Coherence Value of", round(cv, 4))

==> Start Book book_2_coref.json
 ---> Num Topics = 6  has Coherence Value of 0.236
 ---> Num Topics = 8  has Coherence Value of 0.2572
 ---> Num Topics = 10  has Coherence Value of 0.2467
 ---> Num Topics = 12  has Coherence Value of 0.3243
 ---> Num Topics = 14  has Coherence Value of 0.2947
 ---> Num Topics = 16  has Coherence Value of 0.3351
 ---> Num Topics = 18  has Coherence Value of 0.3574
 ---> Num Topics = 20  has Coherence Value of 0.3642
 ---> Num Topics = 22  has Coherence Value of 0.3711
 ---> Num Topics = 24  has Coherence Value of 0.3836
==> Start Book book_7_coref.json
 ---> Num Topics = 6  has Coherence Value of 0.2284
 ---> Num Topics = 8  has Coherence Value of 0.2603
 ---> Num Topics = 10  has Coherence Value of 0.2574
 ---> Num Topics = 12  has Coherence Value of 0.3111
 ---> Num Topics = 14  has Coherence Value of 0.3211
 ---> Num Topics = 16  has Coherence Value of 0.3319
 ---> Num Topics = 18  has Coherence Value of 0.36
 ---> Num Topics = 20  has Coherence Val

NMF_Based Topic Modelling cells:
================================

NMF Topic modelling
-------------------

In [96]:
for book in books:
    corpus, texts, id2word = preprocess_book(book)
    
    ldamallet = Nmf( corpus=corpus, num_topics=num_topics, id2word=id2word)
    
    # Show Topics
    df_topic_sents_keywords, dom_df = format_topics_sentences(ldamodel=ldamallet, corpus=corpus, texts=texts)

    # Show
    df_dominant_topic.to_csv(f'../data/topics/{book.name[:-5]}_gensim_nmf.csv')
    dom_df.to_csv(f'../data/topics/{book.name[:-5]}_gensim_nmf_dom.csv')    

==> Start Book book_2_coref.json
==> Start Book book_7_coref.json
==> Start Book book_3_coref.json
==> Start Book book_6_coref.json
==> Start Book book_4_coref.json
==> Start Book book_5_coref.json
==> Start Book book_1_coref.json


NMF coherence computation:
--------------------------

In [105]:
bookPaths = Path('../data/books_json')
books = [book for book in bookPaths.iterdir() if 'book_' in book.name]

for book in books:
    corpus, texts, id2word = preprocess_book(book)
    
    start = 6
    limit = 26
    step = 2
    model_list, coherence_values = compute_coherence_values('nmf',dictionary=id2word, corpus=corpus, texts=texts, start=start, limit=limit, step=step)
    x = range(start, limit, step)
    for m, cv in zip(x, coherence_values):
        print(" ---> Num Topics =", m, " has Coherence Value of", round(cv, 4))

==> Start Book book_2_coref.json


KeyboardInterrupt: 