# Topic modeling with SFCRs

Example of topic modeling with SFCRs.


In [None]:
import numpy as np
import os
import pickle

In [None]:
#import pyLDAvis
#import pyLDAvis.gensim

## Read SFCRs

In [None]:
language = 'EN'
local_path = '../SFCR_data/'
# Pickled list of SFCR documents for the selected language.
sfcr_file = local_path + 'SFCRs_' + language + '.dat'
if os.path.isfile(sfcr_file):
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open(sfcr_file, 'rb') as handle:
        documents = pickle.load(handle)
else:
    # `documents` stays undefined in this case, so later cells will fail.
    print("Files not found.")

## Preprocess with NLTK and Gensim

In [None]:
import warnings
# Ignore UserWarning messages attributed to the gensim module. The filter is
# registered before gensim is imported.
warnings.filterwarnings(action = 'ignore', category = UserWarning, module = 'gensim')
import gensim
import nltk

In [None]:
# Split every document into sentences and pool them in one flat list,
# keeping document order and sentence order within each document.
sentences = [
    sentence
    for document in documents
    for sentence in nltk.tokenize.sent_tokenize(document)
]

In [None]:
stop_words = nltk.corpus.stopwords.words('english')


def remove_stopwords(texts):
    """Tokenize each text and drop English stop words.

    Args:
        texts: iterable of texts; each item is converted with ``str`` before
            being tokenized by ``gensim.utils.simple_preprocess``.

    Returns:
        A list with one list of tokens per input text, in input order, with
        every token found in ``stop_words`` removed.
    """
    # Membership tests on a set are constant time; testing against the list
    # would scan the whole stop-word list for every token.
    blocked = set(stop_words)
    return [
        [word for word in gensim.utils.simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]


sentences = remove_stopwords(sentences)

In [None]:
# Each entry is a list of tokens at this point, so the length test counts
# tokens that survived stop-word removal, not characters.
# NOTE(review): more than 40 tokens is a high bar for one sentence - confirm
# the threshold is meant in tokens.
min_tokens = 40
sentences = [tokens for tokens in sentences if len(tokens) > min_tokens]

In [None]:
# Summary counts; the word count is the total number of tokens over all
# remaining sentences.
print(f"Number of documents: {len(documents)}")
print(f"Number of sentences: {len(sentences)}")
print(f"Number of words: {sum(map(len, sentences))}")

## Preliminary analysis

In [None]:
# Flatten the tokenized sentences into a single list of words.
all_words = []
for tokens in sentences:
    all_words.extend(tokens)
# Frequency distribution over all words; its length is the vocabulary size.
fdist = nltk.FreqDist(all_words)
print(f"Number of unique words: {len(fdist)}")

In [None]:
# choose k and visually inspect the bottom 10 words of the top k
# k is the number of most frequent words requested from the distribution.
k = 10000
top_k_words = fdist.most_common(k)
# Bare expression: in the notebook this is displayed as the cell output
# (the last ten entries of the top-k list).
top_k_words[-10:]

In [None]:
# define a function only to keep words in the top k words
# Building the set with a comprehension also works when the frequency
# distribution is empty, where unpacking ``zip(*...)`` into two names would
# raise ValueError.
top_k_words = {word for word, _ in fdist.most_common(k)}


def keep_top_k_words(text):
    """Return the words of ``text`` that belong to ``top_k_words``.

    Args:
        text: iterable of words (one tokenized sentence).

    Returns:
        List of the retained words in their original order.
    """
    return [word for word in text if word in top_k_words]


# Slice assignment replaces the contents in place, so the same list object
# is updated, as the original index-based loop did.
sentences[:] = [keep_top_k_words(sentence) for sentence in sentences]

In [None]:
# Number of tokens per sentence, followed by basic summary statistics.
doc_lengths = list(map(len, sentences))

n_entries = len(doc_lengths)
mean_length = np.average(doc_lengths)
shortest = min(doc_lengths)
longest = max(doc_lengths)
print("length of list:", n_entries,
      "\naverage length:", mean_length,
      "\nminimum length:", shortest,
      "\nmaximum length:", longest)

## Start modelling

In [None]:
import spacy

In [None]:
# File name for an optional pickle cache of the lemmatized data; it is only
# referenced by the commented-out code at the end of this cell.
fname = "data_lemmatized"

# Initialize spacy 'en' model
# The parser and NER pipeline components are disabled; the lemmatization
# function in this cell only reads token.lemma_ and token.pos_.
# NOTE(review): 'en' is a shortcut model name from older spaCy releases;
# newer releases expect a full package name such as 'en_core_web_sm' -
# confirm against the installed spaCy version.
nlp = spacy.load('en', disable = ['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized texts, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists; each list is joined with single
            spaces and passed through the spaCy pipeline ``nlp``.
        allowed_postags: container of coarse part-of-speech tags to keep.
            The default is an immutable tuple so it cannot be shared and
            modified between calls.

    Returns:
        A list with one list of lemmas per input text, in input order,
        containing only tokens whose ``pos_`` is in ``allowed_postags``.
    """
    return [
        [token.lemma_ for token in nlp(" ".join(sent)) if token.pos_ in allowed_postags]
        for sent in texts
    ]

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(sentences, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#if not os.path.isfile(fname):
#    filehandler = open(fname, 'wb') 
#    pickle.dump(data_lemmatized, filehandler)
#else:
#    filehandler = open(fname, 'rb') 
#    data_lemmatized = pickle.load(filehandler)

In [None]:
# Texts used for the dictionary, the corpus and later the coherence score.
texts = data_lemmatized

# Dictionary assigning an integer id to every lemma.
id2word = gensim.corpora.Dictionary(texts)

# Convert every text to its bag-of-words form using the dictionary.
corpus = list(map(id2word.doc2bow, texts))

## Mallet model

In [None]:
# Directory of the local MALLET installation (Windows layout).
os.environ['MALLET_HOME'] = 'C:\\mallet\\'
# In a raw string every backslash is literal, so each path separator is
# written exactly once here; escaping them as well would double every
# separator in the resulting path.
mallet_path = r'C:\mallet\bin\mallet'

In [None]:
def compute_model_list(dictionary, corpus, texts, limit, start=2, step=3):
    """Build one LdaMallet model per candidate number of topics.

    Args:
        dictionary: id-to-word mapping passed to every model as ``id2word``.
        corpus: bag-of-words corpus passed to every model.
        texts: not used by this function; accepted so existing calls that
            pass it keep working.
        limit: exclusive upper bound on the number of topics.
        start: first number of topics to try.
        step: increment between successive numbers of topics.

    Returns:
        List of models, one for each value of
        ``range(start, limit, step)``, in that order.
    """
    model_list = []
    for num_topics in range(start, limit, step):
        print(".", end="")  # progress marker: one dot per model
        model = gensim.models.wrappers.LdaMallet(
            mallet_path,
            corpus=corpus,
            num_topics=num_topics,
            # Use the dictionary that was passed in, not the notebook-level
            # ``id2word`` variable, so the argument is actually honoured.
            id2word=dictionary,
            topic_threshold=0.0,
        )
        model_list.append(model)
    return model_list

def compute_coherence(dictionary, corpus, model_list):
    """Return the 'c_v' coherence of every model in ``model_list``.

    Args:
        dictionary: dictionary passed to each ``CoherenceModel``.
        corpus: accepted but not used by this function.
        model_list: models to score.

    Returns:
        List of coherence values in the same order as ``model_list``.

    Note:
        The tokenized texts are not a parameter; they are read from the
        notebook-level variable ``texts``, which must exist when this runs.
    """
    def _score(candidate):
        print(".", end="")  # progress marker: one dot per model
        scorer = gensim.models.CoherenceModel(
            model=candidate, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        return scorer.get_coherence()

    return [_score(candidate) for candidate in model_list]

## Calculate and save model

In [None]:
# Can take a long time to run.
# One model for each number of topics in range(2, 15, 1), i.e. 2 through 14.
model_list = compute_model_list(
    dictionary=id2word,
    corpus=corpus,
    texts=data_lemmatized,
    limit=15,
    start=2,
    step=1,
)

In [None]:
#filehandler = open(local_path + "lda_mallet_models", 'wb') 
#pickle.dump(model_list, filehandler)
#filehandler.close()

## Calculate coherence scores

In [None]:
# One coherence value per model, in the same order as model_list.
coherence_values = compute_coherence(
    dictionary=id2word,
    corpus=corpus,
    model_list=model_list,
)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Plot the coherence score against the number of topics.
# These values repeat the range used when the models were built, so that
# x has one entry per element of coherence_values.
start = 2
limit = 15
step = 1
x = range(start, limit, step)

plt.plot(x, coherence_values)
plt.xlabel("Number of topics")
plt.ylabel("Coherence score")
#plt.legend(("coherence_values"), loc = 'best')
plt.show()

In [None]:
# List every candidate number of topics with its coherence, rounded to
# four decimals.
for num_topics, score in zip(x, coherence_values):
    print("Num Topics =", num_topics, " has Coherence Value of", round(score, 4))

In [None]:
# Select optimal model
# model_list[i] was built with start + i * step topics; with start=2 and
# step=1 as used when the list was created, index 8 is the 10-topic model.
# NOTE(review): the index is hard-coded - re-check it whenever the topic
# range or the coherence results change.
optimal_model = model_list[8]
# Print up to 10 topics of the selected model.
print(optimal_model.print_topics(10))