In [32]:
#  importing modules

# %load LDA-Copy1.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import ast
import util
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
import pyLDAvis.gensim

In [21]:
# Getting the dataframe
# The CSV is loaded through the project-local `util` helper rather than
# pandas directly.
# NOTE(review): the meaning of the second positional argument (True) is not
# visible from this notebook; confirm against util.get_processed_data.
df = util.get_processed_data("./data/collections_all_science_out-temp.csv", True)

In [22]:
# Preview the first rows of the freshly loaded dataframe.
df.head()

In [23]:
# Setting up the cleaning function to remove stop words and lemmatize them.
# These three module-level objects are read by clean() on every call.
# English stop-word list from NLTK, as a set for fast membership tests.
stop = set(stopwords.words('english'))
# Every ASCII punctuation character; clean() deletes these from the text.
exclude = set(string.punctuation)
# WordNet lemmatizer instance reused for every token.
lemma = WordNetLemmatizer()
def clean(doc):
    """Normalize one document for topic modelling.

    The text is lower-cased and split on whitespace; each token is then
    dropped if it is a stop word, stripped of punctuation characters, and
    lemmatized.

    Args:
        doc: The raw document text (a str).

    Returns:
        A single string of the surviving lemmatized tokens joined by spaces.
    """
    kept = []
    for token in doc.lower().split():
        # First stop-word check runs on the raw token so that contractions
        # present in the stop list (e.g. "don't") are removed before their
        # apostrophe is stripped.
        if token in stop:
            continue
        word = ''.join(ch for ch in token if ch not in exclude)
        # Second check: a stop word that was glued to punctuation ("and,",
        # "the.") is only recognizable after stripping. Without this it
        # would leak into the corpus. Tokens that were pure punctuation
        # become empty and are dropped as well.
        if not word or word in stop:
            continue
        kept.append(lemma.lemmatize(word))
    return " ".join(kept)

In [24]:
# Removing junk words from summarization and creating the dictionary to be used in LDA
# Strip every run of non-ASCII characters from the summaries. The result is
# assigned back to the column instead of using replace(..., inplace=True) on
# the attribute-accessed Series: that form is a chained in-place update, which
# does not write through to the dataframe when pandas Copy-on-Write is active.
df['Summarisation'] = df['Summarisation'].replace({r'[^\x00-\x7F]+': ''}, regex=True)
# One token list per document: clean() returns a space-joined string, so it
# is split straight away rather than storing the intermediate string first.
df['clean'] = df['Summarisation'].apply(lambda doc: clean(doc).split())
# Token <-> integer id mapping built over the whole corpus.
dictionary = gensim.corpora.Dictionary(df['clean'])
# Bag-of-words representation of every document, as produced by doc2bow.
df['clean_matrix'] = df['clean'].apply(dictionary.doc2bow)

In [9]:
# Building LDA model
# Tunables are named so they can be changed in one place.
# NOTE(review): the model is later saved under './lda_25_sc/', which suggests
# 25 topics, while this cell trains 20. Confirm which one is intended.
NUM_TOPICS = 20
NUM_PASSES = 100
SEED = 1

lda = gensim.models.ldamodel.LdaModel
# Keep seeding NumPy's global generator as before, and additionally hand the
# seed to gensim via random_state so training no longer depends on whatever
# else has consumed the global generator in this kernel.
np.random.seed(SEED)
ldamodel = lda(
    df.clean_matrix,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=NUM_PASSES,
    # 0.0 so that document-topic queries do not filter out low-weight topics.
    minimum_probability=0.0,
    random_state=SEED)

In [10]:
# Dump every topic of the trained model (-1 asks for all of them) together
# with its formatted word weights.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_words in ldamodel.print_topics(-1):
    print(topic_template.format(topic_id, topic_words))

In [36]:
# Visualizing the LDA
# Build the pyLDAvis payload from the trained model, the bag-of-words column
# and the dictionary, then render it inline as the cell output.
vis_data = pyLDAvis.gensim.prepare(ldamodel,df.clean_matrix,dictionary)
pyLDAvis.display(vis_data)

In [25]:
# Saving the model, then reloading it so later cells work from the persisted copy
# NOTE(review): the path says "25" but the training cell uses num_topics=20.
# The path is left unchanged here; confirm which one is intended.
MODEL_PATH = './lda_25_sc/lda_25_sc.model'
# Make sure the target directory exists; on a fresh checkout the save would
# otherwise fail because the folder is missing.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
ldamodel.save(MODEL_PATH)
# load() is a classmethod, so call it on the class rather than on the
# instance that is about to be replaced.
ldamodel = gensim.models.ldamodel.LdaModel.load(MODEL_PATH)

In [1]:
# Creating a column for the LDA distribution pertaining to each LR
def _dense_topic_distribution(bow):
    """Return the topic probabilities of one document as a dense list.

    Args:
        bow: Bag-of-words vector of the document (a `clean_matrix` entry).

    Returns:
        A list with one Python float per model topic; position i holds the
        probability of topic i.
    """
    # get_document_topics returns a sparse list of (topic_id, probability)
    # pairs and may omit topics below its probability floor. Writing each
    # value at its own topic id keeps every vector the same length and
    # aligned, instead of relying on the position within the sparse list.
    dense = [0.0] * ldamodel.num_topics
    # per_word_topics is not requested: only the document-level distribution
    # is stored, so the per-word output would be computed and thrown away.
    for topic_id, prob in ldamodel.get_document_topics(bow, minimum_probability=0.0):
        # Plain floats so the list serializes to CSV as bare numbers rather
        # than NumPy scalar reprs.
        dense[topic_id] = float(prob)
    return dense


df['lda_topics'] = df['clean_matrix'].apply(_dense_topic_distribution)

In [35]:
# Inspect the topic distribution stored for one document.
# NOTE(review): [0] on a Series is a label lookup when the index is integer
# valued; this presumably assumes the index still contains the label 0.
# Consider .iloc[0] if rows were filtered upstream.
df.lda_topics[0]

In [28]:
# Preview the first rows again, now with the columns added by the cells above.
df.head()

In [29]:
# Save the enriched dataframe to CSV (the row index is not written).
# The LDA model itself is persisted separately via ldamodel.save.
df.to_csv("./data/collections_all_science_out-temp_lda.csv",index=False)

In [30]:
# (rows, columns) of the dataframe that was written out.
df.shape