In [None]:
import pandas as pd
#read the 20** excel 
df = pd.read_excel("dataset_2017.xlsx")
df

In [None]:
list_abstract = df["Abstract"].tolist()

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
import io
import gensim
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy

import string

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import string
def remove_punctuation(text):
    
    clean_text_from_punc = text.translate(str.maketrans('', '', string.punctuation))
    
    return clean_text_from_punc

In [None]:
def remove_stopwords(text):

    clean_text_from_stop_words = []
    stop_words = set(stopwords.words('english'))
    
    words = text.split()
    for r in words:
        if not r in stop_words and len(r) >= 3:
            clean_text_from_stop_words.append(r)

    clean_text_from_stop_words = " ".join(clean_text_from_stop_words)

    return (clean_text_from_stop_words)

In [6]:
def lemmatization(text):
    lematize_text = []
    # Init the Wordnet Lemmatizer
    lemmatizer = WordNetLemmatizer()
    for word in text:
        lemmatizer_word = lemmatizer.lemmatize(word)
        lematize_text.append(lemmatizer_word)
    return lematize_text

In [7]:
import spacy

def lemmatization_spacy(text , allowed_postags = ["NOUN","VERB","ADJ","ADV","NUM","PRON"]):
    nlp = spacy.load("en_core_web_sm",disable=["parser","ner"])
    

    doc = nlp(text)
    lemmatize_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            lemmatize_text.append(token.lemma_)
    return lemmatize_text

['evacuation', 'think', 'most', 'effective', 'method', 'protect', 'life', 'tsunamis']


In [8]:
def tokenization(text):

    token_list= []
    for line in text:
       token = gensim.utils.simple_preprocess( line, deacc=True)
       token_list.append(''.join(token))
       token_list = list(filter(None, token_list))
    return token_list


In [12]:
def preprocessing(list_abstract):
    clean_texts = []
    for text in list_abstract:

        clean_text = remove_punctuation(text)

        clean_text = remove_stopwords(clean_text)

        clean_text = lemmatization_spacy(clean_text)

        clean_text = tokenization(clean_text)

        clean_texts.append(clean_text)
    
    return clean_texts


In [None]:
clean_preprocessing_text = preprocessing(list_abstract)
clean_preprocessing_text

In [None]:
from wordcloud import WordCloud

sentences = str(clean_preprocessing_text)
sentences_as_one_string = "".join(sentences)
s = remove_punctuation(sentences_as_one_string)

wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(s)
plt.figure(figsize=(20,20))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Simple Vectorization
import gensim.corpora as corpora

# Create Dictionary
dictionary = corpora.Dictionary(clean_preprocessing_text)
# Create Corpus List
corpus = []
# Term Document Frequency
for text in clean_preprocessing_text:
    new_text = dictionary.doc2bow(text)
    corpus.append(new_text)
#Display
corpus[0]



In [None]:
#TfidfModel

# Create Dictionary
id2word = corpora.Dictionary(clean_preprocessing_text)
# Create Corpus
corpus = []
tfidf   = gensim.models.TfidfModel(dictionary=id2word, normalize=True)
# Term Document Frequency
for text in clean_preprocessing_text:
    new_text = tfidf[id2word.doc2bow(text)]
    corpus.append(new_text)
# View
corpus

In [29]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)


In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
visual_LDA = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
visual_LDA

In [None]:
topics_matrix = {}
  
for idx, topic in lda_model.show_topics(formatted=False, num_words= 10):
    w1 = [w[0] for w in topic]
    topics_matrix[f"Topic {1+idx}"] =  w1
    
print(topics_matrix)

df = pd.DataFrame(data=topics_matrix,)
df.style.set_properties(**{'border': '1.3px solid green','color': 'magenta'})
df



In [None]:
# Print the Keyword in the x topics
lda_model.print_topics()

In [None]:
doc_lda = lda_model[corpus]
print(doc_lda)

In [38]:
# Compute Perplexity
print('Perplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=clean_preprocessing_text, dictionary=dictionary, coherence='c_v')
coherence_LDA = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_LDA)

Coherence Score:  0.23884053567482622


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

        model_list.append(lda_model)
        coherencemodel = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

print(model_list[1])

In [None]:
#show the most optimal model LDA
optimal_model = model_list[2]
model_topics = optimal_model.show_topics(formatted=False)
print(optimal_model.print_topics(num_words=10))