In [1]:
# Data handling and plotting.
import pandas as pd
from wordcloud import WordCloud as cloud
import matplotlib.pyplot as plt
import string
# gensim supplies the tokenizer used by sent_to_words / remove_stopwords.
import gensim
from gensim.utils import simple_preprocess
# NOTE(review): binds `stopwords` to gensim's STOPWORDS, but the later
# `from nltk.corpus import stopwords` rebinds the same name, so this set is never used.
from gensim.parsing.preprocessing import STOPWORDS as stopwords
import nltk
# Make sure the NLTK corpora are present locally (reports "already up-to-date" when cached).
nltk.download("wordnet")
nltk.download("stopwords")
from nltk.stem import WordNetLemmatizer as lemm, SnowballStemmer as stemm
# NOTE(review): wildcard import; the names it brings in are not referenced by the visible code.
from nltk.stem.porter import *
# From here on `stopwords` is the NLTK corpus reader (it replaces the gensim binding above).
from nltk.corpus import stopwords
import en_core_web_lg
# Load the large English spaCy pipeline once; `clean` calls it for POS tagging.
nlp =  en_core_web_lg.load()

# Stop-word list consulted by remove_stopwords: NLTK's English list, followed by
# corpus-specific financial-news filler, followed by calendar/time words.
# Duplicates between the groups are harmless because the list is only used for
# membership tests.
stop_words = (
    stopwords.words('english')
    + [
        "reuters", "cnbc", "year", "last", "banks", "first", "one", "two", "billion",
        "due", "rose", "next", "global", "end", "investors", "lower", "risk", "back",
        "week", "however", "policy", "editing", "figures", "currencies", "coronavirus",
        "analysts", "interest",
        "level", "currency", "economy", "long", "term", "likely", "reporting",
        "resistance", "central", "bank", "tuesday", "friday", "march", "april", "june",
        "july", "august", "september", "october", "november", "december", "january",
        "february", "barely", "enough",
        "expected", "ahead", "data", "writing", "around", "today", "index", "reported",
        "price", "prices", "inflation", "market", "markets", "month", "could", "rate",
        "rates", "time",
        "info", "said", "would", "may", "since", "also", "support", "new", "higher",
        "day", "high", "low", "trade", "trading", "wednesday", "thursday", "monday",
        "economic", "calendar",
    ]
    + [
        "january", "february", "march", "april", "may", "june", "july", "august",
        "september", "october", "november", "december", "monday", "tuesday",
        "wednesday", "thursday", "friday", "saturday", "sunday", "will", "day",
        "today", "week", "weeks", "yesterday", "tomorrow",
    ]
)

import numpy as np
# Seed NumPy's global RNG so later stochastic steps start from a fixed state.
np.random.seed(0)
from gensim import corpora, models
from gensim.models import CoherenceModel
from pprint import pprint
import pyLDAvis as pyldavis


from pymongo import MongoClient

# Connect to the MongoDB server on localhost; the URI path is `crawler.contents`.
connection=MongoClient("mongodb://localhost:27017/crawler.contents")

# No name is passed, so this returns the default database named in the URI
# (presumably `crawler` — confirm against pymongo's URI parsing).
db=connection.get_database()

[nltk_data] Downloading package wordnet to /home/visnja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/visnja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is coerced to ``str`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (accent marks are stripped from
    the tokens). One token list is yielded per input item, in input order.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

def remove_stopwords(texts):
    """Tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: iterable of documents. Each one is stringified and re-tokenized
            with ``simple_preprocess``, so both raw strings and token lists
            are accepted.

    Returns:
        A list with one list of kept tokens per input document, in order.
    """
    # Build the lookup set once per call: membership in a set is O(1), whereas
    # the module-level ``stop_words`` list would be scanned for every token.
    blocked = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in blocked]
        for doc in texts
    ]

In [None]:

# Load every document of the `contents` collection into a frame, discard the
# columns the analysis below does not use, and keep only rows with no missing
# values. Written as one chain so re-running the cell always starts from the
# database rather than from a half-transformed `articles`.
articles = (
    pd.DataFrame(list(db.contents.find()))
    .drop(columns=['visited','created_at','contentType','date','icon','_id'])
    .dropna(how='any', axis=0)
)

import re

def clean(txt):
    """Normalize one piece of article text for topic modelling.

    Steps, in order:
      1. Strip ``, . ! ?`` so that e.g. "U.S. dollar" becomes "US dollar".
      2. Remove Reuters boilerplate ("N min read", "by reuters staff") and
         collapse currency names to their codes (aud, usd, gbp, cad, eur).
      3. Run spaCy and keep only tokens whose part-of-speech tag is not in
         ``excluded_tags`` (currency codes from step 2 are always kept).
      4. Lowercase, then drop remaining non-word characters and digits.

    The phrase substitutions run before the POS filter on purpose: the filter
    would otherwise remove words such as "Australian", "by" or the digits in
    "3 min read" and the patterns could no longer match.

    Args:
        txt: raw text (``str``).

    Returns:
        The cleaned, lower-cased text with kept tokens separated by spaces.
    """
    excluded_tags = {"VERB", "ADV","ADJ","NUM","PRON","ADP","AUX","CCONJ","DET","INTJ","PART"}
    # Codes inserted by the substitutions below; kept whatever tag spaCy gives them.
    currency_codes = {"aud", "usd", "gbp", "cad", "eur"}

    res = re.sub(r'[,\.!?]', '', txt)
    # All patterns are case-insensitive, so they can run on the cased text,
    # which lets spaCy tag the original casing afterwards.
    res = re.sub(r'\d* min read', '', res, flags=re.IGNORECASE)
    res = re.sub(r'by reuters staff', '', res, flags=re.IGNORECASE)
    res = re.sub(r'australian dollar', 'aud', res, flags=re.IGNORECASE)
    res = re.sub(r'us dollar', 'usd', res, flags=re.IGNORECASE)
    res = re.sub(r'british pound', 'gbp', res, flags=re.IGNORECASE)
    res = re.sub(r'canadian dollar', 'cad', res, flags=re.IGNORECASE)
    res = re.sub(r'euro ', 'eur ', res, flags=re.IGNORECASE)

    # POS filter. Its result must feed the remaining steps; restarting from the
    # raw input here would silently discard the filter.
    kept_tokens = [
        token.text
        for token in nlp(res)
        if token.lower_ in currency_codes or token.pos_ not in excluded_tags
    ]
    res = " ".join(kept_tokens).lower()

    res = re.sub(r'[^\w\s]', '', res)
    res = re.sub(r'\d+', '', res)
    return res

# Clean every paragraph of a list-valued body. The type check is made once per
# row: a body that is not a list yields an empty paragraph list (and therefore
# an empty joined string) instead of a list of lists that " ".join cannot handle.
articles['text_processed'] = articles['body'].map(
    lambda paragraphs: [clean(p) for p in paragraphs] if isinstance(paragraphs, list) else []
)

# One string per article.
articles['text_processed_join'] = articles['text_processed'].map(" ".join)
# Positional access: after dropna the index label 0 is not guaranteed to exist.
print(articles['text_processed_join'].iloc[0])

# Whole corpus as a single comma-separated string (input for the word cloud).
long_string = ','.join(articles['text_processed_join'])

In [None]:
# Appearance settings for the word cloud.
cloud_options = dict(
    background_color="white",
    max_words=1000,
    contour_width=3,
    contour_color='steelblue',
)
wordcloud = cloud(**cloud_options)

# Build the cloud from the combined corpus text.
wordcloud.generate(long_string)

# Last expression of the cell: the rendered image is shown as its output.
wordcloud.to_image()

In [None]:
# One cleaned string per article.
data = articles['text_processed_join'].values.tolist()

# Tokenize every article, then strip stop words from the token lists.
data_words = remove_stopwords(list(sent_to_words(data)))

# Peek at the first 30 tokens of the first article.
print(data_words[0][:30])

In [None]:
import gensim.corpora as corpora

# Token <-> integer id mapping built from all tokenized articles.
id2word = corpora.Dictionary(data_words)

# The tokenized articles are the corpus texts.
texts = data_words

# Bag-of-words representation: one list of (token_id, count) pairs per article.
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first 30 (token_id, count) pairs of the first article.
print(corpus[0][:30])

In [None]:
from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model.
# random_state is passed explicitly so that re-running this cell does not
# depend on how much of NumPy's global RNG earlier cells consumed.
# NOTE(review): multi-worker training may still introduce small run-to-run
# differences — confirm if exact reproducibility is required.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=0)

# Print the top keywords of each topic
pprint(lda_model.print_topics())
# Lazily-evaluated topic distribution for every document in the corpus.
doc_lda = lda_model[corpus]

In [None]:
import pyLDAvis.gensim_models
import pickle 
import pyLDAvis
import os

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Where the prepared visualisation data is stored; the name carries the topic count.
# (os.path.join with a single argument returns that path unchanged.)
LDAvis_data_filepath = os.path.join(f'./results/01ldavis_prepared_{num_topics}')

# Compute the visualisation data from the trained model, corpus and dictionary.
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)


In [None]:
    # Persist the prepared visualisation data. The target directory is created
    # first: open(..., 'wb') does not create missing parent directories and
    # would raise FileNotFoundError on a fresh checkout.
    os.makedirs(os.path.dirname(LDAvis_data_filepath) or '.', exist_ok=True)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)


In [None]:
 # Export the visualisation as a standalone HTML page. The output directory is
 # created first so this cell also works when ./results does not exist yet.
 LDAvis_html_filepath = './results/01ldavis_prepared_'+ str(num_topics) +'.html'
 os.makedirs(os.path.dirname(LDAvis_html_filepath), exist_ok=True)
 pyLDAvis.save_html(LDAvis_prepared, LDAvis_html_filepath)