In [2]:
#Imports

#re
import re

#Import Pandas and Numpy
import numpy as np
import pandas as pd

#Web crawler
from crawler import crawler

#nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer 

#Gensim
import gensim
import gensim.corpora as corpora
import gensim.models.ldamodel as lda
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt


In [9]:
#################
#DATA RETRIEVAL

#Get articles in

#Create crawler object
web_crawl = crawler()

# Define Google Search query and number of searches
google_query = ['site:longreads.com america -tag']
num_search = 1

#Call the crawler
#Store the df that is returned in a pandas dataframe
data = web_crawl.write_crawl_results(google_query, num_search)

#1st article    
text = data["basic"][0]

#Col 1 - basic - Cleaned, but otherwise untouched body of text
#Col 2 - label - Query for the search query


In [47]:
#################
#PRE-PROCESSING - Article

def pre_processing(article):
    #Force everything to lower case
    article_lower = article.lower()
    
    ##Remove non-words
    article_clean = re.sub(r"[^\w\s\d]+", " ", article_lower)
    
    #Tokenise the article
    article_tokenise = word_tokenize(article_clean)
    
    ##Remove stopwords
    #Create a stopwords object
    stop_words = set(stopwords.words("english"))
    print(stop_words)
    
    #Extending the stopwords list with "u" - somehow this even showed up in the LDA
    #I think it comes from U.S. being separated to "u" and "s"
    #Longreads and wordpress are the names of the website and blog that put out the article and
    #add no meaning
    stop_words.add('world') #Why didn't this get rid of the u?
    
    #Keep only words not in the stopwords list
    article_sans_stopwords = [each_word for each_word in article_tokenise if each_word not in stop_words]
    
    ##Lemmatise
    #Create lemmatise object
    lemmatiser = WordNetLemmatizer()
    
    #Lemmatise~~ - should try to deal with this keeping only nouns, adjectives, verbs, and adverbs?
    article_lemmatised = [lemmatiser.lemmatize(word) for word in article_sans_stopwords]
    
    #Return lemmatised article
    #return article_lower, article_clean, article_tokenise, article_sans_stopwords, article_lemmatised
    return article_lemmatised

{"didn't", 'here', 'myself', "won't", 'very', 'has', "needn't", 'why', 'be', 's', 'll', "hasn't", 'above', 'off', 'me', 'yours', 'am', "don't", 'the', 'you', "couldn't", "wasn't", 'at', 'those', 'mightn', 'when', 'and', 're', 've', 'weren', 'needn', 'once', 'hasn', 'her', 'mustn', 'of', 'between', 'below', 'not', 'just', 'other', 'where', 'themselves', 'them', 'all', "you'll", "shan't", "doesn't", 'doesn', 'he', 'ain', 'further', 'do', "wouldn't", 'after', 'don', 'shan', 'but', 'yourself', 'having', 'isn', 'to', 'have', 'down', "you've", 'they', 'hadn', 'my', 'ours', 'this', 'during', 'what', 'in', 'out', 'hers', 'up', 'same', "hadn't", "aren't", 'ourselves', 'didn', 'i', 'been', 'through', 'so', 'their', 'is', "shouldn't", 'on', 'theirs', 'couldn', 'that', 'should', 'any', 'himself', 'now', 'she', 'm', 'only', 'yourselves', 'whom', 'haven', 'most', 'shouldn', 'herself', 'doing', 'under', 'how', 'd', 'few', 't', 'ma', 'your', 'did', 'can', "mightn't", "mustn't", 'because', "should've",

['x']

In [48]:
#################
#PRE-PROCESSING - Sentences
def sentence_processing(article):
    #Sentence tokenisation
    sentences = sent_tokenize(article)
    
    #Get clean, lemmatised sentences without stopwords
    sentences_lemmatised = [pre_processing(sentence) for sentence in sentences]
    
    #Return lemmatised sentence list
    return sentences_lemmatised

In [49]:
#################
#COUNTING - Sentences

def sent_tfidf(article_lemmatised_sentences):
    #Import sklearn
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    # Create the TFIDF Object
    tfidf_vectoriser = TfidfVectorizer()
    
    #Join sentences for the vectoriser
    sentence_list = [" ".join(sentence) for sentence in article_lemmatised_sentences]
    
    # Fit sentences to the TFIDF vectoriser
    senteces_TFIDFed = tfidf_vectoriser.fit_transform(sentence_list)
    
    # Put the TFIDF vectorised data into a pandas dataframe
    sent_df = pd.DataFrame(senteces_TFIDFed.toarray())
    
    # Rename the columns to the words in the sentences
    sent_df.columns = tfidf_vectoriser.get_feature_names()
    
    return sent_df

In [13]:
#TFIDF our sentences to see which has the important words
#tfidf_article_by_sent = sent_tfidf(sent_lemmatised)

In [50]:
#Lemmatise sentences
sent_lemmatised = sentence_processing(text)

{"didn't", 'here', 'myself', "won't", 'very', 'has', "needn't", 'why', 'be', 's', 'll', "hasn't", 'above', 'off', 'me', 'yours', 'am', "don't", 'the', 'you', "couldn't", "wasn't", 'at', 'those', 'mightn', 'when', 'and', 're', 've', 'weren', 'needn', 'once', 'hasn', 'her', 'mustn', 'of', 'between', 'below', 'not', 'just', 'other', 'where', 'themselves', 'them', 'all', "you'll", "shan't", "doesn't", 'doesn', 'he', 'ain', 'further', 'do', "wouldn't", 'after', 'don', 'shan', 'but', 'yourself', 'having', 'isn', 'to', 'have', 'down', "you've", 'they', 'hadn', 'my', 'ours', 'this', 'during', 'what', 'in', 'out', 'hers', 'up', 'same', "hadn't", "aren't", 'ourselves', 'didn', 'i', 'been', 'through', 'so', 'their', 'is', "shouldn't", 'on', 'theirs', 'couldn', 'that', 'should', 'any', 'himself', 'now', 'she', 'm', 'only', 'yourselves', 'whom', 'haven', 'most', 'shouldn', 'herself', 'doing', 'under', 'how', 'd', 'few', 't', 'ma', 'your', 'did', 'can', "mightn't", "mustn't", 'because', "should've",

In [51]:
#################
#LDA

#Create LDA dictionary and corpus
dictionary = corpora.Dictionary(sent_lemmatised)
corpus = [dictionary.doc2bow(text) for text in sent_lemmatised]
#Btw the lists are still sentences, and the tuples are word id, and the count (within the sentence)


#Create model
lda_model = lda.LdaModel(corpus,
                         id2word = dictionary,
                         num_topics = 6
                         )
topics = lda_model.print_topics(num_words = 10)

for topics in topics:
    print(topics)


(0, '0.031*"fay" + 0.027*"vincent" + 0.016*"wright" + 0.011*"though" + 0.010*"find" + 0.008*"america" + 0.007*"life" + 0.006*"hard" + 0.006*"mojave" + 0.006*"protest"')
(1, '0.031*"vincent" + 0.020*"fay" + 0.012*"like" + 0.011*"wright" + 0.009*"one" + 0.007*"never" + 0.006*"house" + 0.006*"become" + 0.006*"shelter" + 0.005*"come"')
(2, '0.015*"fay" + 0.009*"space" + 0.008*"love" + 0.008*"get" + 0.008*"nasa" + 0.008*"like" + 0.007*"start" + 0.006*"vincent" + 0.005*"find" + 0.005*"reader"')
(3, '0.019*"wright" + 0.017*"fay" + 0.014*"alcott" + 0.010*"one" + 0.009*"first" + 0.009*"vincent" + 0.008*"novel" + 0.008*"time" + 0.007*"never" + 0.007*"father"')
(4, '0.012*"fay" + 0.011*"wright" + 0.009*"vincent" + 0.008*"moon" + 0.006*"mother" + 0.006*"right" + 0.006*"mourning" + 0.006*"two" + 0.006*"across" + 0.006*"ever"')
(5, '0.018*"alcott" + 0.012*"change" + 0.012*"berrigan" + 0.008*"america" + 0.007*"life" + 0.007*"fay" + 0.007*"like" + 0.007*"way" + 0.006*"love" + 0.006*"question"')


In [52]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary, sort_topics = False)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [54]:
# Topic coherence
# Read the explanations from https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
# and https://rare-technologies.com/what-is-topic-coherence/
#But don't completely get it. What's important though, is that topic coherence gives us a quantitative measure
# of topic extraction quality from our LDA model.


#THIS DOESN'T WORK FOR SOME REASON ):
coherence_model_lda = CoherenceModel(model = lda_model,
                                     texts = sent_lemmatised,
                                     dictionary = dictionary,
                                     coherence='c_v'
                                     )
coherence_lda = coherence_model_lda.get_coherence()

In [55]:
print(coherence_lda)

0.46806453717447005
