## TOPIC ANALYSIS

https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
    
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0 

In [49]:
from wordcloud import WordCloud, STOPWORDS

import pandas as pd

from pprint import pprint

import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk import corpus
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy

import pyLDAvis
import pyLDAvis.gensim 

from nltk.stem import PorterStemmer  

In [8]:
def remove_stopwords(texts):
    stop_words = stopwords.words('english')
    stop_words += ['from', 'subject', 're', 'edu', 'use','user', 'com', 'co', 'con', 'be', 'else', 'http', 'would','send', 
                   'do', 'try', 'tell', 'go', 'get', 'can', 'think', 'know', 'give', 'ask', 
               'next', 'find', 're']
    return [[word for word in simple_preprocess(str(doc)) if word.strip() not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def stem_and_lemmatize(tweet):
    tweet = ' '.join(tweet)
    stem = PorterStemmer().stem(tweet)
    return WordNetLemmatizer().lemmatize(stem)

### Getting the dataset

In [9]:
cdf = pd.read_csv('cdf.csv')

In [10]:
cdf['PM'].value_counts()/len(cdf)

none             0.901269
Boris Johnson    0.053743
Theresa May      0.044988
Name: PM, dtype: float64

In [11]:
cdf.shape

(305304, 7)

In [12]:
#reducing dataset from 300k to 30k (removing tweets that do not refer to either Boris or Theresa May)
df = cdf.drop(cdf[(cdf.PM == 'none')].index)

In [13]:
df.shape

(30143, 7)

In [14]:
#dropping nan values
df.dropna(inplace=True)

In [15]:
df['PM'].value_counts()

Boris Johnson    16407
Theresa May      13735
Name: PM, dtype: int64

In [16]:
#Dropping duplicates
df.drop_duplicates(subset='TEXT', inplace=True)

### Splitting the dataset by months

In [17]:
df_jan = df[(df.MONTH_STR == 'Jan')]
df_fev = df[(df.MONTH_STR == 'Fev')]
df_march = df[(df.MONTH_STR == 'Mar')]
df_april = df[(df.MONTH_STR == 'Apr')]
df_may = df[(df.MONTH_STR == 'May')]
df_june = df[(df.MONTH_STR == 'Jun')] 
df_july = df[(df.MONTH_STR == 'Jul')]
df_aug = df[(df.MONTH_STR == 'Aug')] 
df_sep = df[(df.MONTH_STR == 'Sep')] 
df_oct = df[(df.MONTH_STR == 'Oct')] 
df_nov = df[(df.MONTH_STR == 'Nov')] 
df_dec = df[(df.MONTH_STR == 'Dec')]

In [18]:
df_march = pd.concat([df_jan,df_fev,df_march,df_april])
df_june = pd.concat([df_may,df_june,df_july, df_aug])
df_sep = pd.concat([df_sep,df_oct,df_nov, df_dec])

In [19]:
df.MONTH_STR.value_counts()

Apr    3604
Dec    3149
Oct    3052
Mar    2964
Jan    2741
Nov    2415
Sep    2115
Jul    2031
Feb    1879
Jun    1555
Aug    1479
May    1182
Name: MONTH_STR, dtype: int64

In [20]:
#transforming the series into lists for text processing
df_march = df_march['TWEET_CLEANED'].tolist()
df_july = df_july['TWEET_CLEANED'].tolist()
df_nov = df_nov['TWEET_CLEANED'].tolist()

## Preparing the text

### Creating a dictionary out of bigrams

Biagrams are 2 words frequently appearing together. This way I will be able to observe which words appear frequently before or after brexit.

#### January - April Tweets

In [21]:
bigram = gensim.models.Phrases(df_march, min_count=1, threshold=1) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [35]:
#remove Stop Words
data_words_nostops = remove_stopwords(df_march)
#creating bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_m = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

In [36]:
print(data_lemmatized_m[:1][0][:30])

['uphold', 'rule', 'crime', 'laughable', 'pmqs', 'brexitshamble']


In [37]:
# Create Dictionary from the unique bigrams 
id2word_m = corpora.Dictionary(data_lemmatized_m)
id2word_m.filter_extremes(no_below=10, no_above=0.2) #excluding tokens that ocurred in less than 10 tweets and bigrams that occurred in more than 50% of the tweets
# Rebuild corpus based on the dictionary
texts_m = data_lemmatized_m
for i in texts_m:
    for k in i:
        remove = ['from', 'subject', 're', 'edu', 'use','user', 'com', 'co', 'con', 'be', 'else', 'http', 'would','send', 
                   'do', 'try', 'tell', 'go', 'get', 'can', 'think', 'know', 'give', 'ask', 's','mean','take','name','local',
                  'next', 'find', 're', 'semi','week', 'day', 'want', 'mail', 'run', 'ree', 'other', 'many', 'day', 'year']
        if k in remove:
            i.remove(k)
# Term Document Frequency
corpus_m = [id2word_m.doc2bow(text) for text in texts_m]

#### May - August Tweets

In [38]:
bigram = gensim.models.Phrases(df_july, min_count=50, threshold=1) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [39]:
#remove Stop Words
data_words_nostops = remove_stopwords(df_july)
#creating bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_j = lemmatization(data_words_bigrams, allowed_postags=['NOUN','ADJ', 'VERB', 'ADV'])

In [40]:
print(data_lemmatized_j[:1][0][:30])

['tell', 'offer', 'democracy']


In [41]:
# Create Dictionary from the unique bigrams 
id2word_j = corpora.Dictionary(data_lemmatized_j)
id2word_j.filter_extremes(no_below=10, no_above=0.2) #excluding tokens that ocurred in less than 10 tweets and bigrams that occurred in more than 50% of the tweets
# Rebuild corpus based on the dictionary
texts_j = data_lemmatized_j
for i in texts_j:
    for k in i:
        remove = ['from', 'subject', 're', 'edu', 'use','user', 'com', 'co', 'con', 'be', 'else', 'http', 'would','send', 
                   'do', 'try', 'tell', 'go', 'get', 'can', 'think', 'know', 'give', 'ask', 's','mean','take','name','local',
                  'next', 'find', 're', 'semi','week', 'day', 'want', 'mail', 'run', 'ree', 'other', 'many', 'day', 'year']
        if k in remove:
            i.remove(k)
# Term Document Frequency
corpus_j = [id2word_j.doc2bow(text) for text in texts_j]

#### September - December Tweets

In [42]:
bigram = gensim.models.Phrases(df_nov, min_count=50, threshold=1) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

In [43]:
#remove Stop Words
data_words_nostops = remove_stopwords(df_nov)
#creating bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_n = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB', 'ADJ'])

In [44]:
print(data_lemmatized_n[:1][0][:30])

['corbyn', 'supporter', 'check', 'remainer', 'need', 'hold', 'nose', 'good', 'judgement', 'stop', 'get', 'majority', 'ensure', 'deal', 'brexit', 'good', 'site', 'compare', 'tactical', 'voting', 'sit']


In [45]:
# Create Dictionary from the unique bigrams 
id2word_n = corpora.Dictionary(data_lemmatized_n)
id2word_n.filter_extremes(no_below=10, no_above=0.2) #excluding tokens that ocurred in less than 10 tweets and bigrams that occurred in more than 50% of the tweets
# Rebuild corpus based on the dictionary
texts_n = data_lemmatized_n
for i in texts_n:
    for k in i:
        remove = ['from', 'subject', 're', 'edu', 'use','user', 'com', 'co', 'con', 'be', 'else', 'http', 'would','send', 
                   'do', 'try', 'tell', 'go', 'get', 'can', 'think', 'know', 'give', 'ask', 's','mean','take','name','local','next', 'find', 're', 'semi','week', 'day', 'want', 'mail', 'run', 'ree', 'other', 'many', 'day', 'year']
        if k in remove:
            i.remove(k)
# Term Document Frequency
corpus_n = [id2word_n.doc2bow(text) for text in texts_n]

### Using TF-IDF

A problem with this approach is that highly frequent words start to dominate in the document, but may not be representative to the model as less-frequent words. One way to fix this is to measure how unique (or how infrequent) a word is across all documents (or tweets), which is called the “inverse document frequency” or IDF. By introducing IDF, frequent words that are also frequent across all documents get penalized with less weight.

https://towardsdatascience.com/topic-modeling-of-2019-hr-tech-conference-twitter-d16cf75895b6

In [46]:
tfidf = models.TfidfModel(corpus_m)
tfidf_corpus_m = tfidf[corpus_m]

tfidf = models.TfidfModel(corpus_j)
tfidf_corpus_j = tfidf[corpus_j]

tfidf = models.TfidfModel(corpus_n)
tfidf_corpus_n = tfidf[corpus_n]

## Topic Modeling Via LDA

#### Jan - April

In [47]:
lda_model_m = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus_m,
                                           id2word=id2word_m,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

In [53]:
# Print the Keyword in the 4 topics
pprint(lda_model_m.print_topics())
doc_lda_m = lda_model_m[corpus_m]

[(0,
  '0.028*"talk" + 0.025*"delay" + 0.020*"sign" + 0.020*"happen" + 0.018*"lie" '
  '+ 0.017*"help" + 0.015*"turn" + 0.015*"promise" + 0.014*"control" + '
  '0.014*"hope"'),
 (1,
  '0.046*"vote" + 0.038*"polling" + 0.027*"remain" + 0.026*"labour" + '
  '0.021*"need" + 0.020*"face" + 0.017*"today" + 0.017*"good" + '
  '0.017*"government" + 0.015*"call"'),
 (2,
  '0.036*"claim" + 0.033*"tory" + 0.026*"could" + 0.022*"election" + '
  '0.021*"leader" + 0.016*"month" + 0.014*"corbyn" + 0.014*"news" + '
  '0.013*"stay" + 0.012*"right"'),
 (3,
  '0.026*"conservative" + 0.025*"brexit" + 0.023*"deal" + 0.019*"say" + '
  '0.019*"people" + 0.017*"leave" + 0.016*"make" + 0.015*"deliver" + '
  '0.015*"party" + 0.014*"time"')]


In [51]:
for idx, topic in lda_model_m.show_topics(formatted=False, num_words= 10):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: talk|delay|sign|happen|lie|help|turn|promise|control|hope
Topic: 1 
Words: vote|polling|remain|labour|need|face|today|good|government|call
Topic: 2 
Words: claim|tory|could|election|leader|month|corbyn|news|stay|right
Topic: 3 
Words: conservative|brexit|deal|say|people|leave|make|deliver|party|time


#### May - August

In [52]:
lda_model_j = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus_j,
                                           id2word=id2word_j,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

In [54]:
# Print the Keyword in the 10 topics
pprint(lda_model_j.print_topics())
doc_lda_j = lda_model_j[corpus_j]

[(0,
  '0.050*"deal" + 0.049*"brexit" + 0.038*"say" + 0.028*"people" + '
  '0.026*"leave" + 0.021*"time" + 0.020*"election" + 0.019*"need" + '
  '0.018*"call" + 0.017*"promise"'),
 (1,
  '0.094*"deliver" + 0.044*"stop" + 0.041*"conservative" + 0.040*"come" + '
  '0.028*"good" + 0.028*"theresa" + 0.026*"corbyn" + 0.021*"face" + '
  '0.018*"show" + 0.018*"also"'),
 (2,
  '0.037*"make" + 0.035*"tory" + 0.029*"borisjohnson" + 0.028*"back" + '
  '0.027*"even" + 0.027*"plan" + 0.026*"country" + 0.025*"remain" + '
  '0.024*"could" + 0.024*"become"'),
 (3,
  '0.051*"vote" + 0.033*"labour" + 0.033*"happen" + 0.031*"poll" + '
  '0.025*"party" + 0.024*"voter" + 0.024*"work" + 0.024*"look" + 0.023*"talk" '
  '+ 0.023*"well"')]


In [55]:
for idx, topic in lda_model_j.show_topics(formatted=False, num_words= 10):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: deal|brexit|say|people|leave|time|election|need|call|promise
Topic: 1 
Words: deliver|stop|conservative|come|good|theresa|corbyn|face|show|also
Topic: 2 
Words: make|tory|borisjohnson|back|even|plan|country|remain|could|become
Topic: 3 
Words: vote|labour|happen|poll|party|voter|work|look|talk|well


#### September - December

In [56]:
lda_model_n = gensim.models.ldamodel.LdaModel(corpus=tfidf_corpus_n,
                                           id2word=id2word_n,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

In [65]:
# Print the Keyword in the 10 topics
pprint(lda_model_n.print_topics())
doc_lda_n = lda_model_n[corpus_n]

[(0,
  '0.033*"say" + 0.033*"election" + 0.032*"conservative" + 0.027*"need" + '
  '0.025*"corbyn" + 0.024*"make" + 0.023*"deliver" + 0.022*"remain" + '
  '0.020*"campaign" + 0.020*"farage"'),
 (1,
  '0.031*"keep" + 0.027*"full" + 0.024*"fact" + 0.023*"call" + 0.023*"sign" + '
  '0.022*"help" + 0.021*"referendum" + 0.020*"today" + 0.019*"enough" + '
  '0.019*"change"'),
 (2,
  '0.055*"vote" + 0.051*"deal" + 0.034*"leave" + 0.032*"tory" + 0.026*"people" '
  '+ 0.025*"stop" + 0.022*"labour" + 0.021*"good" + 0.021*"could" + '
  '0.020*"party"'),
 (3,
  '0.046*"come" + 0.036*"support" + 0.034*"fake" + 0.033*"believe" + '
  '0.032*"plan" + 0.030*"money" + 0.028*"thing" + 0.026*"well" + 0.022*"fail" '
  '+ 0.022*"win"')]


In [59]:
for idx, topic in lda_model_n.show_topics(formatted=False, num_words= 10):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: say|election|conservative|need|corbyn|make|deliver|remain|campaign|farage
Topic: 1 
Words: keep|full|fact|call|sign|help|referendum|today|enough|change
Topic: 2 
Words: vote|deal|leave|tory|people|stop|labour|good|could|party
Topic: 3 
Words: come|support|fake|believe|plan|money|thing|well|fail|win


### Checking Coherence Score

In [62]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_n, texts=data_lemmatized_n, dictionary=id2word_n, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.42304886230384486


https://www.kdnuggets.com/2018/04/robust-word2vec-models-gensim.html

### Visualisation of the Topics

In [63]:
lda_data =  pyLDAvis.gensim.prepare(lda_model_m, tfidf_corpus_m, id2word_m, mds='mmds')
pyLDAvis.display(lda_data)