## TOPIC ANALYSIS

https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
    
https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0 

In [1]:
from wordcloud import WordCloud, STOPWORDS

import pandas as pd

from pprint import pprint

import gensim
from gensim import models
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.corpus import stopwords
from nltk import corpus
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import spacy

import pyLDAvis
import pyLDAvis.gensim 

from nltk.stem import PorterStemmer  

In [2]:
def remove_stopwords(texts):
    stop_words = stopwords.words('english')
    stop_words += ['from', 'subject', 're', 'edu', 'use','user', 'com', 'co', 'con', 'be', 'else', 'http', 'would','send', 
                   'do', 'try', 'tell', 'go', 'get', 'can', 'think', 'know', 'give', 'ask', 
               'next', 'find', 're']
    return [[word for word in simple_preprocess(str(doc)) if word.strip() not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

def stem_and_lemmatize(tweet):
    tweet = ' '.join(tweet)
    stem = PorterStemmer().stem(tweet)
    return WordNetLemmatizer().lemmatize(stem)

### Getting the dataset

In [3]:
cdf = pd.read_csv('cdf.csv')

In [4]:
cdf['PM'].value_counts()/len(cdf)

none             0.901269
Boris Johnson    0.053743
Theresa May      0.044988
Name: PM, dtype: float64

In [5]:
cdf.shape

(305304, 7)

In [6]:
#reducing dataset from 300k to 30k (removing tweets that do not refer to either Boris or Theresa May)
df = cdf.drop(cdf[(cdf.PM == 'none')].index)

In [7]:
df.shape

(30143, 7)

In [8]:
#dropping nan values
df.dropna(inplace=True)

In [9]:
df['PM'].value_counts()

Boris Johnson    16407
Theresa May      13735
Name: PM, dtype: int64

In [10]:
#Dropping duplicates
df.drop_duplicates(subset='TEXT', inplace=True)

In [11]:
df

Unnamed: 0,MONTH_INT,MONTH_STR,USERNAME,TEXT,TWEET_PROCESSED,TWEET_CLEANED,PM
7,7,Jul,Macetrain,Nicola Sturgeon tells Boris Johnson: Brexit of...,"['nicola', 'sturgeon', 'tells', 'boris', 'john...",nicola sturgeon tells boris johnson brexit off...,Boris Johnson
9,7,Jul,DerekJa09788684,If we ever get brexit.and Boris is looking mor...,"['ever', 'brexit', 'boris', 'looking', 'dodgy'...",ever brexit boris looking dodgy continental qu...,Boris Johnson
12,7,Jul,johnleremainer,Media have finally realised that Brexit will b...,"['media', 'finally', 'realised', 'brexit', 'di...",media finally realised brexit disaster johnson...,Boris Johnson
22,7,Jul,Endoxa66,@BorisJohnson this better be a joke or else yo...,"['borisjohnson', 'better', 'joke', 'else', 'to...",borisjohnson better joke else tories toast bre...,Boris Johnson
24,7,Jul,KamalJoshi108,"""Turbo-charge"" Brexit plans, with ""all necessa...","['turbo', 'charge', 'brexit', 'plans', 'necess...",turbo charge brexit plans necessary funding pr...,Boris Johnson
...,...,...,...,...,...,...,...
305173,2,Feb,guardian,The Guardian view on Boris Johnson in court: B...,"['guardian', 'view', 'boris', 'johnson', 'cour...",guardian view boris johnson court brexit editori,Boris Johnson
305174,2,Feb,Col_Bogey,"Johnson is Trump's poodle -- a weak, unpatriot...","['johnson', 'trump', 'poodle', 'weak', 'unpatr...",johnson trump poodle weak unpatriotic irritati...,Boris Johnson
305179,2,Feb,PoetintheWoods,Poet in the Woods: The Voice of Many? https://...,"['poet', 'woods', 'voice', 'many', 'brexit', '...",poet woods voice many brexit boris muddling co...,Boris Johnson
305213,2,Feb,cpseed,"I don't need to learn anything about Brexit, I...","['need', 'learn', 'anything', 'brexit', 'fully...",need learn anything brexit fully understand tr...,Boris Johnson


### Splitting the dataset by months

In [None]:
# groupby your key and freq
g = df.groupby(pd.Grouper(key='MONTH_INT'))
# groups to a list of dataframes with list comprehension
dfss = [group for _,group in g]

In [None]:
months = df.MONTH_INT.unique()
monthsdict = {elem : pd.DataFrame() for elem in months}
for key in monthsdict.keys():
    monthsdict[key] = df[:][df.MONTH_INT == key]

In [None]:
df_names = ['df_jan', 'df_feb', 'df_march', 'df_april', 'df_may', 'df_june', 'df_july', 'df_aug', 'df_sep',
           'df_oct', 'df_nov', 'df_dec']        #a list of all the dataframes you want to create
for name in df:
    monthsdict[name] = df(name)

In [12]:
df_jan = df[(df.MONTH_STR == 'Jan')]
df_fev = df[(df.MONTH_STR == 'Fev')]
df_march = df[(df.MONTH_STR == 'Mar')]
df_april = df[(df.MONTH_STR == 'Apr')]
df_may = df[(df.MONTH_STR == 'May')]
df_june = df[(df.MONTH_STR == 'Jun')] 
df_july = df[(df.MONTH_STR == 'Jul')]
df_aug = df[(df.MONTH_STR == 'Aug')] 
df_sep = df[(df.MONTH_STR == 'Sep')] 
df_oct = df[(df.MONTH_STR == 'Oct')] 
df_nov = df[(df.MONTH_STR == 'Nov')] 
df_dec = df[(df.MONTH_STR == 'Dec')]

In [13]:
df_march = pd.concat([df_jan,df_fev,df_march,df_april])
df_june = pd.concat([df_may,df_june,df_july, df_aug])
df_sep = pd.concat([df_sep,df_oct,df_nov, df_dec])

In [14]:
df.MONTH_STR.value_counts()

Apr    3604
Dec    3149
Oct    3052
Mar    2964
Jan    2741
Nov    2415
Sep    2115
Jul    2031
Feb    1879
Jun    1555
Aug    1479
May    1182
Name: MONTH_STR, dtype: int64

In [15]:
#transforming the series into lists for text processing
df_march = df_march['TWEET_CLEANED'].tolist()
df_july = df_july['TWEET_CLEANED'].tolist()
df_nov = df_nov['TWEET_CLEANED'].tolist()

## Preparing the text

#### January - April Tweets

In [16]:
#creating bigrams and trigrams
bigram = gensim.models.Phrases(df_march, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.Phrases(bigram[df_march], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [17]:
#remove Stop Words
data_words_nostops = remove_stopwords(df_march)
#creating bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_m = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB'])

In [53]:
print(data_lemmatized_m[:1][0][:30])

['upholding', 'rule', 'crime', 'laughable', 'brexitshamble']


In [50]:
# Create Dictionary
id2word_m = corpora.Dictionary(data_lemmatized_m)
id2word_m.filter_extremes(no_below=10, no_above=0.2) #excluding tokens that ocurred in less than 10 tweets and bigrams that occurred in more than 50% of the tweets
# Rebuild corpus based on the dictionary
texts_m = data_lemmatized_m
# Term Document Frequency
corpus_m = [id2word_m.doc2bow(text) for text in texts_m]

#### May - August Tweets

In [20]:
#creating bigrams and trigrams
bigram = gensim.models.Phrases(df_july, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.Phrases(bigram[df_july], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [21]:
#remove Stop Words
data_words_nostops = remove_stopwords(df_july)
#creating bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_j = lemmatization(data_words_bigrams, allowed_postags=['NOUN','ADJ', 'VERB', 'ADV'])

In [52]:
print(data_lemmatized_j[:1][0][:30])

['offer']


In [51]:
# Create Dictionary
id2word_j = corpora.Dictionary(data_lemmatized_j)
id2word_j.filter_extremes(no_below=10, no_above=0.2) #excluding tokens that ocurred in less than 10 tweets and bigrams that occurred in more than 50% of the tweets
# Rebuild corpus based on the dictionary
texts_j = data_lemmatized_j
# Term Document Frequency
corpus_j = [id2word_j.doc2bow(text) for text in texts_j]

#### September - December Tweets

In [24]:
#creating bigrams and trigrams
bigram = gensim.models.Phrases(df_nov, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.Phrases(bigram[df_nov], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [25]:
#remove Stop Words
data_words_nostops = remove_stopwords(df_nov)
#creating bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
#Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized_n = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'VERB', 'ADJ'])

In [26]:
print(data_lemmatized_n[:1][0][:30])

['check', 'remainer', 'nee', 'hold', 'nose', 'good', 'judgement', 'stop', 'get', 'majority', 'ensure', 'deal', 'brexit', 'good', 'site', 'compare', 'tactical', 'voting', 'sit']


In [45]:
# Create Dictionary
id2word_n = corpora.Dictionary(data_lemmatized_n)
id2word_n.filter_extremes(no_below=10, no_above=0.2) #excluding tokens that ocurred in less than 10 tweets and bigrams that occurred in more than 50% of the tweets
# Rebuild corpus based on the dictionary
texts_n = data_lemmatized_n
# Term Document Frequency
corpus_n = [id2word_n.doc2bow(text) for text in texts_n]

### Using TF-IDF - not using

A problem with this approach is that highly frequent words start to dominate in the document, but may not be representative to the model as less-frequent words. One way to fix this is to measure how unique (or how infrequent) a word is across all documents (or tweets), which is called the “inverse document frequency” or IDF. By introducing IDF, frequent words that are also frequent across all documents get penalized with less weight.

https://towardsdatascience.com/topic-modeling-of-2019-hr-tech-conference-twitter-d16cf75895b6

In [None]:
#tfidf = models.TfidfModel(corpus_m)
#tfidf_corpus_m = tfidf[corpus_m]

#tfidf = models.TfidfModel(corpus_j)
#tfidf_corpus_j = tfidf[corpus_j]

#tfidf = models.TfidfModel(corpus_n)
#tfidf_corpus_n = tfidf[corpus_n]

## Topic Modeling Via LDA

#### Jan - April

In [54]:
lda_model_m = gensim.models.ldamodel.LdaModel(corpus=corpus_m,
                                           id2word=id2word_m,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

In [55]:
for idx, topic in lda_model_m.show_topics(formatted=False, num_words= 20):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: talk|good|british|agree|look|right|last|work|sign|open|keep|word|promise|destroy|agreement|control|well|feel|hold|part
Topic: 1 
Words: vote|conservative|tory|party|remain|could|election|polling|government|face|claim|come|today|delay|call|thank|voter|parliament|fail|break
Topic: 2 
Words: brexit|say|leave|labour|deliver|country|stop|trade|change|thing|force|stay|bad|show|month|political|letter|news|poor|become
Topic: 3 
Words: deal|people|make|time|support|leader|need|must|lie|happen|lose|referendum|plan|turn|democracy|resign|jeremycorbyn|public|new|woman


#### May - August

In [31]:
lda_model_j = gensim.models.ldamodel.LdaModel(corpus=corpus_j,
                                           id2word=id2word_j,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

In [32]:
for idx, topic in lda_model_j.show_topics(formatted=False, num_words= 10):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: stop|become|leader|political|much|today|point|warn|great|push
Topic: 1 
Words: leave|borisjohnson|remain|happen|theresa|well|plan|must|support|really
Topic: 2 
Words: deal|brexit|vote|tory|say|party|make|people|even|country
Topic: 3 
Words: deliver|election|conservative|call|good|fail|right|promise|hard|thing


#### September - December

In [47]:
lda_model_n = gensim.models.ldamodel.LdaModel(corpus=corpus_n,
                                           id2word=id2word_n,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

In [48]:
for idx, topic in lda_model_n.show_topics(formatted=False, num_words= 20):
    print('Topic: {} \nWords: {}'.format(idx, '|'.join([w[0] for w in topic])))

Topic: 0 
Words: agree|plan|policy|option|split|become|risk|seem|great|face|back|today|enough|turn|table|fall|clear|block|live|feel|accept|part|idea|ditch|fool|different|truth|big|side|explain
Topic: 1 
Words: need|seat|majority|lose|work|government|candidate|win|real|debate|pact|choose|power|let|move|problem|fight|form|remember|leaver|blame|break|clean|generalelection|company|serious|beat|fake|hand|possible
Topic: 2 
Words: remain|keep|forget|chance|country|remainer|look|must|full|brexiteer|well|position|force|public|fail|thank|wrong|report|parti|increase|dem|bori|surrender|bring|politic|russian|put|corrupt|police|one
Topic: 3 
Words: deal|vote|tory|brexit|party|leave|people|election|stop|say|farage|labour|conservative|make|deliver|happen|good|could|lie|time|campaign|come|borisjohnson|stand|voter|trade|support|parliament|thing|read


### Checking Coherence Score

In [59]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model_n, texts=data_lemmatized_n, dictionary=id2word_n, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4009366399074937


https://www.kdnuggets.com/2018/04/robust-word2vec-models-gensim.html

### Visualisation of the Topics

In [49]:
lda_data =  pyLDAvis.gensim.prepare(lda_model_n, corpus_n, id2word_n, mds='mmds')
pyLDAvis.display(lda_data)

In [None]:
#with nlkt sentiment analysys
sid = SentimentIntensityAnalyzer()

df_march['SENTIMENT_CP'] = cdf['TWEET_CLEANED'].apply(lambda x: sid.polarity_scores(x)['compound'])
df_march['SENTIMENT_NEUT'] = cdf['TWEET_CLEANED'].apply(lambda x: sid.polarity_scores(x)['neu'])
cdf['SENTIMENT_NEG'] = cdf['TWEET_CLEANED'].apply(lambda x: sid.polarity_scores(x)['neg'])
cdf['SENTIMENT_POS'] = cdf['TWEET_CLEANED'].apply(lambda x:sid.polarity_scores(x)['pos'])

cdf.loc[cdf.SENTIMENT_CP > 0,'SENTIMENT'] = 'positive'
cdf.loc[cdf.SENTIMENT_CP == 0,'SENTIMENT'] = 'neutral'
cdf.loc[cdf.SENTIMENT_CP < 0,'SENTIMENT'] = 'negative'