In [None]:
# Install into the environment of the running kernel (%pip targets the kernel itself).
%pip install pyLDAvis
# spaCy v3 no longer accepts the 'en' shortcut; fetch the exact package that
# spacy.load("en_core_web_sm") expects later in this notebook.
!python3 -m spacy download en_core_web_sm

import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Stop-word list: NLTK's English stop words plus a few extra tokens to discard.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words += ['from', 'subject', 're', 'edu', 'use']

import spacy

In [None]:
# training 19 LDA models

# Import Dataset
df = pd.read_csv("./osr_tweets_without_S_T_U_U_v2.csv", engine='python')

# df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/osr_tweets_without_S_T_U_U_v2.csv", engine='python')

# Convert to list
data = df.text.values.tolist()

# The patterns below are raw strings so that \S and \s are passed to the regex
# engine verbatim instead of being treated as (invalid) string escape sequences.

# Remove Emails (drops every whitespace-delimited token that contains '@')
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters (each run of whitespace becomes a single space)
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (per the gensim
    documentation this strips accent marks from the tokens; punctuation is
    discarded by the tokenizer itself). One token list is yielded per item.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )
    yield from tokenized

# Materialise the token lists; they are iterated more than once below.
data_words = [*sent_to_words(data)]

# Bigram detector (a higher threshold yields fewer phrases) and its
# faster frozen form used for transforming documents.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Trigram detector, trained on the bigram-transformed token lists,
# and its faster frozen form.
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Remove stop words from every document in ``texts``.

    Args:
        texts: iterable of documents. Each document is converted with
            ``str()`` and re-tokenized by ``simple_preprocess`` before
            filtering, so both token lists and plain strings are accepted.

    Returns:
        A list with one token list per document, containing only the tokens
        that are not present in the module-level ``stop_words``.
    """
    # Build the lookup set once per call: testing membership against the
    # stop_words list would be a linear scan for every single token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Run every document in ``texts`` through ``bigram_mod``; returns a list of the results."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize documents, keeping only tokens with an allowed part of speech.

    Every token list in ``texts`` is joined with single spaces and processed
    by the module-level ``nlp`` pipeline; the ``lemma_`` of each resulting
    token whose ``pos_`` is in ``allowed_postags`` is kept. One list of lemmas
    is returned per input document.

    Tag reference: https://spacy.io/api/annotation
    """
    def _kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(sent) for sent in texts]

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Load the spaCy English pipeline. lemmatization() only reads token.pos_ and
# token.lemma_, so the dependency parser and NER are switched off (for efficiency).
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: one bag-of-words per lemmatized document
corpus = [id2word.doc2bow(text) for text in texts]

def get_topic_id(topic_distribution):
    """Return the id of the most probable topic.

    ``topic_distribution`` is an iterable of ``(topic_id, probability)``
    pairs. The first pair with the strictly highest probability wins; if the
    iterable is empty or no probability is greater than 0, topic 0 is returned.
    """
    best = (0, 0)
    for pair in topic_distribution:
        candidate, weight = pair
        if weight > best[1]:
            best = (candidate, weight)
    return best[0]

def get_num_unique_topics(merged_topic):
    """Print how many distinct, non-empty-string topics occur in ``merged_topic``."""
    seen = []
    for label in merged_topic:
        if label in seen or label == '':
            continue
        seen.append(label)
    print(len(seen))

def _dominant_label_per_cluster(pred_list, true_topics, num_topics):
    """Return the most frequent true topic for every cluster id in ``range(num_topics)``.

    ``pred_list`` and ``true_topics`` are walked in parallel (position by
    position). A cluster that received no document is labelled ``""``; when
    several topics tie, the one encountered first wins.
    """
    counts_per_cluster = [{} for _ in range(num_topics)]
    for cluster_id, true_topic in zip(pred_list, true_topics):
        counts = counts_per_cluster[cluster_id]
        counts[true_topic] = counts.get(true_topic, 0) + 1
    return [max(counts, key=counts.get) if counts else "" for counts in counts_per_cluster]


num_topics_list = range(10,101,5)
# num_topics_list = [10]
# One entry per trained model: the list of labels assigned to its clusters
topic_cluster_list = []
for num_topics in num_topics_list:
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

    # predict topic for each doc
    # NOTE(review): inference tokenizes the raw tweet with str.split(), while the
    # model was trained on the cleaned/lemmatized `corpus`. Left unchanged because
    # the test-set evaluation cell infers the same way - confirm this is intended.
    pred_list = [
        get_topic_id(lda_model.get_document_topics(id2word.doc2bow(doc.split())))
        for doc in df.text
    ]

    # Label every cluster with the topic that appears most often among the
    # training tweets predicted into it; the list index is the cluster id.
    # (The previous loop never updated its running maximum, so it ended up
    # keeping the last topic seen instead of the most frequent one.)
    topic_for_each_cluster = _dominant_label_per_cluster(pred_list, df.topic, num_topics)

    print(topic_for_each_cluster)
    get_num_unique_topics(topic_for_each_cluster)
    topic_cluster_list.append(topic_for_each_cluster)
    filename = '{num_topics}_lda_model'.format(num_topics = num_topics)
    lda_model.save(filename)


In [13]:
# Class id -> human-readable label
id2label = {
    0 : 'Children Education and Skills',
    1 : 'Health and Social Care',
    2 : 'Crime and Security',
    3 : 'Economy',
    4 : 'Housing Planning and Local Services',
    5 : 'Labour Market and Welfare',
    6 : 'Population and Society',
    7 : 'Transport Environment and Climate Change'
}
# Reverse lookup: label -> class id. Built with a comprehension so that no loop
# variable named `id` leaks out and overwrites the builtin id().
# NOTE: the name `map` shadows the builtin map(); it is kept because the
# evaluation cell looks labels up through it.
map = {label: class_id for class_id, label in id2label.items()}

In [None]:
# evaluate 19 LDA models on testset
import pandas as pd

df = pd.read_csv("./test_set.csv", engine='python')
# df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/data/test_set.csv", engine='python')

# The test documents are the same for every model, so extract them once.
docs = df.text.to_list()

# Class id recorded when a test document falls into a cluster that has no label.
# assumes df.label holds the non-negative ids of id2label, so -1 never matches - TODO confirm
UNASSIGNED_CLASS = -1

for num_topics, topic_cluster in zip(num_topics_list, topic_cluster_list):
    filename = '{num_topics}_lda_model'.format(num_topics = num_topics)
    model = gensim.models.LdaModel.load(filename, mmap='r')
    # predict topic for each doc
    pred_list = []
    for doc in docs:
        res = model.get_document_topics(id2word.doc2bow(doc.split()))
        topic = get_topic_id(res)
        pred_list.append(topic)

    # Translate cluster ids into class ids. Exactly one prediction is emitted per
    # document so the array stays aligned with df.label; skipping documents (as
    # before) made the element-wise comparison below meaningless.
    predicted_classes = []
    for cluster_num in pred_list:
        cluster_label = topic_cluster[cluster_num]
        if cluster_label == '':
            # no doc in training set is classified in cluster_num, hence no
            # corresponding topic in topic_cluster: count it as a wrong prediction
            predicted_classes.append(UNASSIGNED_CLASS)
        else:
            predicted_classes.append(map[cluster_label])
    accuracy = np.mean(np.array(predicted_classes) == df.label.values)
    print('Number of topic:', num_topics, 'Accuracy:',accuracy)

In [None]:
# calculate perplexity and coherence score for each model

# topic_cluster is not used here; zipping with topic_cluster_list just limits the
# loop to the models for which the training cell produced (and saved) a result.
for num_topics, topic_cluster in zip(num_topics_list, topic_cluster_list):
    filename = '{num_topics}_lda_model'.format(num_topics = num_topics)
    model = gensim.models.LdaModel.load(filename, mmap='r')
    print('Number of topic:', num_topics)

    # log_perplexity() returns the per-word likelihood bound (higher is better),
    # not the perplexity itself; gensim defines perplexity as 2 ** (-bound).
    per_word_bound = model.log_perplexity(corpus)
    print('Per-word likelihood bound: ', per_word_bound)
    print('Perplexity: ', np.exp2(-per_word_bound))  # a measure of how good the model is. lower the better.

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda,'\n')