#### This corpus includes 2,225 documents from BBC's news website corresponding to stories in five topical areas (business, entertainment, politics, sport, tech) from 2004-2005. The CSV file includes two columns: category (the five class labels) and text (pre-processed article content). In this exerceise I will use only the text column in the document.

Importing necessary libraries

In [1]:
import re
import os
import pandas as pd
import numpy as np
import collections
from collections import Counter

#Genism
import gensim
from gensim import corpora
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import LsiModel, CoherenceModel
from gensim.models import LdaModel, LdaMulticore

#NLTK
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

#TextBlob
from textblob import TextBlob

stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

### Task:-
###### -----------------------------------------------------------------------------------------------------------

#### Extract the top five topics for each article that can be used for keyword search and retrieval.Using LSI/LSA and LDA algorithms, after vectorizing the text using TF-IDF vector in three different ways:

#### (1) after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.),
##### (2) with term frequency filter, to exclude the top 10% of the most frequent words and words that appear less than 5 times in the documents (drawing from Zipf's Law), and
###### (3) with a part of speech filter, to limit your TD-IDF matrix to nouns only. 

In [2]:
###Loading Data
os.chdir('D:/USF/Text Analytics/Assignment3-Topic Modelling of BBC News Articles')
articles_df=pd.read_csv("BBC-articles.csv")

In [3]:
articles_df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


###### Data Cleaning

In [4]:
#This function returns a list of words after text cleaning and list of words with only nouns.
def text_corpus_cleaning(input_text):
    input_text = input_text.lower()    #Converting all the words in article into lower case
    for char in '!#$%&@?,.:;+-*/=<>"\'()[\\]X{|}~\n\t': #Removing all the special characters from the text inside article
        input_text = input_text.replace(char, ' ')
    word_list=re.findall(r"([a-zA-z]+)\s",input_text) #Make a list of words present in the article.
    #Remove all the stop words and include only the words with minimum length of 2.
    word_list=[w for w in word_list if w not in stop_words and len(w) > 2]
    #Lemmatization of words
    word_list = [lemmatizer.lemmatize(w) for w in word_list]
    modified_text=' '.join([w for w in word_list])
    blob_object = TextBlob(modified_text)
    #Limiting the word list with nouns
    word_list_nouns = [word for word,pos in blob_object.tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
    return word_list,word_list_nouns

In [36]:
tokens=[]
tokens_nouns=[]
for i in range(len(articles_df['text'])):
    tokens.append(text_corpus_cleaning(articles_df['text'][i])[0])
    tokens_nouns.append(text_corpus_cleaning(articles_df['text'][i])[1])
#Creating dictionary based after cleaning the data according to method1.
myDict = corpora.Dictionary(tokens)
dtm = [myDict.doc2bow(doc) for doc in tokens]
tfidf_vectorizer = TfidfModel(dtm) 
tfidf = tfidf_vectorizer[dtm] 
#LSI model
lsi_model = LsiModel(corpus=tfidf, id2word=myDict, num_topics=5)
#Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=tfidf,
                                       id2word=myDict,
                                       num_topics=5)

#### To reduce dimensionality, we can filter out tokens that occur in less than  5 documents (absolute number) or more than 0.90 (fraction of total corpus size), 

In [37]:
#Creating dictionary based after cleaning the data according to method2.
myDict_2 = Dictionary(tokens)
myDict_2.filter_extremes(no_below=5, no_above=0.90)
dtm_2 = [myDict_2.doc2bow(doc) for doc in tokens]
tfidf_vectorizer = TfidfModel(dtm_2) 
tfidf = tfidf_vectorizer[dtm_2] 
#LSI model
lsi_model_2 = LsiModel(corpus=tfidf, id2word=myDict_2, num_topics=5)
#Build LDA model
lda_model_2 = gensim.models.LdaMulticore(corpus=tfidf,
                                       id2word=myDict_2,
                                       num_topics=5)

In [38]:
#Creating dictionary based after cleaning the data according to method3.
myDict_nouns=corpora.Dictionary(tokens_nouns)
dtm_nouns=[myDict_nouns.doc2bow(doc) for doc in tokens_nouns]
tfidf_vectorizer = TfidfModel(dtm_nouns) 
tfidf = tfidf_vectorizer[dtm_nouns] 
#LSI Model
lsi_model_nouns = LsiModel(corpus=tfidf, id2word=myDict_nouns, num_topics=5)
lsi_model_nouns.print_topics(num_topics=5, num_words=5)
#LDA Model
lda_model_nouns = gensim.models.LdaMulticore(corpus=tfidf,
                                       id2word=myDict_nouns,
                                       num_topics=5)

### Function to return top keywords for each document based on the model trained

In [39]:
def getTopicWords(model,corpus,n=10):
    topic=sorted(model[corpus],key=lambda tup: -1+tup[1])[0]
    top10=model.show_topic(topic[0],n)
    words,_=zip(*top10)
    return ','.join(words)

In [43]:
for i in range(len(articles_df['text'])):
    articles_df.at[i,'Method1_LSI']=getTopicWords(lsi_model,dtm[i],n=5)
    articles_df.at[i,'Method2_LSI']=getTopicWords(lsi_model_2,dtm_2[i],n=5)
    articles_df.at[i,'Method3_LSI']=getTopicWords(lsi_model_nouns,dtm_nouns[i],n=5)
    articles_df.at[i,'Method1_LDA']=getTopicWords(lda_model,dtm[i],n=5)
    articles_df.at[i,'Method2_LDA']=getTopicWords(lda_model_2,dtm_2[i],n=5)
    articles_df.at[i,'Method3_LDA']=getTopicWords(lda_model_nouns,dtm_nouns[i],n=5)
    list7=articles_df['Method1_LSI'][i].split(",")+articles_df['Method2_LSI'][1].split(",")+articles_df['Method3_LSI'][i].split(",")+articles_df['Method1_LDA'][i].split(",")+articles_df['Method2_LDA'][i].split(",")+articles_df['Method3_LDA'][i].split(",")    
    list7= [word for word, word_count in Counter(list7).most_common(5)]
    Top5_words=','.join([w for w in list7])
    articles_df.at[i,'Top5_words']=Top5_words

#### References:

https://www.machinelearningplus.com/nlp/gensim-tutorial/

https://usflearn.instructure.com/courses/1389096/files/85339820/download

https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [46]:
articles_df.head()

Unnamed: 0,category,text,Method1_LSI,Method2_LSI,Method3_LSI,Method1_LDA,Method2_LDA,Method3_LDA,Top5_words
0,tech,tv future in the hands of viewers with home th...,"labour,election,blair,tax,game","labour,election,game,film,blair","phone,film,economy,growth,technology","film,game,mobile,music,player","film,rate,mobile,music,game","phone,music,film,company,sale","film,game,music,labour,election"
1,business,worldcom boss left books alone former worldc...,"labour,election,blair,tax,game","labour,election,game,film,blair","phone,film,economy,growth,technology","labour,film,brown,england,party","game,mobile,technology,phone,company","film,game,election,phone,party","game,film,labour,election,phone"
2,sport,tigers wary of farrell gamble leicester say ...,"labour,election,blair,tax,game","labour,election,game,film,blair","film,game,england,oscar,award","film,game,mobile,music,player","film,labour,election,party,tax","game,film,music,election,player","game,film,election,labour,blair"
3,sport,yeading face newcastle in fa cup premiership s...,"mobile,phone,film,award,best","film,award,oscar,england,best","film,game,england,oscar,award","film,award,band,best,company","film,labour,election,party,tax","film,game,injury,player,england","film,award,game,best,labour"
4,entertainment,ocean s twelve raids box office ocean s twelve...,"labour,election,blair,tax,game","labour,election,game,film,blair","film,growth,economy,rate,bank","labour,film,brown,england,party","film,rate,mobile,music,game","game,film,music,election,player","film,game,labour,election,blair"


### Evaluating models using Coherence Score