<a href="https://colab.research.google.com/github/yuvaravii/BBC-News-article-Topic-Identification/blob/main/LDA_Topic_Modelling_Theme_extraction_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Problem Description**

In this project, the task is to identify the major themes/topics across a collection of BBC news articles. Topic-modelling techniques such as Latent Dirichlet Allocation (LDA) or Latent Semantic Analysis (LSA) can be used.

In [None]:
# for dataframes
import pandas as pd
import numpy as np
import re

#for ignoring warnings
import warnings
warnings.filterwarnings("ignore")

import json
import glob
import os


#gensim
import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


from spacy import displacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel

import sklearn
import keras

#spacy
import spacy 
from nltk.corpus import stopwords

# for visualisation of data
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the cleaned / pre-processed article dataset on the mounted Drive.
processed_data_filepath='/content/drive/MyDrive/Colab Notebooks/Capstone Project/BBC article/2. Cleaned and Preprocessed data/3rd_cleaned_dataset_stg.csv'

# `new_df` keeps the frame exactly as read from disk; `df` is the working copy
# without the 'Unnamed: 0' column (presumably a leftover index column from an
# earlier save -- confirm against the preprocessing notebook).
new_df = pd.read_csv(processed_data_filepath)
df = new_df.copy().drop(columns=['Unnamed: 0'])

# Preview the first rows of the working frame.
df.head()

## Building the LDA model

gensim does not need a pre-built document-term matrix (DTM) for topic modelling: it creates its own bag-of-words representation of each document from a `Dictionary` (via `doc2bow`).

In [None]:
import ast

# Each cell of 'cleaned_doc_token' is parsed from its string form back into a
# Python object (assumed to be the repr of a list of tokens -- confirm against
# the preprocessing step). `ast.literal_eval` accepts only Python literals, so,
# unlike the built-in `eval`, a malformed or malicious CSV cell cannot execute
# arbitrary code.
corpus = df['cleaned_doc_token'].apply(ast.literal_eval).tolist()

# Map every unique token in the corpus to an integer id.
dict_ = corpora.Dictionary(corpus)
print(dict_)

In [None]:
# Print every token held in the gensim dictionary, one per line.
for token in dict_.values():
    print(token)

In [None]:
# Build the bag-of-words corpus with Dictionary.doc2bow, called with its
# defaults (allow_update=False, return_missing=False): each document becomes a
# list of (token_id, token_count) tuples.
doc_term_matrix = list(map(dict_.doc2bow, corpus))

# Display the resulting bag-of-words representation.
doc_term_matrix

In [None]:
########################### BLOCK -1 LDA IMPLEMENTATION #######################
# With the bag-of-words corpus available, fit a gensim LDA topic model.

# Alias for the gensim LDA model class.
lda = gensim.models.ldamodel.LdaModel

# Fit a 10-topic model: 10 passes over the corpus with a fixed seed.
# eval_every=None turns off gensim's periodic perplexity estimate during training.
lda_model = lda(
    corpus=doc_term_matrix,
    id2word=dict_,
    num_topics=10,
    passes=10,
    random_state=100,
    eval_every=None,
)

In [None]:
 # Show the weighted top words of each topic in the fitted gensim LDA model.
 # Words noted as most frequent (presumably candidates for extra stop-word
 # removal -- TODO confirm): said, one, would, could, Mr, Ms, also, last, first,
 # year, told, new, ask, two, like, many, take, years, people
 lda_model.print_topics()

In [None]:
# Interactive pyLDAvis view of the fitted gensim model.
import pyLDAvis
import pyLDAvis.gensim_models

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Prepare the visualisation data from the model, the bag-of-words corpus and
# the dictionary; the trailing expression displays it.
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    doc_term_matrix,
    dict_,
)
vis


In [None]:
# Alternative model: scikit-learn LDA fitted on TF-IDF features of 'cleaned_doc'.
import nltk
from nltk.corpus import stopwords  # stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

# Recent scikit-learn releases validate `stop_words` and accept only
# 'english', a list or None, so the set is handed over as a sorted list.
vect = TfidfVectorizer(stop_words=sorted(stop_words), max_features=1000)
vect_text = vect.fit_transform(df['cleaned_doc'])

# NOTE: this rebinds `lda_model`, which held the gensim model until now, to the
# scikit-learn estimator.
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=42,
    max_iter=10,
)
lda_top = lda_model.fit_transform(vect_text)

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older installations.
if hasattr(vect, 'get_feature_names_out'):
    vocab = vect.get_feature_names_out()
else:
    vocab = vect.get_feature_names()

# For every topic, print the 10 vocabulary terms with the largest component weight.
for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]

    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0], end=" ")
    print("\n")
     

## Set 2

In [None]:
########################### BLOCK -1 LDA IMPLEMENTATION #######################
# Second configuration: a 3-topic gensim LDA model on the same bag-of-words corpus.

# Alias for the gensim LDA model class.
lda = gensim.models.ldamodel.LdaModel

# 3 topics, 10 passes, 400 documents per training chunk, fixed seed.
lda_model = lda(
    corpus=doc_term_matrix,
    id2word=dict_,
    num_topics=3,
    passes=10,
    chunksize=400,
    random_state=100,
    eval_every=None,
)

import pyLDAvis
import pyLDAvis.gensim_models

# Render the interactive topic view for this model inline.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    doc_term_matrix,
    dict_,
)
vis

## Set 3

In [None]:
########################### BLOCK -1 LDA IMPLEMENTATION #######################
# Third configuration: a 5-topic gensim LDA model on the same bag-of-words corpus.

# Alias for the gensim LDA model class.
lda = gensim.models.ldamodel.LdaModel

# 5 topics, 10 passes, 400 documents per training chunk, fixed seed.
lda_model = lda(
    corpus=doc_term_matrix,
    id2word=dict_,
    num_topics=5,
    passes=10,
    chunksize=400,
    random_state=100,
    eval_every=None,
)

import pyLDAvis
import pyLDAvis.gensim_models

# Render the interactive topic view for this model inline.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    lda_model,
    doc_term_matrix,
    dict_,
)
vis

## Evaluation metric

In [None]:
# Score the current `lda_model` with the c_v coherence measure, using the
# tokenised documents and the dictionary built earlier.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=corpus,
    dictionary=dict_,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Hyper-parameter tuning: compare candidate topic counts by c_v coherence.

lda = gensim.models.ldamodel.LdaModel

# Candidate topic counts (3..14) and the coherence score obtained for each.
num_topics = list(range(3, 15))
dict_hyp = {}
for i in num_topics:
    # Fit with a fixed seed (the same value used by the other gensim fits in
    # this notebook) so that the scores are reproducible between runs and the
    # comparison across topic counts is not driven by random initialisation.
    lda_model = lda(
        doc_term_matrix,
        num_topics=i,
        id2word=dict_,
        passes=5,
        chunksize=40,
        random_state=100,
        eval_every=None,
    )

    coherence_model_lda = CoherenceModel(model=lda_model, texts=corpus, dictionary=dict_, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    dict_hyp[i] = coherence_lda
    print('\nCoherence Score when num_of_topic is : '+str(i)+'  -------->',coherence_lda)

# NOTE: after the loop `lda_model` refers to the last model fitted (14 topics).

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of the coherence score per candidate number of topics.
D = dict_hyp

# Reversed positions so the first entry of `dict_hyp` is drawn at the top.
positions = list(range(len(D)))[::-1]

fig, ax = plt.subplots(figsize=(12, 6))
# Materialise the dict views as lists: a view is not a sequence, and not every
# matplotlib release converts it before building the bars.
ax.barh(positions, list(D.values()), align='center')
ax.set_yticks(positions)
ax.set_yticklabels(list(D.keys()))
ax.set_ylabel('Number of topics')
ax.set_xlabel('Coherence Score')
ax.set_title('Coherence score by number of topics')

plt.show()

In [None]:
######################################## TUNING OF ALPHA *****************************************

# The two notes below were bare prose inside a code cell (a SyntaxError when
# run); they are kept as comments, with the listed values exposed as lists.
# In gensim's LdaModel these priors are passed as `alpha` (document-topic) and
# `eta` (topic-word, often written as beta).

# Choose αm from [0.05,0.1,0.5,1,5,10]
alpha_candidates = [0.05, 0.1, 0.5, 1, 5, 10]
# Choose βm from [0.05,0.1,0.5,1,5,10]
beta_candidates = [0.05, 0.1, 0.5, 1, 5, 10]