<a href="https://colab.research.google.com/github/yuvaravii/BBC-News-article-Topic-Identification/blob/main/Non_negative_Matrix_Theme_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Problem Description**

In this project your task is to identify major themes/topics across a collection of BBC news articles. You can use clustering algorithms such as Latent Dirichlet Allocation (LDA), Latent Semantic Analysis (LSA) etc.

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be
# read with ordinary file paths (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# for dataframes
import pandas as pd
import numpy as np
import re

#for ignoring warnings
import warnings
warnings.filterwarnings("ignore")

import json
import glob
import os

#skelearn libraries
from sklearn import decomposition


#gensim
import gensim
import gensim.corpora as corpora 
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel


from spacy import displacy
from gensim.corpora import Dictionary
from gensim.models import LdaModel

import sklearn
import keras

#spacy
import spacy 
from nltk.corpus import stopwords

# for visualisation of data
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the cleaned / pre-processed article dataset on the mounted Drive.
processed_data_filepath='/content/drive/MyDrive/Colab Notebooks/Capstone Project/BBC article/2. Cleaned and Preprocessed data/3rd_cleaned_dataset_stg.csv'

# Keep the frame exactly as read in `new_df`; work on an independent copy
# without the leftover 'Unnamed: 0' column.
new_df = pd.read_csv(processed_data_filepath)
df = new_df.copy().drop(columns=['Unnamed: 0'])
df.head()

In [None]:
# Distinct labels present in the 'topics' column.
df['topics'].unique()

In [None]:
# TF-IDF vectorisation of the cleaned documents

# libraries used for this step
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# documents fed to the vectorizer
corpus = df['cleaned_doc']

# Vectorizer over unigrams and bigrams (bigrams sometimes carry more meaning,
# so their frequency becomes significant). Terms appearing in more than 80% of
# the documents are dropped and the vocabulary is capped at 1000 features.
vectorizer = TfidfVectorizer(
    stop_words=None,
    max_df=0.8,
    max_features=1000,
    ngram_range=(1, 2),
)
vectors = vectorizer.fit_transform(corpus)

In [None]:
# Show the TF-IDF matrix returned by vectorizer.fit_transform(corpus).
print(vectors)

In [None]:
# List the terms (unigrams and bigrams) the vectorizer kept as features.
vectorizer.get_feature_names_out()

In [None]:
# The documents are now tokenized and weighted by TF-IDF. Next, factorize the
# TF-IDF matrix into a small number of topics.


#**************************************** NMF -- NON NEGATIVE MATRIX FACTORIZATION **************************************
clf = decomposition.NMF(n_components= 5, random_state=0)

W1 =clf.fit_transform(vectors)   # W: one row per document, one column per topic (used below as document-topic weights)
H1= clf.components_   # H: one row per topic, one column per vocabulary term (used below as topic-term weights)

In [None]:
# Inspect the matrix returned by clf.fit_transform (per-document topic weights).
W1

In [None]:
# Helper for listing the highest-weighted terms of each of the 5 NMF topics.

# number of top terms to keep per topic
num_word=15

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (already used earlier in this notebook).
vocab = np.asarray(vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_word` terms with the largest weights in `t`, best first.

    `t` is one row of the NMF components matrix (one weight per vocabulary
    term). `num_word` and `vocab` are read from the notebook namespace at
    call time, so changing them does not require redefining this function.
    """
    # argsort is ascending; the reversed slice takes the last `num_word`
    # indices in descending-weight order.
    return [vocab[i] for i in np.argsort(t)[:-num_word-1:-1]]

In [None]:
# One list of top terms per row of H1 (i.e. per topic) ...
topic_words = list(map(top_words, H1))
# ... and the same terms joined into a single space-separated string per topic.
topics = [' '.join(words) for words in topic_words]

In [None]:
topics

# Labels present in the dataset:
#'business', 'entertainment', 'politics', 'sport', 'tech'

# Topic index -> label, assigned manually from the top words shown above:
# topic 0 : business
# topic 1 : sport
# topic 2 : politics
# topic 3 : entertainment
# topic 4 : tech

In [None]:
# Document-topic table: one row per document ("Docs<i>"), one column per
# NMF component ("topics<i>"), weights rounded to two decimals.
col_names = [f"topics{i}" for i in range(clf.n_components)]
doc_names = [f"Docs{i}" for i in range(len(df['cleaned_doc']))]
df_doc_topic = pd.DataFrame(
    np.round(W1, 2),
    columns=col_names,
    index=doc_names,
)

# Dominant topic of a document = index of the column holding its largest weight.
significant_topic = df_doc_topic.values.argmax(axis=1)
df_doc_topic['dominant_topic'] = significant_topic

In [None]:
# Preview the first rows of the document-topic table.
df_doc_topic.head()

In [None]:
# Translate NMF component indices into the dataset's category labels.
# NOTE(review): this index -> label mapping was chosen by reading the top words
# of each topic; re-check it whenever the vectorizer or NMF settings change.
topic_labels = {0: 'business', 1: 'sport', 2: 'politics', 3: 'entertainment', 4: 'tech'}

# rename() returns a new frame, so the result has to be assigned back for the
# columns to actually be renamed.
df_doc_topic = df_doc_topic.rename(
    columns={f'topics{idx}': label for idx, label in topic_labels.items()}
)

# A single mapping replaces the chained `df[col].loc[mask] = value` assignments,
# which do not update the frame under pandas Copy-on-Write. Values that are not
# a known index (e.g. already-translated labels on a re-run) are left unchanged.
df_doc_topic['dominant_topic'] = df_doc_topic['dominant_topic'].map(
    lambda value: topic_labels.get(value, value)
)

In [None]:
# Copy the predicted labels onto `df` by position. df_doc_topic is indexed by
# "Docs<i>" labels rather than df's own index, so the values are passed as a
# plain array to avoid index alignment. This replaces a round-trip through a
# dict, which would silently collapse any duplicate index labels.
df['dominant_pred_topic'] = df_doc_topic['dominant_topic'].to_numpy()

In [None]:
# Check the column names of df after adding 'dominant_pred_topic'.
df.columns

In [None]:
# Rows where the predicted dominant topic differs from the labelled topic.
mismatch = df['topics'] != df['dominant_pred_topic']
incorrect_pred_df = df.loc[mismatch, ['cleaned_doc', 'topics', 'dominant_pred_topic']]
incorrect_pred_df

In [None]:
# Share of documents (in %) whose predicted dominant topic equals the label.
error_rate = len(incorrect_pred_df) / len(df)
accuracy_NNM = (1 - error_rate) * 100
accuracy_NNM

In [None]:
# Preview the first three rows of df, now including 'dominant_pred_topic'.
df.head(3)

# Experiment 2: try a different set of TF-IDF parameters

In [None]:
# Experiment 2: the same TF-IDF + NMF pipeline as above, with a stricter
# document-frequency cut-off and a larger vocabulary.

# libraries used for this step
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Input corpus for the vectorizer
corpus = df['cleaned_doc']

# ******************************************** tweaks: max_df 0.8 -> 0.3, max_features 1000 -> 2500 **********

# creation of the vectorizing model (unigrams and bigrams, as bigrams sometimes
# carry more meaning and their frequency becomes significant)
vectorizer = TfidfVectorizer(stop_words=None, max_df=0.3, max_features=2500, ngram_range=(1, 2))
vectors = vectorizer.fit_transform(corpus)

#**************************************** NMF -- NON NEGATIVE MATRIX FACTORIZATION **************************************
clf = decomposition.NMF(n_components=5, random_state=100)

W1 = clf.fit_transform(vectors)   # one row per document, one column per topic
H1 = clf.components_              # one row per topic, one column per vocabulary term

# Top `num_word` terms of each of the 5 topics
num_word = 100
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = np.asarray(vectorizer.get_feature_names_out())


def top_words(t):
    """Return the `num_word` highest-weighted terms of topic row `t`, best first."""
    return [vocab[i] for i in np.argsort(t)[:-num_word-1:-1]]


topic_words = [top_words(t) for t in H1]
topics = [' '.join(t) for t in topic_words]

# creation of the document-topic frame
col_names = ["topics" + str(i) for i in range(clf.n_components)]
doc_names = ["Docs" + str(i) for i in range(len(df['cleaned_doc']))]
df_doc_topic = pd.DataFrame(np.round(W1, 2), columns=col_names, index=doc_names)
significant_topic = np.argmax(df_doc_topic.values, axis=1)
df_doc_topic['dominant_topic'] = significant_topic

#**********************************************************************************************
# NOTE(review): this index -> label mapping is carried over from the first run.
# With different vectorizer settings and random_state the topic order may
# differ; confirm it against `topics` before trusting the accuracy below.
topic_labels = {0: 'business', 1: 'sport', 2: 'politics', 3: 'entertainment', 4: 'tech'}

# rename() returns a new frame; assign it back so the columns are really renamed.
df_doc_topic = df_doc_topic.rename(
    columns={f'topics{idx}': label for idx, label in topic_labels.items()}
)
# One mapping instead of chained `df[col].loc[mask] = value` assignments, which
# do not update the frame under pandas Copy-on-Write.
df_doc_topic['dominant_topic'] = df_doc_topic['dominant_topic'].map(
    lambda value: topic_labels.get(value, value)
)

#**********************************************************************************************
# Copy the predictions onto df by position (df_doc_topic has its own "Docs<i>" index).
df['dominant_pred_topic'] = df_doc_topic['dominant_topic'].to_numpy()

#**********************************************************************************************
incorrect_pred_df = df[df['topics'] != df['dominant_pred_topic']][['cleaned_doc', 'topics', 'dominant_pred_topic']]

#**********************************************************************************************
# percentage of documents whose predicted topic equals the labelled topic
accuracy_NNM = (1 - len(incorrect_pred_df) / len(df)) * 100
accuracy_NNM


In [None]:
# List the terms kept by the current vectorizer. get_feature_names() was
# removed in scikit-learn 1.2; get_feature_names_out() is the replacement.
vectorizer.get_feature_names_out()

In [None]:
# Experiment 3: same TF-IDF settings as experiment 2, but NMF with 10 topics;
# prints the 10 highest-weighted terms of every topic.

# libraries used for this step
import nltk
from nltk.corpus import stopwords  #stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Input corpus for the vectorizer
corpus = df['cleaned_doc']

# ******************************************** tweaks: max_df 0.8 -> 0.3, max_features 1000 -> 2500 **********

# creation of the vectorizing model (unigrams and bigrams, as bigrams sometimes
# carry more meaning and their frequency becomes significant)
vectorizer = TfidfVectorizer(stop_words=None, max_df=0.3, max_features=2500, ngram_range=(1, 2))
vectors = vectorizer.fit_transform(corpus)

#**************************************** NMF -- NON NEGATIVE MATRIX FACTORIZATION **************************************
clf = decomposition.NMF(n_components=10, random_state=100)

W1 = clf.fit_transform(vectors)   # one row per document, one column per topic
H1 = clf.components_              # one row per topic, one column per vocabulary term


# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
vocab = vectorizer.get_feature_names_out()
for i, comp in enumerate(clf.components_):
    # pair every term with its weight in this topic and keep the 10 heaviest
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]

    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0], end=" ")
    print("\n")

# **Summary**


1. We started by loading the dataset with pandas. It contains documents belonging to 5 different topics.

2. We preprocessed the data using the nltk and sklearn libraries, removing punctuation, stop words, numerical values and special characters.

3. We performed word-normalization techniques such as lemmatization and stemming to reduce words to their root forms.

4. We created a bag of words using CountVectorizer and calculated the TF-IDF weights for the given corpus.

5. We trained models such as LDA, LSA and NMF for topic modelling and created clusters.

6. For NMF, we compared each document's dominant predicted topic with its labelled topic to check how well the extracted topics match the labels.

7. We tuned the hyperparameters of the LDA model using coherence and perplexity scores. The resulting words are shown in word clouds for better visualisation.

# **Conclusion**

1. The preprocessing step reduced the size of our corpus by almost 43% on average, which is a big help for model computation.

2. In terms of run time, NMF was considerably faster than LDA and LSA.

3. During visualization we observed that installing pyLDAvis uninstalls the existing pandas version, so version control becomes an issue when using this library.

4. The most important n-grams were identified by TF-IDF. We applied a statistical model (LDA) and a decomposition model (LSA).

5. The optimal number of topics was selected using the coherence and perplexity scores.