In [2]:
#Importing the required libraries
import pandas as pd
import numpy as np

In [5]:
# Load the first 20,000 reviews and preview them.
# nrows stops parsing after 20,000 rows instead of reading the whole CSV and
# then throwing away everything past the first 20,000.
# NOTE(review): this path is machine-specific; point it at your local copy of Reviews.csv.
reviews_datasets = pd.read_csv(r'C:\Users\zakri\Downloads\Reviews.csv', nrows=20000)
# dropna() returns a new frame rather than modifying in place, so the result
# must be assigned back. Only the "Text" column is used for topic modelling,
# so only rows with a missing review text are removed.
reviews_datasets = reviews_datasets.dropna(subset=['Text'])
reviews_datasets.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [6]:
# LDA is applied to the "Text" column only (the review bodies); the other columns are ignored.
# The first step is to build a vocabulary of the words found across all reviews.
from sklearn.feature_extraction.text import CountVectorizer

# Skip English stop words, words found in more than 80% of the reviews,
# and words found in fewer than 2 reviews.
count_vect = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
# Cast the column to unicode strings before vectorising.
review_texts = reviews_datasets['Text'].values.astype('U')
doc_term_matrix = count_vect.fit_transform(review_texts)

In [7]:
# Display the document-term matrix: one row per document, one column per vocabulary word.
# In the recorded output below it is 20000 x 14546, i.e. each of the 20k documents is a
# 14546-dimensional vector, which means the vocabulary has 14546 words.
doc_term_matrix

<20000x14546 sparse matrix of type '<class 'numpy.int64'>'
	with 594703 stored elements in Compressed Sparse Row format>

In [8]:
# LDA creates the topics together with a distribution over the vocabulary words for each topic.
from sklearn.decomposition import LatentDirichletAllocation

# n_components is the number of categories (topics) the text is divided into;
# random_state fixes the seed used for fitting.
# fit() returns the fitted estimator itself, so it can be chained onto the constructor.
LDA = LatentDirichletAllocation(n_components=5, random_state=42).fit(doc_term_matrix)
LDA

LatentDirichletAllocation(n_components=5, random_state=42)

In [9]:
# Take the first topic's row of LDA.components_ (one value per vocabulary word);
# the next cell uses it to find the 10 words with the highest values for this topic.
first_topic = LDA.components_[0]

In [10]:
# argsort() returns the indexes ordered by ascending value, so the last 10 indexes
# belong to the 10 highest-valued words of the topic (the strongest word is printed last).
top_topic_words = first_topic.argsort()[-10:]
# Fetch the vocabulary once instead of rebuilding it on every loop iteration.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
for word_index in top_topic_words:
    print(feature_names[word_index])

water
great
just
drink
sugar
good
flavor
taste
like
tea


The words show that the first topic might be about tea.

In [11]:
# Print the 10 highest-valued words for every topic in LDA.components_.
# Fetch the vocabulary once instead of rebuilding it for every printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
for i, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    # word_index is kept distinct from the topic counter i to avoid shadowing it.
    print([feature_names[word_index] for word_index in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['water', 'great', 'just', 'drink', 'sugar', 'good', 'flavor', 'taste', 'like', 'tea']


Top 10 words for topic #1:
['br', 'chips', 'love', 'flavor', 'chocolate', 'just', 'great', 'taste', 'good', 'like']


Top 10 words for topic #2:
['just', 'drink', 'orange', 'sugar', 'soda', 'water', 'like', 'juice', 'product', 'br']


Top 10 words for topic #3:
['gluten', 'eat', 'free', 'product', 'like', 'dogs', 'treats', 'dog', 'br', 'food']


Top 10 words for topic #4:
['cups', 'price', 'great', 'like', 'amazon', 'good', 'br', 'product', 'cup', 'coffee']




In [12]:
# Goal: add a column to the original data frame that stores the topic of each text.
# First step: pass the document-term matrix to LDA.transform(), which returns,
# for every document, one probability value per topic.
topic_values = LDA.transform(doc_term_matrix)
# Show the dimensions of the result (documents x topics).
topic_values.shape

(20000, 5)

(20000, 5) means that each of the 20,000 documents has 5 values, one per topic, where each value is the probability that the document belongs to that topic.

In [13]:
# For every review, take the index of the topic with the largest value
# (argmax along the topic axis) and store it in a new "Topic" column.
reviews_datasets['Topic'] = np.argmax(topic_values, axis=1)

# Preview the data set with the new column
reviews_datasets.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Topic
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,3
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,1
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,1


CONCLUSION:

In this project we used the Reviews data set. The data set contains user reviews for different products in the food category.

We used LDA to group the user reviews into 5 categories. In the script above we used the CountVectorizer class to create a document-term matrix. It transforms a given text into a vector based on the frequency (count) of each word that occurs in the text.
We configured it to include only those words that appear in at most 80% of the documents and in at least 2 documents. We also removed all the stop words, as they do not really contribute to topic modeling.

The output that prints the top ten words of each topic shows that the second topic (topic #1) might contain reviews about chocolates, chips, etc. Similarly, the third topic (topic #2) might contain reviews about sodas or juices.
You can see that there are a few common words in all the categories. This is because a few words are used in almost all the topics, for instance "good", "great", "like", etc.

Finally, we added another column to the data set that tells us which topic each review most probably belongs to. Hence we have categorized the reviews based on text analysis using LDA.
