In [1]:
import pandas as pd
import numpy as np

# Load the 1000 random food reviews CSV file (unlabelled and raw).
# NOTE(review): absolute local path -- only resolves on the original machine;
# consider a configurable data directory.
reviews_datasets = pd.read_csv(r'E:\xeeva task\dataset amazon foodzip\1000_food_reviews.csv')

# dropna() returns a NEW frame, so the result has to be assigned back for the
# rows with missing review text to actually be removed. Without this, the NaN
# entries survive and are later vectorized as the literal string "nan".
reviews_datasets = reviews_datasets.dropna(subset=['Text']).reset_index(drop=True)
reviews_datasets

Unnamed: 0,Text
0,Bergamot is reminiscent of orange or lemon ext...
1,These are the best chocolate covered coffee be...
2,I love these cinnamon candies. They are not t...
3,Seems Brach's is doing away with this mint whi...
4,I love the Brach's Star Brites Cinnamon Mints....
...,...
995,WOW! Not much to say. Its cereal just as if it...
996,"Well, it's fruity pebbles- obviously it's awes..."
997,Arrived properly packed and a great charm for ...
998,I saw this as a great deal on cereal that beat...


In [2]:
# Feature engineering: bag-of-words counts via CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Cast the review column to unicode strings before vectorizing
review_texts = reviews_datasets['Text'].values.astype('U')

# min_df=2 ignores words used in fewer than 2 documents;
# English stop words are filtered out as well
count_vect = CountVectorizer(stop_words='english', min_df=2)
doc_term_matrix = count_vect.fit_transform(review_texts)

In [3]:
# Unsupervised topic modelling with Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation

# Five topics, fixed seed for repeatable results.
# fit() hands back the estimator itself, so it can be chained and then displayed.
LDA = LatentDirichletAllocation(random_state=42, n_components=5).fit(doc_term_matrix)
LDA

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [4]:
import random

# Peek at 10 randomly chosen words from the fitted vocabulary.
# The vocabulary list is built once instead of twice per iteration.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); kept to match the environment this notebook ran on -- confirm.
feature_names = count_vect.get_feature_names()

for _ in range(10):
    # random.choice always picks a valid element. random.randint's upper bound is
    # inclusive, so randint(0, len(names)) could index one past the end (IndexError).
    print(random.choice(feature_names))

vinegar
does
arrived
popper
lab
dark
castor
edamame
list
occasional


In [5]:
# Row 0 of the fitted model's components_ matrix: the per-word weights of topic #0
first_topic = LDA.components_[0]

In [6]:
# Display the weight vector of the first topic
first_topic

array([0.20496789, 0.20020131, 4.19164002, ..., 0.20000441, 0.20014217,
       0.20000351])

In [7]:
# Vocabulary indices of the 10 highest-weighted words of the first topic
# (argsort is ascending, so the largest weights are the final 10 entries)
top_topic_words = np.argsort(first_topic)[-10:]
top_topic_words

array([ 427, 1296, 1563,  991,  125, 1014,  335,  346,  855, 1751],
      dtype=int64)

In [8]:
# Map the top word indices of the first topic back to vocabulary terms.
# The vocabulary is loop-invariant, so fetch it once instead of rebuilding
# the whole list on every iteration.
feature_names = count_vect.get_feature_names()
for word_index in top_topic_words:
    print(feature_names[word_index])

chips
like
order
good
amazon
great
cake
candy
fiber
product


In [9]:
# Print the 10 highest-weighted words of every topic.
# The vocabulary is loop-invariant: build it once rather than once per word.
feature_names = count_vect.get_feature_names()
for topic_id, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{topic_id}:')
    # argsort is ascending, so the last 10 indices are the heaviest words.
    # A distinct name (word_index) avoids shadowing the topic counter.
    print([feature_names[word_index] for word_index in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['chips', 'like', 'order', 'good', 'amazon', 'great', 'cake', 'candy', 'fiber', 'product']


Top 10 words for topic #1:
['product', 'love', 'amazon', 'good', 'like', 'organic', 'dogs', 'newman', 'dog', 'food']


Top 10 words for topic #2:
['chips', 'love', 'just', 'taste', 'good', 'great', 'almonds', 'tea', 'like', 'flavor']


Top 10 words for topic #3:
['best', 'taste', 'price', 'love', 'buy', 'product', 'like', 'great', 'good', 'just']


Top 10 words for topic #4:
['use', 'just', 'popcorn', 'blue', 'diamond', 'chocolate', 'good', 'cup', 'magnesium', 'coffee']




In [10]:
# Project every document onto the fitted topics: one row per review,
# one column per topic
topic_values = LDA.transform(doc_term_matrix)
topic_values

array([[0.01439348, 0.01433796, 0.94164478, 0.01512282, 0.01450097],
       [0.01687393, 0.0169031 , 0.01688863, 0.65120354, 0.2981308 ],
       [0.01395714, 0.01352156, 0.01348377, 0.94537281, 0.01366472],
       ...,
       [0.43748839, 0.01904546, 0.50627631, 0.0186894 , 0.01850044],
       [0.01130604, 0.01118156, 0.14234767, 0.82399381, 0.01117092],
       [0.00923224, 0.16138631, 0.64616463, 0.17406524, 0.00915158]])

In [11]:
# Sanity check: (number of documents, number of topics)
topic_values.shape

(1000, 5)

In [12]:
# Label each review with its dominant topic: the column index holding the
# largest value in that review's row of topic_values
dominant_topic = np.argmax(topic_values, axis=1)
reviews_datasets['Topic'] = dominant_topic

In [13]:
# First 50 reviews, each now labelled with its topic category (0 to 4)
reviews_datasets.iloc[:50]

Unnamed: 0,Text,Topic
0,Bergamot is reminiscent of orange or lemon ext...,2
1,These are the best chocolate covered coffee be...,3
2,I love these cinnamon candies. They are not t...,3
3,Seems Brach's is doing away with this mint whi...,3
4,I love the Brach's Star Brites Cinnamon Mints....,3
5,"Have always kept these available at work, a lo...",0
6,I quit smoking 6 1/2 years ago and these Brach...,0
7,"These are wonderful ""desk"" candies...everyone ...",3
8,We had been looking for a long time for a dece...,2
9,This is so flavorful and decaf! This is the be...,2
