In [1]:
import pandas as pd
import spacy        
# The visible cells only use the pipeline's vocabulary and default stop-word
# set, so the dependency parser and the NER component are switched off.
# `disable=[...]` is the supported way to do that; the previous
# `parser=False, entity=False` keywords are spaCy 1.x flags that spaCy 2.x does
# not honour and spaCy 3.x does not accept.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Raw review table; the following cells expect a text column named 'Review'.
npr = pd.read_excel('Review.xlsx')

In [2]:
# Keep the untouched text in 'rev' so the original review can be exported later.
npr['rev'] = npr['Review']
# Remove every character that is neither a word character nor whitespace, then
# lower-case. The pattern is a raw string (so `\w` / `\s` are not invalid string
# escapes) and regex=True is explicit: pandas >= 2.0 treats the pattern as a
# literal by default, which would turn this step into a silent no-op.
npr['Review'] = npr['Review'].str.replace(r"[^\w\s]", "", regex=True).str.lower()
# Drop rows with no usable text (missing or non-string cells become NaN above).
# .copy() makes the result an independent frame so later column assignments do
# not write into a slice of the original.
npr = npr[npr['Review'].notnull()].copy()

In [3]:
# Project-specific stop words, read from the 'stop_words' column of the CSV.
stopwords = pd.read_csv('stop words.csv')
customize_stop_words = stopwords.loc[:, 'stop_words'].tolist()

In [4]:
# Flag each custom word as a stop word on its vocabulary entry ...
for word in customize_stop_words:
    nlp.vocab[word].is_stop = True
# ... and add all of them to the language's default stop-word set.
nlp.Defaults.stop_words.update(customize_stop_words)

In [5]:
# Combined stop-word collection: spaCy defaults plus the custom words added above.
new_words = nlp.Defaults.stop_words
n = list(new_words)  # kept as a list for any later cell that still reads `n`
# Membership is tested once per token of every review, so look words up in a
# set (constant time) instead of scanning the list each time. Output is identical.
stop_lookup = frozenset(n)
npr['Review'] = npr['Review'].apply(
    lambda text: ' '.join(token for token in text.split() if token not in stop_lookup)
)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts: ignore terms that occur in more than 99% of the reviews
# and scikit-learn's built-in English stop words; min_df=1 keeps every other term.
cv = CountVectorizer(max_df=0.99, min_df=1, stop_words='english')
# Feed the reviews as Python strings. The previous `.values.astype('U')` built a
# fixed-width NumPy unicode array in which every row is padded to the length of
# the longest review, which is needlessly memory-hungry on a large corpus.
dtm = cv.fit_transform(npr['Review'].astype(str))

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
# Fit a 7-topic LDA model on the document-term matrix; the fixed random_state
# keeps the topic assignment reproducible between runs. fit() returns the
# estimator itself, so construction and fitting can be chained.
LDA = LatentDirichletAllocation(random_state=42, n_components=7).fit(dtm)
# Show the fitted estimator as the cell output.
LDA

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [8]:
# Inspect the fitted topic-term weights: one row per topic (7, from
# n_components=7); the next cell indexes the columns with `cv`'s feature names.
LDA.components_

array([[0.14355628, 0.14306587, 0.14317679, ..., 1.14266419, 0.14285733,
        0.14285717],
       [0.14313468, 2.9706906 , 0.14285716, ..., 0.14288192, 1.14017671,
        0.14285715],
       [4.48824867, 5.77251538, 0.1429464 , ..., 0.14285715, 0.14508248,
        0.14303139],
       ...,
       [5.62693544, 4.28122508, 0.14312471, ..., 0.14285716, 0.1432022 ,
        0.14285716],
       [6.03001485, 4.31102477, 4.63128529, ..., 0.14285716, 0.14296691,
        0.14285716],
       [0.14345111, 0.14332619, 0.14285717, ..., 0.14285715, 0.14285719,
        1.14268282]])

In [9]:
# Resolve the vocabulary once instead of rebuilding it for every printed word.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name on older versions.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 10 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last ten indices are the ten
    # highest-weighted terms of this topic, strongest last.
    print([feature_names[i] for i in topic.argsort()[-10:]])
    print('\n')

THE TOP 10 WORDS FOR TOPIC #0
['diet', 'flavor', 'lens', 'cats', 'eat', 'dogs', 'food', 'cat', 'taste', 'dog']


THE TOP 10 WORDS FOR TOPIC #1
['writing', 'characters', 'written', 'life', 'author', 'reading', 'story', 'books', 'read', 'book']


THE TOP 10 WORDS FOR TOPIC #2
['rock', 'heard', 'listen', 'band', 'sound', 'song', 'songs', 'music', 'album', 'cd']


THE TOP 10 WORDS FOR TOPIC #3
['came', 'looks', 'order', 'looking', 'disappointed', 'toy', 'box', 'purchase', 'plastic', 'water']


THE TOP 10 WORDS FOR TOPIC #4
['graphics', 'card', 'sound', 'software', 'computer', 'dvd', 'games', 'player', 'play', 'game']


THE TOP 10 WORDS FOR TOPIC #5
['printer', 'tv', 'working', 'power', 'sound', 'batteries', 'cable', 'camera', 'battery', 'phone']


THE TOP 10 WORDS FOR TOPIC #6
['plot', 'watching', 'funny', 'series', 'story', 'movies', 'watch', 'dvd', 'film', 'movie']




In [10]:
# Per-document topic weights from the fitted model, one row per review.
topic_results = LDA.transform(dtm)
# Assign each review the index of its highest-weighted topic.
dominant_topic = topic_results.argmax(axis=1)
npr['Topic'] = dominant_topic

In [11]:
# Hand-assigned, human-readable names for the seven topic indices.
my_dict = {
    0: 'Pets/ Food/ Skin Care',
    1: 'Books',
    2: 'Music',
    3: 'Toys/ Kid Products/ Plastic Items',
    4: 'PC, PS Games/ Software/ CD, DVD',
    5: 'Electronic Items',
    6: 'Movie/ TV Series/ Fiction Novels/ Comics',
}
# Translate each review's topic index into its label.
topic_column = npr['Topic']
npr['Label'] = topic_column.map(my_dict)
# Preview the first ten labelled reviews.
npr.head(10)

Unnamed: 0,Review,rev,Topic,Label
0,cd lovely pat voices generation listened cd mo...,Great CD: My lovely Pat has one of the GREAT v...,2,Music
1,game music soundtracks game play despite fact ...,One of the best game music soundtracks - for a...,2,Music
2,batteries died charger jul 2003 ok design conv...,Batteries died within a year ...: I bought thi...,5,Electronic Items
3,fine maha energy check maha energys website po...,"works fine, but Maha Energy is better: Check o...",5,Electronic Items
4,nonaudiophile reviewed bit combo players hesit...,Great for the non-audiophile: Reviewed quite a...,5,Electronic Items
5,dvd player crapped began having incorrect disc...,DVD Player crapped out after one year: I also ...,4,"PC, PS Games/ Software/ CD, DVD"
6,incorrect disc style couple dvd giving problem...,"Incorrect Disc: I love the style of this, but ...",4,"PC, PS Games/ Software/ CD, DVD"
7,dvd menu select problems scroll dvd menu verti...,DVD menu select problems: I cannot scroll thro...,4,"PC, PS Games/ Software/ CD, DVD"
8,unique weird orientalia 1930s exotic tales ori...,Unique Weird Orientalia from the 1930's: Exoti...,1,Books
9,ultimate guide firstlyi enjoyed format tone bo...,"Not an ""ultimate guide"": Firstly,I enjoyed the...",1,Books


In [12]:
# Number of non-null reviews per label, largest group first.
(
    npr.groupby('Label')[['Review']]
    .count()
    .sort_values(by=['Review'], ascending=False)
)

Unnamed: 0_level_0,Review
Label,Unnamed: 1_level_1
Books,121076
Toys/ Kid Products/ Plastic Items,103051
Movie/ TV Series/ Fiction Novels/ Comics,58030
Music,56677
Electronic Items,30758
"PC, PS Games/ Software/ CD, DVD",23013
Pets/ Food/ Skin Care,7393


In [13]:
# Export the original review text together with its assigned label.
New = npr.loc[:, ['rev', 'Label']]
New.to_csv('output.csv', index=False)