#  Topic Modelling

- It is a type of statistical model for discovering the abstract topics that can occur in a collection of documents
- It helps to discover the hidden topics
- Examples:
    - amazon has great customer support. very quick response = customer support
- Terms define topics:
    - Customer Support (support,service, call center, response, polite)
    - Delivery (speed, packing, damaged, protection)
    - Deals (offers, sales, promotions, coupons, offer)
- Poplar methods
    - Latent Sematic Analysis(LSA)
    - Latent Dirichlet Allocation(LDA)
- Applications:
    - Clustering
    - Reduce Dimmensions
    - Multi-Tagging

In [1]:
!pip install gensim



You are using pip version 18.1, however version 19.0.3 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
import gensim

In [6]:
import pandas as pd
import numpy as np
import nltk

In [7]:
docs_clean = []
amazon = pd.read_csv('amazon_reviews_big.csv')
amazon.head()

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B000HDJXNA,1,What I recieved is not what is pictured here O...,"12 26, 2012",A29YXBFTD7QUP3,HHA,Buyer be ware,1356480000.0
1,B006KKS7XQ,5,Excellent unit and a pretty simple install usi...,"09 20, 2013",A3IMTXFYD7CGDN,"Peter W. George ""soyflakeman""",high quality without high price,1379635000.0
2,B002NP8XJ0,5,"I'm enjoying this keyboard, I'm getting anothe...","08 31, 2010",AXNOW20FQKHVW,B. Hayashi,Superb keyboard + solution for slow wake up an...,1283213000.0
3,B000EITTLE,4,"Overall, this is a fantastic camera that I'm e...","02 3, 2008",A10KCAK279LO0W,"mmcwatters ""macdadi80""",One qualm: not great in low light,1201997000.0
4,B006CRXK4S,5,These work very well with mySamsung PN64D7000 ...,"01 28, 2012",A19XXLMZXR764J,S. Garfinkle,"Work great, fit well",1327709000.0


In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer



In [9]:
docs = amazon['reviewText'].fillna('').str.lower().str.replace('[^a-z ]', '')
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['','use', 'good', 'like', 'great'])
stemmer = nltk.stem.PorterStemmer()
for doc in docs:
    words=doc.split(' ')
    words_clean = [stemmer.stem(word) for word in words if word not in stopwords]
    docs_clean.append(words_clean)
len(docs_clean)

100000

In [10]:
dictionary = gensim.corpora.Dictionary(docs_clean)
dictionary

<gensim.corpora.dictionary.Dictionary at 0x19aebb59160>

In [11]:
docs_bow = []
for doc in docs_clean:
    doc_bow = dictionary.doc2bow(doc)
    docs_bow.append(doc_bow)

In [12]:
dictionary.doc2bow(docs_clean[0])

[(0, 1),
 (1, 2),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 5),
 (8, 1),
 (9, 1),
 (10, 3),
 (11, 2),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 2),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 2),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 2),
 (35, 2),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1)]

In [13]:
lda_model = gensim.models.LdaModel(docs_bow,
                                  id2word = dictionary,
                                  num_topics = 4)

In [14]:
lda_model.get_document_topics(docs_bow[0])

[(0, 0.7245697), (3, 0.26686355)]

In [15]:
lda_model.get_document_topics(docs_bow[1])

[(0, 0.17184341), (1, 0.27048036), (3, 0.553214)]

In [16]:
lda_model.print_topics()

[(0,
  '0.016*"work" + 0.011*"cabl" + 0.010*"drive" + 0.010*"one" + 0.008*"tv" + 0.007*"usb" + 0.007*"connect" + 0.007*"use" + 0.007*"get" + 0.007*"devic"'),
 (1,
  '0.024*"sound" + 0.013*"speaker" + 0.009*"qualiti" + 0.009*"headphon" + 0.008*"mous" + 0.007*"music" + 0.007*"get" + 0.006*"use" + 0.006*"one" + 0.006*"ear"'),
 (2,
  '0.029*"camera" + 0.016*"card" + 0.013*"len" + 0.008*"pictur" + 0.008*"use" + 0.007*"video" + 0.007*"get" + 0.006*"take" + 0.006*"qualiti" + 0.006*"one"'),
 (3,
  '0.015*"case" + 0.012*"one" + 0.010*"work" + 0.010*"batteri" + 0.008*"well" + 0.008*"fit" + 0.007*"would" + 0.007*"use" + 0.007*"charg" + 0.007*"screen"')]

In [17]:
doc2topic_prob = lda_model.get_document_topics(docs_bow[0])

doc2topic_prob = pd.DataFrame(doc2topic_prob, columns=['topic','prob'])

doc2topic_prob.sort_values('prob', ascending = False).iloc[0]['topic']

0.0

In [18]:
topics = []
for doc_bow in docs_bow:
    doc2topic_prob = lda_model.get_document_topics(doc_bow)
    doc2topic_prob = pd.DataFrame(doc2topic_prob, columns=['topic','prob'])
    topic = doc2topic_prob.sort_values('prob', ascending = False).iloc[0]['topic']
    topics.append(topic)

In [19]:
amazon['topics'] = topics
amazon['topics'].value_counts()

0.0    36226
3.0    35208
1.0    15546
2.0    13020
Name: topics, dtype: int64