In [8]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

In [3]:
data = pd.read_pickle('all_data.pkl')
data.shape

(95003, 14)

In [4]:
add_stop_words=['app','use','good','just','google','like','work',
                'time','need','dont','make','phone','update',
                'great','im','feature','option']
stop_words1=text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [5]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

# Topic Modeling - Attempt #1 (All Text)

In [6]:
cv1=CountVectorizer(stop_words=stop_words1, token_pattern="\\b[a-z][a-z]+\\b",
                    min_df=3,max_df=0.85, max_features=10000, ngram_range=(1,2))#, preprocessor=' '.join)
df_cv1 = cv1.fit_transform(data.content)
df_review1=pd.DataFrame(df_cv1.toarray(), index= [data.title], columns=cv1.get_feature_names())
df_review1.shape

(95003, 10000)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv2 = TfidfVectorizer(stop_words=stop_words1,token_pattern="\\b[a-z][a-z]+\\b", 
                      min_df=5,max_df=0.90, #max_features=20000, 
                      ngram_range=(1,2))#,preprocessor=' '.join)
df_cv2 = cv2.fit_transform(data.content)
df_review2=pd.DataFrame(df_cv2.toarray(), index= [data.title], columns=cv2.get_feature_names())
df_review2.shape

(95003, 32153)

### CountVectorizer

In [11]:
num_topic=10
lsa = TruncatedSVD(num_topic)
doc_topic = lsa.fit_transform(df_cv1)
U=lsa.explained_variance_ratio_

In [12]:
index=[str('component {}'.format(i)) for i in range(1,num_topic+1)]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = index,
             columns = cv1.get_feature_names())
#topic_word
display_topics(lsa, cv1.get_feature_names(), 10)


Topic  0
music, play, fix, try, doesnt, want, problem, new, add, say

Topic  1
music, play, youtube, play music, youtube music, song, listen, playlist, download, listen music

Topic  2
add, photo, want, really, love, better, way, thing, nice, change

Topic  3
device, android, apps, screen, tv, home, account, set, doesnt, turn

Topic  4
photo, device, apps, try, play, file, android, download, account, delete

Topic  5
video, screen, fix, issue, device, photo, android, play, watch, tv

Topic  6
add, play, doesnt, new, search, contact, fix, open, device, game

Topic  7
video, add, download, play, account, device, watch, try, game, kid

Topic  8
device, music, new, fix, youtube, problem, account, android, youtube music, add

Topic  9
fix, really, play, change, love, problem, apps, game, issue, device


### TfidfVectorizer

In [16]:
num_topic=10
lsa = TruncatedSVD(num_topic)
doc_topic = lsa.fit_transform(df_cv2)
U=lsa.explained_variance_ratio_

In [17]:
index=[str('component {}'.format(i)) for i in range(1,num_topic+1)]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = index,
             columns = cv2.get_feature_names())
#topic_word
display_topics(lsa, cv2.get_feature_names(), 10)


Topic  0
nice, nice application, nice apps, application, apps, easy, add, really, love, problem

Topic  1
ok, ok ok, doesnt, say, problem, bad, love, fix, better, apps

Topic  2
love, best, bad, fix, doesnt, really, play, music, apps, problem

Topic  3
best, best best, best apps, best camera, best application, best edit, best thing, best music, best way, best thank

Topic  4
bad, doesnt, problem, fix, try, want, play, better, apps, open

Topic  5
love, bad, best, ok, nice, love love, bad experience, love thank, really love, love easy

Topic  6
excellent, super, apps, thank, application, easy, really, excellent service, add, excellent application

Topic  7
super, cool, easy, useful, super easy, amaze, slow, thank, super cool, awesome

Topic  8
awesome, useful, thank, helpful, really, easy, add, amaze, apps, better

Topic  9
useful, helpful, apps, thank, application, really, add, nice useful, cool, really useful


### Topic Modeling- Attempt # 2 (Nouns Only)

In [18]:
data_nouns = pd.read_pickle('data_nouns.pkl')
#data_nouns

In [19]:
stop_words1=text.ENGLISH_STOP_WORDS.union(add_stop_words)
cv1_2=CountVectorizer(stop_words=stop_words1, token_pattern="\\b[a-z][a-z]+\\b",
                    min_df=3,max_df=0.85, #max_features=10000, 
                      ngram_range=(1,2))
df_cv1_2 = cv1_2.fit_transform(data_nouns.content)
df_review1_2=pd.DataFrame(df_cv1_2.toarray(), index= [data.title], columns=cv1_2.get_feature_names())

In [20]:
cv2_2=CountVectorizer(stop_words=stop_words1, token_pattern="\\b[a-z][a-z]+\\b",
                    min_df=3,max_df=0.85, #max_features=10000, 
                      ngram_range=(1,2))
df_cv2_2 = cv2_2.fit_transform(data_nouns.content)
df_review2_2=pd.DataFrame(df_cv2_2.toarray(), index= [data.title], columns=cv2_2.get_feature_names())

In [22]:
num_topic=10
lsa = TruncatedSVD(num_topic)
doc_topic = lsa.fit_transform(df_cv1_2)
U=lsa.explained_variance_ratio_

index=[str('component {}'.format(i)) for i in range(1,num_topic+1)]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = index,
             columns = cv1_2.get_feature_names())
#topic_word
display_topics(lsa, cv1_2.get_feature_names(), 10)


Topic  0
music, play, device, problem, play music, youtube, issue, way, video, thing

Topic  1
music, play music, play, music music, song, youtube music, playlist, youtube, music play, music youtube

Topic  2
photo, photo photo, picture, camera, video, music, image, edit, file, folder

Topic  3
device, photo, account, home, device device, apps, access, location, way, photo photo

Topic  4
problem, device, file, photo, account, music, problem problem, device device, download, storage

Topic  5
video, device, youtube, kid, ad, problem, video video, playlist, quality, tv

Topic  6
screen, home, problem, tv, home screen, button, device, photo, ad, lock

Topic  7
issue, video, message, device, camera, notification, pixel, watch, music, fix

Topic  8
file, map, issue, version, apps, folder, drive, data, share, location

Topic  9
play, apps, game, store, play store, service, play service, podcast, tv, day


In [23]:
num_topic=10
lsa = TruncatedSVD(num_topic)
doc_topic = lsa.fit_transform(df_cv2_2)
U=lsa.explained_variance_ratio_

index=[str('component {}'.format(i)) for i in range(1,num_topic+1)]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = index,
             columns = cv2_2.get_feature_names())
#topic_word
display_topics(lsa, cv2_2.get_feature_names(), 10)


Topic  0
music, play, device, problem, play music, youtube, issue, way, video, thing

Topic  1
music, play music, play, music music, song, youtube music, playlist, youtube, music play, music youtube

Topic  2
photo, photo photo, picture, camera, video, music, image, edit, file, folder

Topic  3
device, photo, account, home, device device, access, apps, location, way, photo photo

Topic  4
problem, device, file, photo, music, account, problem problem, device device, data, download

Topic  5
video, device, youtube, problem, kid, ad, tv, playlist, video video, account

Topic  6
screen, home, problem, tv, home screen, button, device, photo, lock, alarm

Topic  7
issue, message, video, device, camera, watch, file, pixel, notification, music

Topic  8
play, game, apps, store, service, message, tv, play store, account, photo

Topic  9
file, apps, play, game, map, tv, folder, version, issue, drive


# Topic Modeling - Attempt #3 (Nouns and Adjectives and Verb)

In [24]:
data_nouns_adj = pd.read_pickle('data_nouns_adj_v.pkl')

In [25]:
stop_words1=text.ENGLISH_STOP_WORDS.union(add_stop_words)
cv1_3=CountVectorizer(stop_words=stop_words1, token_pattern="\\b[a-z][a-z]+\\b",
                    min_df=5,max_df=0.90, #max_features=10000, 
                      ngram_range=(1,2))
df_cv1_3 = cv1_3.fit_transform(data_nouns_adj.content)
df_review1_3=pd.DataFrame(df_cv1_3.toarray(), index= [data.title], columns=cv1_3.get_feature_names())

In [26]:
cv2_3=TfidfVectorizer(stop_words=stop_words1, token_pattern="\\b[a-z][a-z]+\\b",
                    min_df=3,max_df=0.90, #max_features=10000, 
                      ngram_range=(1,2))
df_cv2_3 = cv2_3.fit_transform(data_nouns_adj.content)
df_review2_3=pd.DataFrame(df_cv2_3.toarray(), index= [data.title], columns=cv2_3.get_feature_names())

In [28]:
num_topic=10
lsa = TruncatedSVD(num_topic)
doc_topic = lsa.fit_transform(df_cv1_3)
U=lsa.explained_variance_ratio_

index=[str('component {}'.format(i)) for i in range(1,num_topic+1)]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = index,
             columns = cv1_3.get_feature_names())
#topic_word
display_topics(lsa, cv1_3.get_feature_names(), 10)


Topic  0
music, play, try, fix, want, doesnt, problem, new, say, device

Topic  1
music, play, youtube, play music, youtube music, song, listen, playlist, download, listen music

Topic  2
add, photo, want, love, way, change, video, thing, nice, new

Topic  3
device, screen, android, apps, home, tv, set, account, add, turn

Topic  4
device, photo, play, apps, account, try, download, android, file, data

Topic  5
video, screen, play, fix, android, photo, watch, tv, problem, ad

Topic  6
doesnt, screen, music, want, change, open, photo, turn, map, button

Topic  7
play, add, open, doesnt, android, screen, photo, map, fix, new

Topic  8
device, music, fix, issue, message, problem, add, screen, youtube, photo

Topic  9
try, add, screen, doesnt, video, watch, connect, open, file, home


In [29]:
num_topic=10
lsa = TruncatedSVD(num_topic)
doc_topic = lsa.fit_transform(df_cv2_3)
U=lsa.explained_variance_ratio_

index=[str('component {}'.format(i)) for i in range(1,num_topic+1)]
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = index,
             columns = cv2_3.get_feature_names())
#topic_word
display_topics(lsa, cv2_3.get_feature_names(), 10)


Topic  0
nice, nice application, nice apps, application, apps, easy, add, nice aap, aap, nice easy

Topic  1
ok, ok ok, say, doesnt, problem, thank, love, fix, ok guess, video

Topic  2
love, best, bad, music, play, doesnt, fix, problem, apps, try

Topic  3
best, best apps, best best, best camera, best application, best edit, best thing, best music, best thank, best way

Topic  4
bad, experience, bad experience, doesnt, problem, try, fix, want, open, play

Topic  5
love, bad, best, ok, nice, love love, love thank, love easy, bad experience, best best

Topic  6
excellent, super, useful, thank, excellent service, application, excellent application, quality, excellent apps, service

Topic  7
super, easy, awesome, useful, cool, super easy, slow, helpful, super cool, super annoy

Topic  8
awesome, useful, thank, know, experience, amaze, cool, nice useful, language, awesome thank

Topic  9
useful, easy, apps, know, application, doesnt, nice useful, simple, device, useful tool
