## Topic Modeling with LDA (count vectorizer) is explored here.  Although the topics make some sense, the top 10 words in each topic are less related to one another than the top words from NMF (TF-IDF vectorizer).  Thus, LDA topic modeling is not used for this project.

In [2]:
import pandas as pd
import pickle
from gensim import matutils, models
import scipy.sparse
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

  from collections import Callable
  from collections import Mapping, defaultdict
scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [3]:
# Load the pickled object stored in 'dtm_stop.pkl'.
# NOTE(review): presumably a document-term matrix built with stop words removed — confirm against the notebook that wrote it.
data = pd.read_pickle('dtm_stop.pkl')

In [4]:
# Swap rows and columns of the loaded matrix.
# NOTE(review): assumes `data` is documents x terms, so `tdm` is terms x documents — TODO confirm.
tdm = data.transpose()


In [5]:
# Store the transposed matrix in compressed sparse row format, then wrap it
# as a gensim corpus.
# NOTE(review): Sparse2Corpus reads documents from columns by default, which is
# presumably why the matrix was transposed first — confirm.
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [6]:
# Load the pickled vectorizer and invert its `vocabulary_` mapping so that the
# former values become keys; the result is passed to gensim as `id2word`.
# NOTE: pickle can execute arbitrary code on load — only unpickle files that
# were produced by this project.
with open("cv_stop.pkl", "rb") as cv_file:  # context manager closes the handle
    cv = pickle.load(cv_file)
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [6]:
# Fit an 8-topic LDA model on the corpus, making 10 passes over it.
# NOTE(review): no random_state is passed, so the topics may differ between
# runs — confirm whether reproducibility matters here.
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=8, passes=10)

[(0,
  '0.001*"tapioca im" + 0.001*"good" + 0.001*"line long" + 0.001*"ridiculous long" + 0.001*"great" + 0.001*"line" + 0.001*"one hour" + 0.000*"wouldnt" + 0.000*"full" + 0.000*"charge"'),
 (1,
  '0.001*"service" + 0.001*"first" + 0.001*"wait" + 0.001*"bobas" + 0.001*"table" + 0.001*"yall" + 0.001*"self" + 0.001*"self service" + 0.001*"ask" + 0.001*"know"'),
 (2,
  '0.001*"wish" + 0.000*"must try" + 0.000*"richmond" + 0.000*"personal" + 0.000*"personal favorite" + 0.000*"great" + 0.000*"sunset" + 0.000*"werent" + 0.000*"try like" + 0.000*"side"'),
 (3,
  '0.003*"cup" + 0.002*"tea" + 0.002*"good" + 0.002*"pretty" + 0.002*"huge" + 0.001*"milk" + 0.001*"boba" + 0.001*"wait" + 0.001*"dont" + 0.001*"also"'),
 (4,
  '0.001*"puff" + 0.001*"cheese puff" + 0.001*"room sit" + 0.000*"always ice" + 0.000*"pant" + 0.000*"way last" + 0.000*"sweetened version" + 0.000*"golden gate" + 0.000*"gate park" + 0.000*"gate"'),
 (5,
  '0.009*"wait" + 0.008*"boba" + 0.005*"line" + 0.005*"minute" + 0.004*"tea

In [7]:
# Tabulate the 10 highest-weighted terms of every topic, one column per topic.
word_dict = {}
# Take the topic count from the fitted model so it cannot drift from the
# num_topics value used when the model was built.
for topic_id in range(lda.num_topics):
    # show_topic yields (term, weight) pairs; only the term is kept.
    top_terms = lda.show_topic(topic_id, topn=10)
    word_dict['Topic # ' + '{:02d}'.format(topic_id + 1)] = [term for term, _weight in top_terms]
pd.DataFrame(word_dict)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08
0,tapioca im,service,wish,cup,puff,wait,tea,minute cup
1,good,first,must try,tea,cheese puff,boba,milk,nothing great
2,line long,wait,richmond,good,room sit,line,boba,bad ha
3,ridiculous long,bobas,personal,pretty,always ice,minute,milk tea,hardly
4,great,table,personal favorite,huge,pant,tea,like,added tapioca
5,line,yall,great,milk,way last,long,line,closer house
6,one hour,self,sunset,boba,sweetened version,waited,wait,like kind
7,wouldnt,self service,werent,wait,golden gate,good,good,walking distance
8,full,ask,try like,dont,gate park,min,cup,pretty expensive
9,charge,know,side,also,gate,worth,long,like syrup


## Topic modeling with nouns

In [11]:
import nltk

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


In [17]:
from nltk import word_tokenize, pos_tag

def nouns(text):
    """Return the nouns of *text* joined by single spaces.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``;
    tokens whose tag starts with ``NN`` are kept in their original order.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = []
    for token, tag in tagged_tokens:
        if tag.startswith('NN'):
            kept.append(token)
    return ' '.join(kept)

In [18]:
# Load the pickled data from 'data_clean.pkl'; later cells read its `lem` column.
data_clean = pd.read_pickle('data_clean.pkl')

In [19]:
# Run nouns() over every entry of the `lem` column and wrap the result in a DataFrame.
data_nouns = pd.DataFrame(data_clean.lem.apply(nouns))

In [20]:
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# Create a new document-term matrix using only nouns

# Re-add the additional stop words since we are recreating the document-term matrix
# Start from NLTK's English stop-word list.
stopwords = nltk.corpus.stopwords.words('english')
# Extra terms to exclude on top of the NLTK list.
# NOTE(review): 'purple kow' contains a space; with CountVectorizer's default
# single-word tokenization it presumably never matches a token — confirm.
newStopWords = ['purple','kow','drink','people','purple kow','place','wa','half','socal',
                'norcal','something','menu','grass','time','make','even','another','order',
               'sf','friend','back','get','got','came','come','went','go']
# Extend the list in place; later cells pass `stopwords` to CountVectorizer.
stopwords.extend(newStopWords)



In [None]:
# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stopwords)
data_cvn = cvn.fit_transform(data_nouns.lem)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(cvn, 'get_feature_names_out'):
    noun_terms = cvn.get_feature_names_out()
else:
    noun_terms = cvn.get_feature_names()
# Dense frame with one column per vocabulary term.
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=noun_terms)

In [22]:
# Build the gensim corpus from the transposed noun matrix, stored sparsely.
noun_term_doc = scipy.sparse.csr_matrix(data_dtmn.transpose())
corpusn = matutils.Sparse2Corpus(noun_term_doc)

# Invert the vectorizer vocabulary so each index maps back to its term.
id2wordn = {index: term for term, index in cvn.vocabulary_.items()}

In [23]:
# Fit an 8-topic LDA model on the noun-only corpus, making 10 passes over it
ldan = models.LdaModel(corpus=corpusn, num_topics=8, id2word=id2wordn, passes=10)

In [45]:
# Tabulate the 10 highest-weighted terms of every noun-only topic, one column per topic.
word_dict = {}
# Take the topic count from the fitted model so it cannot drift from the
# num_topics value used when the model was built.
for topic_id in range(ldan.num_topics):
    # Each entry returned by show_topic is indexable; element 0 is the term.
    top_terms = ldan.show_topic(topic_id, topn=10)
    word_dict['Topic # ' + '{:02d}'.format(topic_id + 1)] = [entry[0] for entry in top_terms]
pd.DataFrame(word_dict)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08
0,coconut,food,chicken,line,line,boba,cup,tea
1,tea,chicken,salt,tea,minute,line,tea,milk
2,dessert,customer,pepper,minute,tapioca,wait,milk,boba
3,food,yum,nugget,wait,service,tea,boba,line
4,crema,business,snack,service,hour,milk,car,wait
5,school,outer,stick,customer,wait,cup,holder,taste
6,tapioca,service,review,milk,fruit,minute,line,ice
7,quality,milktea,hr,min,cup,hour,size,flavor
8,ice,fry,ice,cashier,door,pudding,minute,pudding
9,boba,almond,piece,boba,way,door,wait,tapioca


## Topic modeling with nouns and adjectives

In [24]:
def nouns_adj(text):
    """Return the nouns and adjectives of *text* joined by single spaces.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``;
    tokens whose tag starts with ``NN`` or ``JJ`` are kept in their original
    order.
    """
    wanted_prefixes = ('NN', 'JJ')
    tagged_tokens = pos_tag(word_tokenize(text))
    kept = []
    for token, tag in tagged_tokens:
        if tag[:2] in wanted_prefixes:
            kept.append(token)
    return ' '.join(kept)

In [25]:
# Run nouns_adj() over every entry of the `lem` column and wrap the result in a DataFrame.
data_nouns_adj = pd.DataFrame(data_clean.lem.apply(nouns_adj))

In [26]:
# Build a document-term matrix from the noun/adjective text, passing min_df=3
# and the stop-word list to the vectorizer.
cvna = CountVectorizer(stop_words=stopwords, min_df=3)
data_cvna = cvna.fit_transform(data_nouns_adj.lem)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on releases that predate get_feature_names_out().
if hasattr(cvna, 'get_feature_names_out'):
    noun_adj_terms = cvna.get_feature_names_out()
else:
    noun_adj_terms = cvna.get_feature_names()
# Dense frame with one column per vocabulary term.
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=noun_adj_terms)

In [27]:
# Build the gensim corpus from the transposed noun/adjective matrix, stored sparsely.
noun_adj_term_doc = scipy.sparse.csr_matrix(data_dtmna.transpose())
corpusna = matutils.Sparse2Corpus(noun_adj_term_doc)

# Invert the vectorizer vocabulary so each index maps back to its term.
id2wordna = {index: term for term, index in cvna.vocabulary_.items()}

In [28]:
# Fit a 7-topic LDA model on the noun/adjective corpus, making 300 passes over it.
# NOTE(review): no random_state is passed, so the topics may differ between
# runs — confirm whether reproducibility matters here.
ldana = models.LdaModel(corpus=corpusna, num_topics=7, id2word=id2wordna, passes=300)

In [30]:
# Tabulate the 7 highest-weighted terms of every noun/adjective topic, one column per topic.
word_dict = {}
# Take the topic count from the fitted model so it cannot drift from the
# num_topics value used when the model was built.
for topic_id in range(ldana.num_topics):
    # Each entry returned by show_topic is indexable; element 0 is the term.
    top_terms = ldana.show_topic(topic_id, topn=7)
    word_dict['Topic # ' + '{:02d}'.format(topic_id + 1)] = [entry[0] for entry in top_terms]
pd.DataFrame(word_dict)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07
0,boba,cup,line,tea,customer,bubble,tea
1,milk,boba,wait,milk,service,bad,milk
2,sweet,milk,minute,boba,minute,card,green
3,pudding,tea,long,chicken,tapioca,crap,fresh
4,tea,good,tea,good,tea,credit,boba
5,line,size,hour,ive,cashier,isnt,sweet
6,good,huge,good,great,milk,hey,matcha
