In [1]:
# UserReviewsEDA.ipynb
# Exploratory Data Analysis on the reviews written by users in our dataset
# Written for NYCDSA Bootcamp 7, Capstone Project by LC 2016-12-14

# note without POS Tagging, the Lemmatizer's functionality is limited. All words are assumed to be nouns.
import cPickle as pickle
import gensim
import pandas as pd
from nltk.stem import WordNetLemmatizer



In [9]:
# nested list, each list corresponds to all reviews of a single beer
with open('words_sc_proc.p', 'rb') as f:
    nested_words = pickle.load(f)


In [10]:
# need to download nltk WordNet pattern library on local machine first
WordNetLemmatizer().lemmatize('cars')

u'car'

In [11]:
# matches beer name with index
with open('beer_dict.p', 'rb') as f:
    beer_names = pickle.load(f)

In [12]:
# cross check number of reviews and number of beers line up
len(beer_names.keys())

1269

In [13]:
len(nested_words)

1269

In [14]:
# flatten our list
flattened_words = [item for sublist in nested_words for item in sublist]
print len(flattened_words)

11370657


In [15]:
freq_count = {}

for word in flattened_words:
    if word in freq_count:
        freq_count[word] +=1
    else:
        freq_count[word] = 1

In [16]:
word_count = sorted(freq_count.items(), key = lambda x: x[1], reverse = True )
print len(word_count)

127654


In [17]:
# for output purposes 
a = pd.DataFrame(word_count[:20], columns = ['word', 'count'], index= None)
b = pd.DataFrame(word_count[20:40], columns = ['word', 'count'], index= None)
c = pd.DataFrame(word_count[40:60], columns = ['word', 'count'], index= None)
d = pd.DataFrame(word_count[60:80], columns = ['word', 'count'], index= None)
e = pd.DataFrame(word_count[80:100], columns = ['word', 'count'], index= None)
pd.concat([a,pd.DataFrame(range(20,40),columns = ['']),b, pd.DataFrame(range(40,60),columns = ['']), c, \
          pd.DataFrame(range(60,80),columns = ['']),d,pd.DataFrame(range(80,100),columns = ['']), e ],axis=1 )

Unnamed: 0,word,count,Unnamed: 3,word.1,count.1,Unnamed: 6,word.2,count.2,Unnamed: 9,word.3,count.3,Unnamed: 12,word.4,count.4
0,head,202336,20,bitter,70922,40,thick,44537,60,sweetness,32477,80,malty,24569
1,aroma,181938,21,fruit,69217,41,tan,43302,61,strong,32197,81,fruity,24222
2,chocolate,157036,22,note,68554,42,full,42951,62,palate,31967,82,hazy,24017
3,flavor,151063,23,roasted,67037,43,orange,42928,63,small,31822,83,38,23809
4,dark,140109,24,vanilla,63879,44,little,41814,64,thanks,31686,84,sour,23459
5,beer,137862,25,body,63641,45,bitterness,41209,65,much,30509,85,glass,23335
6,malt,133818,26,medium,63570,46,really,40971,66,grapefruit,29271,86,poured,22830
7,hop,120424,27,caramel,61161,47,big,40794,67,41,29075,87,cherry,22524
8,taste,118579,28,like,59020,48,4,39795,68,smell,27644,88,ipa,22334
9,sweet,117786,29,well,54050,49,lot,39757,69,hoppy,27111,89,almost,22011


In [18]:
with open('corpus_tfidf.p', 'rb') as f:
    corpus_tfidf = pickle.load(f)

with open('textDict.p', 'rb') as f:
    text_dict = pickle.load(f)

In [19]:
flattened_tfidf = [score for tfidf in corpus_tfidf for score in tfidf]
flattened_tfidf[:10]

[(0, 0.0038368293758318872),
 (1, 0.008472975529699832),
 (2, 0.012205423044964435),
 (3, 0.00898807900647986),
 (4, 0.023992771792722207),
 (5, 0.04729075105207276),
 (6, 0.015047503393871233),
 (7, 0.03930914720646423),
 (8, 0.021648049512845862),
 (9, 0.03887724859033931)]

In [20]:
top_tfidf = sorted(flattened_tfidf, key = lambda x: x[1], reverse = True)
top_tfidf[:10]

[(7204, 0.9536042340669351),
 (8356, 0.9335206010328101),
 (7204, 0.9306965020138535),
 (3829, 0.9249370009926164),
 (15445, 0.9208818376003169),
 (15445, 0.9172853867272102),
 (231, 0.9147621246391323),
 (2692, 0.8955494165028615),
 (3128, 0.8926721150416248),
 (7204, 0.8796357444618146)]

In [21]:
text_dict.items()[:10]

[(10002, u'raining'),
 (6449, u'nuttyness'),
 (15704, u'overaged'),
 (543, u'yellow'),
 (8453, u'huileux'),
 (2577, u'maltage'),
 (6079, u'ratman'),
 (3169, u'hanging'),
 (546, u'woody'),
 (6176, u'genre')]

In [123]:
len(top_tfidf)

1454568

In [126]:
len(set([text_dict[key] for key,value in top_tfidf[:1000]]))

243

In [23]:
# some tests
print top_tfidf[:2][0][0]
print text_dict[7204]

7204
pumpkin


In [None]:
# magic number 381 is from manually findin cutoff for 100 unique values in top_100
top_100_tfidf = list(set([text_dict[key] for key,value in top_tfidf[:381]]))


In [152]:
# for output purposes
a = pd.DataFrame(top_100_tfidf[:10], columns = ['word'], index= None)
for i in range(10,91,10):
    a = pd.concat([a, pd.DataFrame(top_100_tfidf[i:i+10], columns = ['word'], index= None)], axis = 1)
    

a

Unnamed: 0,word,word.1,word.2,word.3,word.4,word.5,word.6,word.7,word.8,word.9
0,bluejacket,terminal,hemp,hibiscus,raspberry,abita,aquavit,fermentoren,neapolitan,cranberry
1,founder,cognac,boysenberry,jalapeno,maltcaramel,himmeriget,blueberry,zakoon,reno,brett
2,tequila,tg,coriander,strawberry,dubbel,firehouse,belgian,chardonnay,triple,mead
3,queen,bourbon,aluminum,tiramisu,oktoberfest,apple,chocolate,apricot,spruce,3708
4,sage,rice,betty,esb,plum,gose,pecan,whiskey,mint,montana
5,dorothy,meridian,kauai,skyview,raisin,maple,ginger,sour,peach,oyster
6,doughnut,pumpkin,bozeman,porter,brandy,blackberry,honolulu,cab,rye,potato
7,nectarine,coffee,crawler,whaleman,cinnamon,envie,hazelnut,cbc,chicory,nelson
8,sockeye,cedar,currant,rum,cherry,mosaic,quad,coconut,abraxas,smoked
9,peanut,knot,port,corn,gingerbread,gin,chili,gesho,papaya,saison


In [154]:
# run LDA. warning: computationally expensive!
ldamodel = gensim.models.ldamodel.LdaModel(corpus_tfidf, num_topics=5, id2word = text_dict, passes=40)

In [159]:
for i,result in ldamodel.print_topics(5):
    print result

0.012*"chocolate" + 0.010*"coffee" + 0.009*"bourbon" + 0.006*"black" + 0.006*"roasted" + 0.004*"roast" + 0.004*"brown" + 0.004*"vanilla" + 0.003*"dark" + 0.003*"stout"
0.000*"chocolate" + 0.000*"coffee" + 0.000*"black" + 0.000*"roasted" + 0.000*"bourbon" + 0.000*"roast" + 0.000*"brown" + 0.000*"pumpkin" + 0.000*"vanilla" + 0.000*"stout"
0.000*"coffee" + 0.000*"chocolate" + 0.000*"bourbon" + 0.000*"grapefruit" + 0.000*"pine" + 0.000*"citrus" + 0.000*"tropical" + 0.000*"orange" + 0.000*"mead" + 0.000*"black"
0.006*"pine" + 0.006*"citrus" + 0.006*"grapefruit" + 0.005*"orange" + 0.004*"ipa" + 0.004*"tropical" + 0.004*"hop" + 0.004*"golden" + 0.003*"funk" + 0.003*"mango"
0.000*"raspberry" + 0.000*"bourbon" + 0.000*"citrus" + 0.000*"chocolate" + 0.000*"tropical" + 0.000*"pine" + 0.000*"golden" + 0.000*"grapefruit" + 0.000*"funk" + 0.000*"coffee"


In [157]:
with open('lda_5top_40pass.p', 'wb') as f:
    pickle.dump(ldamodel, f)