In [99]:
## Read in Review Files and create a pandas dataframe (code borrowed from Julian McAuley website) ##
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

##Make all text lowercase and remove special characters for the review text
food_review = getDF('data/reviews_Grocery_and_Gourmet_Food_5.json.gz')
food_review['reviewText'] = food_review['reviewText'].str.lower()
food_review['reviewText'] = food_review['reviewText'].str.replace("'", "")
food_review['reviewText'] = food_review['reviewText'].str.replace('[^a-zA-Z\s]',' ')

In [100]:
##Test file:  Exclude five star reviews from 5 Core
food_review_nofive = food_review[food_review['overall'] < 5.0]

In [101]:
##Tokenize review text and stem each individual word for each review
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

tokens = [word_tokenize(review) for review in food_review_nofive['reviewText']]

In [102]:
##Remove stopwords and stem
stopwords = stopwords.words('english')
stemmed_token = np.empty((len(tokens),0)).tolist()
for review in tokens:
    n = tokens.index(review)
    stemmed_token[n] = [st.stem(word) for word in review if word not in stopwords]

In [103]:
##Manipulate stemmed text to be string instead of list (needed for count vectorizer)
final_review_text = []
for review in stemmed_token:
    for word in review:
        n = review.index(word)
        if n == 0:
            string = review[n]
        else:
            string = string + " " + review[n]
    final_review_text.append(string)

In [104]:
##Count Vectorizer Matrix
import numpy as np
import scipy
from scipy.sparse import coo_matrix, vstack
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=False, ngram_range=(1, 1)) ##Removed stopwords before stemming so don't apply here
food_review_text = vectorizer.fit_transform(final_review_text)

##Remove if word is in less than 10 reviews
counts = scipy.sparse.coo_matrix.sum(food_review_text, axis=0)
food_review_text = np.transpose(vstack([food_review_text,counts]))
food_review_text = pd.DataFrame(food_review_text.todense(), index = vectorizer.get_feature_names())
last_col = food_review_text.shape[1] - 1
food_review_text = food_review_text[food_review_text[last_col] > 9]
del food_review_text[last_col]

In [105]:
##TFIDF Weighting
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
weighted_food_review_text = transformer.fit_transform(food_review_text)
tfidf_matrix = weighted_food_review_text.transpose()
tfidf_matrix

<63808x6248 sparse matrix of type '<class 'numpy.float64'>'
	with 2255039 stored elements in Compressed Sparse Column format>

In [106]:
##Non-negative matrix factorization
n_topics = 15

from sklearn.decomposition import NMF
model = NMF(init="nndsvd", n_components=n_topics, random_state=1)
W_matrix = model.fit_transform(tfidf_matrix)
H_matrix = model.components_

In [107]:
feature_names = food_review_text.index
for topic_index in range( H_matrix.shape[0] ):
    top_indices = np.argsort( H_matrix[topic_index,:] )[::-1][0:10]  ##show top 10 words associated with each topic
    term_ranking = [feature_names[i] for i in top_indices]
    print ("Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) ))

Topic 0: eat, snack, chocol, lik, pack, good, box, bar, would, tast
Topic 1: chia, nutry, gr, worthless, cashewc, hemp, insulin, flax, regardless, phytosterol
Topic 2: roastgreen, diamondgreen, magicoth, revvgreen, veronastarbuck, mahogony, reservegreen, roaststarbuck, lodg, kop
Topic 3: mickleg, boson, sir, majesty, mangosteen, langu, fij, legend, journey, you
Topic 4: hydrochlorid, niacinamid, riboflavin, palmit, thiamin, biotin, pyridoxin, fol, sulf, acet
Topic 5: wedderspoon, scab, unt, eu, unfilt, pol, honey, rejuv, stamp, ass
Topic 6: compass, gratitud, uplift, soul, grac, wisdom, anch, prevail, enlight, attitud
Topic 7: enzymolys, cfr, oleoresin, der, unexplain, constitu, thereof, poultry, distil, bark
Topic 8: preterm, newborn, inf, vuln, cocain, mo, docosahexaeno, cholin, polysaccharid, synthes
Topic 9: umam, glutam, arroz, inosin, msg, autolys, silicon, mi, brothy, overus
Topic 10: gprotein, mgtotal, gsug, gcholesterol, gsat, mgsodium, gtrans, gdiet, gmonouns, mgpotassium
Top

In [98]:
##For testing - this review had strange text after removing special characters
#food_review[food_review.reviewText.str.contains('veronastarbuck')]
food_review[45163:45164]

Unnamed: 0,reviewerID,reviewText,reviewTime,asin,helpful,summary,reviewerName,overall,unixReviewTime
45163,A26NFIQ7KWI8Y7,"I'm not a fan of decaf. This one is drinkable with just a tinge of ""conference coffee"" taste. I also like Green Mountain Dark Magic Decaf.For reference purposes my in store drink is a Starbucks Americano.My favorite k-cups are:Starbucks French RoastStarbucks Caffe VeronaStarbucks Pike Place RoastGreen Mountain Xtra Bold Sumatran ReserveGreen Mountain Double Black DiamondGreen Mountain RevvGreen Mountain Dark MagicOther k-cups I've tried: Coffee People Jet Fuel ,Green Mountain Dark Magic Decaf, Starbucks Caffe Verona, Coffee People Black Tiger, Starbucks House Blend, Starbucks Breakfast Blend, Starbucks Sumatra, Wolfgang Puck French Roast, Green Mountain Lake and Lodge, Green Mountain French Roast, Caribou Mahogony, Wolfgang Puck Sumatra Kopi Raya, Emeril Big Easy Bold","02 20, 2012",B001D0GVAO,"[0, 0]",drinkable for decaf,kt rose,3.0,1329696000
