In [1]:
## Read in Review Files and create a pandas dataframe (code borrowed from Julian McAuley website) ##
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

food_review = getDF('../data/reviews_Grocery_and_Gourmet_Food_5.json.gz')

In [2]:
##Remove special characters for the review text
food_review['reviewText'] = food_review['reviewText'].str.replace("'", "")
food_review['reviewText'] = food_review['reviewText'].str.replace('[^a-zA-Z\s]',' ')

In [3]:
##Test file:  Exclude five star reviews from 5 Core
food_review_nofive = food_review[food_review['overall'] < 5.0]

In [4]:
##Tokenize review text for each review
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()

tokens_I = [word_tokenize(review) for review in food_review_nofive['reviewText']]

In [5]:
##Separate strings with multiple uppercase characters (e.g., gCholesterol, VeronaStarbucks).  This should (hopefully)
## take care of situations where the reviews included returns that were not treated as spaces in the raw text file
import re
def split_uppercase(value):
    return re.sub(r'([A-Z])', r' \1', value)

tokens_II = np.empty((len(tokens_I),0)).tolist()
for review in tokens_I:
    n = tokens_I.index(review)
    tokens_II[n] = [split_uppercase(word) for word in review]

In [6]:
##Make all text lower case
tokens = np.empty((len(tokens_II),0)).tolist()
for review in tokens_II:
    n = tokens_II.index(review)
    tokens[n] = [word.lower() for word in review]

In [7]:
##Remove stopwords and stem
stopwords = stopwords.words('english')
stemmed_token = np.empty((len(tokens),0)).tolist()
for review in tokens:
    n = tokens.index(review)
    stemmed_token[n] = [st.stem(word) for word in review if word not in stopwords]

In [8]:
##Manipulate stemmed text to be string instead of list (needed for count vectorizer)
final_review_text = []
for review in stemmed_token:
    for word in review:
        n = review.index(word)
        if n == 0:
            string = review[n]
        else:
            string = string + " " + review[n]
    final_review_text.append(string)

In [13]:
##Count Vectorizer Matrix
import numpy as np
import scipy
from scipy.sparse import coo_matrix, vstack
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=False, ngram_range=(1, 1)) ##Removed stopwords before stemming so don't apply here
food_review_text = vectorizer.fit_transform(final_review_text)

##Remove if word is in less than 10 reviews
counts = scipy.sparse.coo_matrix.sum(food_review_text, axis=0)
food_review_text = np.transpose(vstack([food_review_text,counts]))
food_review_text = pd.DataFrame(food_review_text.todense(), index = vectorizer.get_feature_names())
last_col = food_review_text.shape[1] - 1
food_review_text = food_review_text[food_review_text[last_col] > 9]
del food_review_text[last_col]

In [14]:
##TFIDF Weighting
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
weighted_food_review_text = transformer.fit_transform(food_review_text)
tfidf_matrix = weighted_food_review_text.transpose()
tfidf_matrix

<63808x7051 sparse matrix of type '<class 'numpy.float64'>'
	with 2061758 stored elements in Compressed Sparse Column format>

In [15]:
##Non-negative matrix factorization
n_topics = 20

from sklearn.decomposition import NMF
model = NMF(init="nndsvd", n_components=n_topics, random_state=1)
W_matrix = model.fit_transform(tfidf_matrix)
H_matrix = model.components_

In [16]:
##Prints tops and keywords

feature_names = food_review_text.index
for topic_index in range( H_matrix.shape[0] ):
    top_indices = np.argsort( H_matrix[topic_index,:] )[::-1][0:10]  ##show top 10 words associated with each topic
    term_ranking = [feature_names[i] for i in top_indices]
    print ("Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) ))

Topic 0: eat, snack, the, chocol, pack, they, bar, lik, box, good
Topic 1: nutry, insulin, canyon, worthless, cm, sesam, hemp, highest, rancid, regardless
Topic 2: mahogony, lodge, kopi, raya, jazzed, emeril, xtra, wolfgang, mountain, reserve
Topic 3: folic, sodium, color, calcium, sugars, mononitrate, niacinamide, dietary, total, sulfate
Topic 4: compass, gratitud, uplift, soul, wisdom, prevail, grac, enlight, attitud, mercy
Topic 5: unfiltered, unrefined, unknown, fahrenheit, virgin, trade, press, certified, kosher, unrefin
Topic 6: preterm, newborn, scientists, cocain, vuln, infant, mo, inf, cholin, polysaccharid
Topic 7: sprats, baltic, filet, fish, her, fillets, kipper, fillet, seas, herring
Topic 8: umam, glutam, inosin, arroz, silicon, autolys, overus, disod, dioxid, mi
Topic 9: regulations, federal, enzymolys, unexplain, code, oleoresin, der, constitu, thereof, poultry
Topic 10: sauc, cook, dish, ad, past, chick, minut, prep, heat, tomato
Topic 11: ikkoku, maison, exquisit, cel