In [20]:
# python3
import gzip
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import re
from time import time

In [10]:
# Load the gzip-compressed review dump as a list of raw byte lines.
with gzip.open('Beeradvocate.txt.gz', 'r') as f:
    rb_file = f.readlines()


data = []      # one list of field values per review record
row_out = []   # field values of the record currently being assembled
stop = stopwords.words('english')  # English stop-word list; not referenced in this cell

# Each non-blank line is split on its first ':' into a category and a value;
# a blank line closes the current record.
for i in rb_file:
    row = i.decode('utf-8', errors='replace')

    # Treat any whitespace-only line (including '\r\n') as a record
    # separator, and ignore repeated separators so that no empty record
    # is appended.
    if not row.strip():
        if row_out:
            data.append(row_out)
            row_out = []
        continue

    cat, field = row.split(":", 1)
    # Remove the whitespace on both sides of the value: the blank that
    # follows the ':' as well as the trailing line ending.
    field = field.strip()
    if cat == 'review/text':
        # Replace tab characters with single spaces.
        field = field.replace('\t', ' ')

        # Remove punctuation: drop every character that is neither a word
        # character nor whitespace, then lower-case the text.
        field = re.sub(r'[^\w\s]', '', field)
        field = field.lower()
    row_out.append(field)

# Keep the final record when the file does not end with a blank line.
if row_out:
    data.append(row_out)
    row_out = []

In [24]:
# Column labels, in the order the fields were appended to each parsed record.
column_names = [
    'beer_name',
    'beer_beerId',
    'beer_brewer',
    'beer_ABV',
    'beer_style',
    'review_appearance',
    'review_aroma',
    'review_palate',
    'review_taste',
    'review_overall',
    'review_time',
    'review_profileName',
    'review_text',
]

# Turn the parsed records into a table and label its columns.
data = pd.DataFrame(data)
data.columns = column_names

# Work on the first 1000 rows only.
sample = data.head(1000)

### TF-IDF

In [25]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Value passed as ``max_features`` to the TF-IDF vectorizer.
n_features = 1000
# English Snowball stemmer used by ``tokenize``.
stemmer = SnowballStemmer("english")

def stem_tokens(tokens, stemmer):
    """Return a list holding the stem of every token, in input order.

    tokens  -- iterable of tokens to stem
    stemmer -- object exposing a ``stem(token)`` method
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Tokenize ``text`` with ``nltk.word_tokenize`` and return the stemmed tokens.

    Stemming is delegated to ``stem_tokens`` using the module-level ``stemmer``.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)

# Start the clock for the vectorizer fit + transform below.
t0 = time()

# TF-IDF over the sample's review text, using the stemming tokenizer,
# scikit-learn's built-in English stop-word list and a capped vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words='english',
    max_features=n_features,
)
tfidf = tfidf_vectorizer.fit_transform(sample['review_text'])

# Report the elapsed wall-clock time.
print("done in %0.3fs." % (time() - t0))

done in 1.966s.


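Reference: scikit-learn example "Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation": https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py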

In [35]:
# Score a single review taken from position 1001 of the full table against
# the already-fitted TF-IDF vocabulary.
test = data.iloc[1001]['review_text']
response = tfidf_vectorizer.transform([test])

# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever the installed version provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# nonzero()[1] yields the column index of every non-zero entry, i.e. the
# vocabulary terms that occur in this review; print each term with its weight.
# A single pre-formatted argument keeps the call valid as both a Python 3
# function call and a Python 2 print statement.
for col in response.nonzero()[1]:
    print('%s  -  %s' % (feature_names[col], response[0, col]))

yellow  -  0.212356887853
wheat  -  0.353478657237
veri  -  0.103584820058
time  -  0.200932627497
t  -  0.138323681064
s  -  0.137672818315
pick  -  0.221951295741
past  -  0.307771344264
malt  -  0.180748907846
m  -  0.146234100344
low  -  0.23657491203
limit  -  0.324038370911
light  -  0.234662915419
head  -  0.0702688958867
good  -  0.104827448185
drinkabl  -  0.135344366678
domin  -  0.235482303541
decent  -  0.182576255654
d  -  0.157989609487
bud  -  0.294957955852
bit  -  0.128479744116
beer  -  0.194342721388
appear  -  0.193451154302
