In [2]:
# python3
import gzip
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
import re
from time import time

In [3]:
with gzip.open('Beeradvocate.txt.gz', 'r') as f:
  rb_file = f.readlines()


data = []
row_out = []
stop = stopwords.words('english')

for i in rb_file:
    row = i.decode('utf-8', errors = 'replace')
    #print(row)
    if row == '\n':
      data.append(row_out)
      row_out = []
      continue
    cat, field = row.split(":", 1)
    #remove leading white spaces
    field = field.rstrip()
    if cat == 'review/text':
        #remove tab and newspace characters
        field = field.replace('\t', ' ')
        field = field.replace('\n', '')
        
        #remove punctuation
        
        #field = field.translate(str.maketrans('','',string.punctuation)) #this can be used for python 3
        #field = field.translate(None, string.punctuation) #this has issues with unicode characters
        field = re.sub(r'[^\w\s]','',field) #slower but effective
        field = field.lower()
    row_out.append(field)

In [64]:
data = pd.DataFrame(data)

data.columns = ['beer_name', 'beer_beerId', 'beer_brewer', 'beer_ABV', 'beer_style', 
                'review_appearance', 'review_aroma', 'review_palate', 'review_taste', 
                'review_overall', 'review_time', 'review_profileName', 'review_text']


corpus = data[['beer_beerId', 'review_text']].groupby('beer_beerId').agg(' '.join)
#sample = corpus.iloc[0:1000]

In [61]:
#len(corpus)
corpus.iloc[0]

beer_beerId                                                   10
review_text     theres a bit of bready melanoidin and spiocy ...
Name: 0, dtype: object

### TF-IDF

In [18]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

stemmer = SnowballStemmer("english")
n_features = 500

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

t0 = time()

tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features = n_features)
tfidf = tfidf_vectorizer.fit_transform(corpus['review_text'])
print("done in %0.3fs." % (time() - t0))

done in 2788.330s.


https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

In [19]:
test = corpus.iloc[1001]['review_text']
response = tfidf_vectorizer.transform([test])

feature_names = tfidf_vectorizer.get_feature_names()
for col in response.nonzero()[1]:
    print feature_names[col], ' - ', response[0, col]

worth  -  0.132289435872
weak  -  0.154088241374
version  -  0.152678808563
veri  -  0.215331917163
tri  -  0.212743348445
tast  -  0.0678432969964
sweet  -  0.0755959924884
surpris  -  0.128531190268
summer  -  0.159678560511
slight  -  0.172694031845
refresh  -  0.129654822652
quick  -  0.113697804939
price  -  0.174715567897
pour  -  0.074886136979
pack  -  0.169589965583
ok  -  0.145146601867
noth  -  0.121464360211
mouthfeel  -  0.166134746361
light  -  0.150226065941
label  -  0.145138346912
ive  -  0.123142704903
hue  -  0.135719797946
hot  -  0.158946026035
hint  -  0.0957592376478
head  -  0.0609862675123
got  -  0.129526428891
gold  -  0.136059505136
finish  -  0.0794027912669
fine  -  0.133140143708
fantast  -  0.173699031481
fade  -  0.259184309965
especi  -  0.151539116271
drinkabl  -  0.179823004406
clear  -  0.102573329634
clean  -  0.117518032029
cap  -  0.140703453935
came  -  0.155877402406
bottl  -  0.110277904416
bodi  -  0.0749501929524
best  -  0.120598180068
beer

### Training a linear model for each user

In [63]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

t0 = time()

t_user = data.iloc[0]['review_profileName']
t_reviews = data[data['review_profileName'] == t_user]


#beer_corpora = corpus.loc[t_reviews['beer_beerId']]

y = t_reviews['review_overall']
#this is only getting the tf_idf scores for the words used by this user specifically and not entire corpus.
#need to find a faster way to retrive tfidf representation without calling vectorizer transform function.
X = tfidf_vectorizer.transform(t_reviews['review_text'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

reg = LinearRegression().fit(X_train, y_train)

print("done in %0.3fs." % (time() - t0))

done in 2.341s.


In [71]:
#reg.predict(X_test)
np.mean(y_test.values.astype(float)

array([ 2. ,  2. ,  3. ,  4. ,  3.5,  3. ,  2.5,  4.5,  3.5,  2. ,  3.5,
        3.5,  3. ,  3. ,  3. ,  2.5,  3.5,  1.5,  3.5,  2. ,  2.5,  2.5,
        3.5,  4. ,  2.5,  3.5,  3. ,  3.5,  4.5,  2.5,  3. ,  3.5,  3. ,
        3.5,  3. ,  3. ,  3.5,  3. ,  4. ,  2.5,  4. ,  3.5,  3. ,  1.5,
        3. ,  3. ,  3.5,  2.5,  2. ,  2.5,  3. ,  2.5,  3. ,  3.5,  2.5,
        3. ,  2.5,  3. ,  3. ,  3. ,  4. ,  3. ,  1. ,  3. ,  3. ,  2.5,
        3.5,  3.5,  3.5,  2.5,  2. ,  3. ,  3. ,  3.5,  3. ,  3.5,  2.5,
        4.5,  1. ,  2.5,  2.5,  2. ,  3.5,  3. ,  1. ,  3.5,  4. ,  5. ,
        4. ,  4.5,  4. ,  4.5,  2. ,  1.5,  3.5,  4. ,  3.5,  3. ,  3. ,
        4.5,  4. ,  3.5,  2. ,  4. ,  4.5,  4. ,  2.5,  4.5,  2. ,  2.5,
        3.5,  3. ,  3.5,  3.5,  3.5,  3.5,  3. ,  3.5,  2.5,  2.5,  2. ,
        3. ,  3.5,  2. ,  4. ,  3. ,  2. ,  1.5,  4. ,  2. ,  2.5,  2.5,
        4.5,  3.5,  3.5,  3.5,  4.5,  2.5,  3. ,  2.5,  2.5,  3.5,  2.5,
        3. ,  3. ,  3.5,  4. ,  3. ,  3.5,  3. ,  3

In [66]:
for i in range(len(y_test)):
    rse = np.sqrt((float(y_test.values[i]) -  reg.predict(X_test[i])[0])**2)
    print "actual score: {}; predicted: {}; RSE: {}".format(y_test.values[i], reg.predict(X_test[i])[0], rse )
    

actual score:  2; predicted: 1.89740791207; RSE: 0.102592087929
actual score:  2; predicted: 1.98418070376; RSE: 0.0158192962401
actual score:  3; predicted: 2.72716039421; RSE: 0.272839605794
actual score:  4; predicted: 3.50967310166; RSE: 0.490326898338
actual score:  3.5; predicted: 2.1007012769; RSE: 1.3992987231
actual score:  3; predicted: 2.44614820748; RSE: 0.55385179252
actual score:  2.5; predicted: 3.42703357798; RSE: 0.927033577978
actual score:  4.5; predicted: 4.26336059169; RSE: 0.236639408311
actual score:  3.5; predicted: 2.92057685131; RSE: 0.579423148694
actual score:  2; predicted: 2.8235269749; RSE: 0.823526974903
actual score:  3.5; predicted: 3.45339396215; RSE: 0.0466060378488
actual score:  3.5; predicted: 3.25167070111; RSE: 0.24832929889
actual score:  3; predicted: 2.61447721835; RSE: 0.385522781654
actual score:  3; predicted: 2.46696786904; RSE: 0.533032130961
actual score:  3; predicted: 1.92805880744; RSE: 1.07194119256
actual score:  2.5; predicted: 1.

actual score:  3.5; predicted: 4.26528322166; RSE: 0.765283221655
actual score:  3; predicted: 2.56047282815; RSE: 0.439527171846
actual score:  3.5; predicted: 3.80950176655; RSE: 0.309501766552
actual score:  1.5; predicted: 2.25863605378; RSE: 0.758636053783
actual score:  2.5; predicted: 2.82613633416; RSE: 0.326136334156
actual score:  4.5; predicted: 3.88831093389; RSE: 0.611689066113
actual score:  2.5; predicted: 3.50713691556; RSE: 1.00713691556
actual score:  3.5; predicted: 2.76354215439; RSE: 0.736457845606
actual score:  3.5; predicted: 3.66074657251; RSE: 0.16074657251
actual score:  3; predicted: 3.28186673458; RSE: 0.281866734577
actual score:  2; predicted: 1.41030753291; RSE: 0.589692467091
actual score:  3.5; predicted: 3.48007538235; RSE: 0.0199246176476
actual score:  3.5; predicted: 3.61109624492; RSE: 0.111096244923
actual score:  3; predicted: 3.14114543481; RSE: 0.14114543481
actual score:  3; predicted: 3.23156425905; RSE: 0.231564259051
actual score:  3.5; pr

In [42]:
#Find beers consumed by each user
users = data['review_profileName'].unique()

user_reg: {}

for u in users:
    reviews = data[data['review_profileName'] == u]
    y = reviews['review_overall']
    X = tfidf_vectorizer.transform(reviews['review_text'])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    reg = LinearRegression().fit(X_train, y_train)
    
    #calculate RMSE
    
    

33388