# Model Building

In [3]:
import pandas as pd
import numpy as np

## Collaborative Filtering: Matrix Facotrization

In [32]:
from surprise import SVD
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import dump
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection.split import train_test_split

In [46]:
df = pd.read_csv('../data/top_rated.csv')

In [47]:
df = df.dropna()

In [48]:
df[(df.score<1)]

Unnamed: 0.1,Unnamed: 0,beer_id,username,score,count


In [49]:
df = df[~(df.score>5)]

In [50]:
df.describe()

Unnamed: 0.1,Unnamed: 0,beer_id,score,count
count,3046479.0,3046479.0,3046479.0,3046479.0
mean,1539764.0,30643.99,4.014147,4597.233
std,889100.1,36813.08,0.6456568,3297.014
min,0.0,5.0,1.0,1373.0
25%,769577.5,1331.0,3.75,2181.0
50%,1540104.0,16062.0,4.06,3379.0
75%,2310122.0,50509.0,4.5,5998.0
max,3078487.0,211516.0,5.0,17160.0


In [10]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df[['username','beer_id','score']],reader)

In [11]:
cross_validate(NormalPredictor(),data,cv=2)

{'test_rmse': array([1.8562683 , 1.43975082]),
 'test_mae': array([0.69394968, 1.1423621 ]),
 'fit_time': (2.19085693359375, 2.6427297592163086),
 'test_time': (15.031850337982178, 15.018870830535889)}

In [14]:
cross_validate(SVD(),data,cv=2)

{'test_rmse': array([1.17831152, 1.68690022]),
 'test_mae': array([0.98580954, 0.30880911]),
 'fit_time': (76.11983704566956, 75.62216782569885),
 'test_time': (14.427448987960815, 15.676785230636597)}

In [19]:
param_grid = {'n_epochs': [10.20], 'lr_all': [0.001,0.002],
              'reg_all': [0.4, 0.5,0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [20]:
gs.fit(data)

In [21]:
gs.best_score['rmse']

0.9934610923443413

In [22]:
gs.best_params['rmse']

{'n_epochs': 10, 'lr_all': 0.001, 'reg_all': 0.4}

In [23]:
algo = gs.best_estimator['rmse']

In [74]:
trainset, testset = train_test_split(data, test_size=0.25)
predictions = algo.fit(trainset).test(testset)

In [75]:
accuracy.rmse(predictions)

RMSE: 0.4665


0.46654713198469905

In [77]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
pred_df = pd.DataFrame(predictions, columns=['username', 'beer_id', 'rui', 'est', 'details'])
pred_df['Iu'] = pred_df.username.apply(get_Iu)
pred_df['Ui'] = pred_df.beer_id.apply(get_Ui)
pred_df['err'] = abs(pred_df.est - df.rui)
best_predictions = pred_df.sort_values(by='err')[:10]
worst_predictions = pred_df.sort_values(by='err')[-10:]

In [57]:
pred_df.describe()

Unnamed: 0,beer_id,rui,est,Iu,Ui,err
count,761620.0,761620.0,761620.0,761620.0,761620.0,761620.0
mean,30591.157397,4.014412,4.034786,152.28074,3415.753943,0.330182
std,36781.662466,0.644893,0.329553,138.635255,2447.544818,0.329513
min,5.0,1.0,1.0,0.0,986.0,0.0
25%,1331.0,3.75,3.893115,40.0,1617.0,0.108798
50%,15881.0,4.06,4.091898,114.0,2523.0,0.240682
75%,50509.0,4.5,4.244886,230.0,4476.0,0.447292
max,211516.0,5.0,5.0,726.0,12671.0,4.0


In [78]:
best_predictions

Unnamed: 0,username,beer_id,rui,est,details,Iu,Ui,err
701794,toomuchhops,50772,4.5,5.0,{'was_impossible': False},120,2165,0.0
153231,wyckydsceptre,50772,4.25,5.0,{'was_impossible': False},33,2165,0.0
513305,gigaknight,50772,4.82,5.0,{'was_impossible': False},14,2165,0.0
183734,OldManBike,6322,3.75,4.0,{'was_impossible': False},65,2293,3.753277e-07
653976,akelmore,6260,4.0,3.999998,{'was_impossible': False},2,4938,1.553949e-06
305158,wordemupg,66507,3.55,3.919998,{'was_impossible': False},325,1184,2.291289e-06
379887,TrudiiSchu,132073,4.55,4.000004,{'was_impossible': False},1,1328,3.523446e-06
233568,rangerred,98383,4.4,3.999995,{'was_impossible': False},349,1690,5.010795e-06
119065,ColdPoncho,2093,4.2,4.250005,{'was_impossible': False},186,11796,5.405297e-06
171871,GMorgan,58610,4.31,4.250006,{'was_impossible': False},175,2156,5.856386e-06


In [79]:
worst_predictions

Unnamed: 0,username,beer_id,rui,est,details,Iu,Ui,err
36992,mrjmann,7971,3.75,4.666856,{'was_impossible': False},99,11006,3.666856
631522,sjstraub,3659,5.0,4.668764,{'was_impossible': False},180,1345,3.668764
485211,starshockey9,50772,4.0,1.0,{'was_impossible': False},53,2165,3.71
520390,CEDAMA,38180,5.0,4.714184,{'was_impossible': False},268,5134,3.714184
412718,dpgoblejr,16814,4.93,4.735764,{'was_impossible': False},326,10653,3.735764
220335,aranton,50772,4.5,1.046254,{'was_impossible': False},112,2165,3.953746
137691,hombrepalo,50772,3.75,1.0,{'was_impossible': False},104,2165,4.0
622170,buyBinboaVodka,50772,4.87,1.0,{'was_impossible': False},14,2165,4.0
462060,Skunkyluvmuffin,50772,4.5,1.0,{'was_impossible': False},120,2165,4.0
58691,CaptainFantasy,50772,4.5,1.0,{'was_impossible': False},3,2165,4.0


In [None]:
#save model as binary
dump.dump('../model/model',algo=algo,verbose=1)

In [65]:
beer_df = pd.read_csv('../data/beers.csv')

In [80]:
df.loc[df.beer_id==6322]

Unnamed: 0,username,beer_id,rui,est,details,Iu,Ui,err
810,beep71,6322,4.42,3.973458,{'was_impossible': False},20,2342,0.484586
859,Samp01,6322,4.10,3.896082,{'was_impossible': False},183,2342,0.755403
6150,RedneckBeerz,6322,4.25,4.019446,{'was_impossible': False},298,2342,0.282483
6327,ROGUE16,6322,4.00,3.948809,{'was_impossible': False},379,2342,0.383734
6366,jholland,6322,3.75,3.906415,{'was_impossible': False},54,2342,0.197064
...,...,...,...,...,...,...,...,...
756931,mkholmes25,6322,4.02,3.993962,{'was_impossible': False},26,2342,0.156796
756992,Reagan1984,6322,4.31,3.992799,{'was_impossible': False},428,2342,0.688280
758514,Number3red,6322,3.50,4.041398,{'was_impossible': False},141,2342,1.054525
758887,neenerzig,6322,4.03,4.112690,{'was_impossible': False},246,2342,0.919368


## Content Based Filtering

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
df = pd.read_csv('../data/beer_features.csv')
df

Unnamed: 0.1,Unnamed: 0,beer_id,text,look,smell,taste,feel,score,name,brewery_id,state,country,style,abv,notes
0,0,5,"Beautiful beer. Light and tasty., I trave...",3.569071,3.314848,3.435774,3.401786,3.459699,Amber,3,LA,US,Vienna Lager,4.5,Amber is a Munich style lager brewed with crys...
1,1,6,"great brown ale...one of my favorites., M...",3.837863,3.575499,3.684462,3.587068,3.659804,Turbodog,3,LA,US,English Brown Ale,5.6,Turbodog is a dark brown ale brewed with Willa...
2,2,7,The labeling with the purple haze guy with ...,3.407776,3.328253,3.254259,3.264490,3.289541,Purple Haze,3,LA,US,Fruit and Field Beer,4.2,Experience the magic of Purple Haze.® Clouds o...
3,3,10,Pours slightly hazy deep amber/brown. 1 fin...,3.935979,5.871466,3.796924,3.783938,3.805655,Dubbel Ale,4,ME,US,Belgian Dubbel,7.0,Allagash Dubbel boasts a deep red color and a ...
4,4,17,"0%, Not much like a hefe, too citrusy (or...",3.479105,3.211594,3.255258,3.344787,3.322062,Widmer Hefeweizen,8,OR,US,German Hefeweizen,4.9,availability: year-round
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,187317,I think a one is to high of a rating. I per...,3.745092,3.618064,3.698520,3.706631,3.681254,Otra Vez,140,CA,US,Leipzig Gose,4.5,"In California, where temperatures often top tr..."
996,996,189272,"Love this beer., 19.2 oz can , 10/30/17, ...",4.534134,4.448494,4.485783,4.485113,4.477996,Ten FIDY - Bourbon Barrel Aged,2681,CO,US,Russian Imperial Stout,12.9,Aged through four seasons and from a blend of ...
997,997,197183,Clear Amber pour with modest head. Aromas o...,3.862383,3.868740,3.826301,3.780224,3.833175,Citradelic Tangerine IPA,192,CO,US,American IPA,6.0,"Tangerine IPA brewed with Citra hops, orange p..."
998,998,202078,""" Bought beer, poured a glass, it looked too...",4.295023,4.355418,4.342755,4.280097,4.331283,Enjoy By Unfiltered IPA,147,CA,US,American Imperial IPA,9.4,In most cases skipping a step is a bad thing. ...


In [55]:
# df[((df['style']=='American IPA')&(df['state']=='CO'))] 
df[df['name']=='Odell IPA']

Unnamed: 0.1,Unnamed: 0,beer_id,text,look,smell,taste,feel,score,name,brewery_id,state,country,style,abv,notes
569,569,35626,Smells wonderful. Taste is citrusy orange a...,4.183253,4.221141,4.2685,4.158446,4.263974,Odell IPA,267,CO,US,American IPA,7.0,No notes at this time.


In [61]:
tf = TfidfVectorizer(analyzer='word', ngram_range = (1,2),max_features=100000, stop_words='english')

In [62]:
tfidf_matrix = tf.fit_transform(df.text)

In [63]:
tfidf_matrix.shape

(1000, 100000)

In [64]:
cosine_sims = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [60]:
columns = [k for (v,k) in sorted((v,k) for k,v in tf.vocabulary_.items())]
tfidf_df = pd.DataFrame(tfidf_matrix.todense(),columns=columns)
tfidf_df

MemoryError: Unable to allocate 266. GiB for an array with shape (1000, 35705257) and data type float64

In [21]:
results={}

In [100]:
for idx, row in df.iterrows():
    sim_indices = cosine_sims[idx].argsort()[-6:-1]
    sim_items = [(cosine_sims[idx][i], df['beer_id'][i]) for i in sim_indices]
    results[row['beer_id']] = sim_items[1:]
    

In [128]:
results[9689]

[(0.9199161477585782, 16506),
 (0.9238316323695005, 7879),
 (0.9267255713596759, 248),
 (0.9422921807803533, 59)]

In [130]:
breweries = pd.read_csv('../data/breweries.csv')

In [143]:
breweries[breweries['name'].str.contains('Crooked Stave')].id

35818    25191
Name: id, dtype: int64

In [134]:
df[df['beer_id']==7879]

Unnamed: 0.1,Unnamed: 0,beer_id,text,look,smell,taste,feel,score,name,brewery_id,state,country,style,abv,notes
391,391,7879,"Seen this one everywhere, so I finally cave...",4.113344,4.090426,4.150638,4.107481,4.160018,Witbier,259,,BE,Belgian Witbier,5.5,No notes at this time.


In [149]:
# df[df['brewery_id']==25191]
# df[(df['name'].str.contains('IPA'))&(df['brewery_id']==147)]
df[(df['name'].str.contains('Chicken'))]

Unnamed: 0.1,Unnamed: 0,beer_id,text,look,smell,taste,feel,score,name,brewery_id,state,country,style,abv,notes


In [58]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
column_trans = ColumnTransformer(
    [('cat_features',OneHotEncoder(dtype='int'),['state','country','style']),
    ('num_features', )
    (('text_bow',tf,'text')]
)