# Beer Dataset - Recommendation Engine

In [1]:
import pandas as pd
import numpy as np
import pylab as pl

In [3]:
# For Mac OS: absolute path to the local copy of the beer reviews CSV.
# NOTE(review): this path is machine-specific; edit it to run the notebook elsewhere.
Location0 = r'/Users/viral.parikh/Desktop/External_Datasets/kaggle/beer/beer_reviews/beer_reviews.csv'

# Load the whole review table; the cells and functions below read it as `df`.
df = pd.read_csv(Location0)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(df.head())
# DataFrame.info() writes its summary itself and returns None, so it is called
# bare; wrapping it in print appends a stray "None" line to the output.
df.info()

   brewery_id             brewery_name  review_time  review_overall  \
0       10325          Vecchio Birraio   1234817823             1.5   
1       10325          Vecchio Birraio   1235915097             3.0   
2       10325          Vecchio Birraio   1235916604             3.0   
3       10325          Vecchio Birraio   1234725145             3.0   
4        1075  Caldera Brewing Company   1293735206             4.0   

   review_aroma  review_appearance review_profilename  \
0           2.0                2.5            stcules   
1           2.5                3.0            stcules   
2           2.5                3.0            stcules   
3           3.0                3.5            stcules   
4           4.5                4.0     johnmichaelsen   

                       beer_style  review_palate  review_taste  \
0                      Hefeweizen            1.5           1.5   
1              English Strong Ale            3.0           3.0   
2          Foreign / Export Stou

# Solution 1 - User Similarity

In [4]:
beer_1, beer_2 = "Dale's Pale Ale", "Fat Tire Amber Ale"

In [5]:
beer_1_reviewers = df[df.beer_name==beer_1].review_profilename.unique()

In [6]:
beer_2_reviewers = df[df.beer_name==beer_2].review_profilename.unique()

In [7]:
# Reviewers who reviewed both beers: intersection of the two unique-name arrays.
common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print("Users in the sameset: %d" % len(common_reviewers))
# Peek at a few names; a set has no defined order, so which ten appear is arbitrary.
list(common_reviewers)[:10]

Users in the sameset: 499


['womencantsail',
 'Marty30',
 'Winter',
 'Lothore',
 'bump8628',
 'gford217',
 'lackenhauser',
 'wspscott',
 'mjurney',
 'LiquidBread219']

In [8]:
def get_beer_reviews(beer, common_users):
    """
    Return one review row per reviewer for `beer`, ordered by reviewer name.

    Reads the module-level `df` review table.

    Parameters
    ----------
    beer: str
        value matched against the beer_name column
    common_users: iterable
        reviewer names to keep (matched against review_profilename)

    Returns
    -------
    reviews: DataFrame
        rows of `df` for `beer` written by `common_users`, sorted by
        review_profilename, with only the first row kept per reviewer
    """
    mask = df.review_profilename.isin(common_users) & (df.beer_name == beer)
    # DataFrame.sort() has been removed from pandas; sort_values() is its replacement.
    reviews = df[mask].sort_values('review_profilename')
    # duplicated() flags every repeat after the first, so negating it keeps one
    # row per reviewer.
    return reviews[~reviews.review_profilename.duplicated()]

# One row per common reviewer for each of the two beers.
beer_1_reviews = get_beer_reviews(beer_1, common_reviewers)
beer_2_reviews = get_beer_reviews(beer_2, common_reviewers)

# DataFrame.info() prints its own summary and returns None, so it is called
# bare; printing the return value adds a stray "None" line after each summary.
beer_1_reviews.info()
beer_2_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 499 entries, 1454568 to 1453403
Data columns (total 13 columns):
brewery_id            499 non-null int64
brewery_name          499 non-null object
review_time           499 non-null int64
review_overall        499 non-null float64
review_aroma          499 non-null float64
review_appearance     499 non-null float64
review_profilename    499 non-null object
beer_style            499 non-null object
review_palate         499 non-null float64
review_taste          499 non-null float64
beer_name             499 non-null object
beer_abv              499 non-null float64
beer_beerid           499 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 54.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 499 entries, 202456 to 201325
Data columns (total 13 columns):
brewery_id            499 non-null int64
brewery_name          499 non-null object
review_time           499 non-null int64
review_overall        499 non-n

In [9]:
cols = ['beer_name', 'review_profilename', 'review_overall', 'review_aroma', 'review_palate', 'review_taste']
beer_2_reviews[cols].head()

Unnamed: 0,beer_name,review_profilename,review_overall,review_aroma,review_palate,review_taste
202456,Fat Tire Amber Ale,ATPete,4.5,4.0,4.0,4.5
201458,Fat Tire Amber Ale,AdamBear,3.5,2.5,4.5,3.5
201886,Fat Tire Amber Ale,AlCaponeJunior,2.0,3.0,3.5,3.0
202481,Fat Tire Amber Ale,AltBock,4.0,3.0,3.0,3.0
201803,Fat Tire Amber Ale,Andreji,4.0,4.5,4.0,4.0


In [10]:
# choose your own way to calculate distance
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from scipy.stats.stats import pearsonr


# Rating columns that each get their own distance, in the order they are returned.
ALL_FEATURES = ['review_overall', 'review_aroma', 'review_palate', 'review_taste']
def calculate_similarity(beer1, beer2):
    """
    Euclidean distance between two beers' ratings, one value per feature.

    Only reviewers who reviewed both beers are compared. Reads the module-level
    `df` and uses get_beer_reviews() to fetch one row per common reviewer,
    sorted by reviewer name, so the two rating vectors line up by position.

    Parameters
    ----------
    beer1, beer2: str
        beer names as they appear in df.beer_name

    Returns
    -------
    dists: list
        one distance per entry of ALL_FEATURES, in that order. The value is
        not normalised by the number of common reviewers.

    Raises
    ------
    ValueError
        if the two beers have no reviewer in common
    """
    # find common reviewers
    beer_1_reviewers = df[df.beer_name == beer1].review_profilename.unique()
    beer_2_reviewers = df[df.beer_name == beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)
    if not common_reviewers:
        # An empty comparison would otherwise come out as distance 0, i.e. "identical".
        raise ValueError("no common reviewers for %r and %r" % (beer1, beer2))

    # get reviews
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)

    dists = []
    for f in ALL_FEATURES:
        # .values drops the row labels: the two frames carry different index
        # labels, so subtracting the Series directly would align on labels
        # instead of pairing the ratings by position.
        diff = beer_1_reviews[f].values - beer_2_reviews[f].values
        # Computed with numpy rather than sklearn's euclidean_distances, which
        # expects 2-D input and rejects a 1-D rating vector in current releases.
        dists.append(np.linalg.norm(diff))

    return dists

In [11]:
calculate_similarity(beer_1, beer_2)

[17.592612085759182, 17.38533865071371, 16.454482671904334, 17.613914953808536]

# Expanding the logic to a broader set of beers

In [12]:
# calculate only a subset for the demo
beers = ["Dale's Pale Ale", "Sierra Nevada Pale Ale", "Michelob Ultra",
         "Natural Light", "Bud Light", "Fat Tire Amber Ale", "Coors Light",
         "Blue Moon Belgian White", "60 Minute IPA", "Guinness Draught"]

In [13]:
# calculate everything for real production
# beers = df.beer_name.unique()

# One row per ordered pair: [beer1, beer2, <one distance per feature>].
simple_distances = []
# The distance does not depend on argument order (same common reviewers, same
# Euclidean distance), so each unordered pair is computed once and reused for
# the mirrored row. Row order is the same as a plain nested loop.
pair_cache = {}
for beer1 in beers:
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print("starting %s" % beer1)
    for beer2 in beers:
        if beer1 == beer2:
            continue
        pair = frozenset((beer1, beer2))
        if pair not in pair_cache:
            pair_cache[pair] = calculate_similarity(beer1, beer2)
        row = [beer1, beer2] + pair_cache[pair]
        simple_distances.append(row)

starting Dale's Pale Ale
starting Sierra Nevada Pale Ale
starting Michelob Ultra
starting Natural Light
starting Bud Light
starting Fat Tire Amber Ale
starting Coors Light
starting Blue Moon Belgian White
starting 60 Minute IPA
starting Guinness Draught


In [20]:
cols = ["beer1", "beer2", "overall_dist", "aroma_dist", "palate_dist", "taste_dist"]
simple_distances = pd.DataFrame(simple_distances, columns=cols)
simple_distances.tail()

Unnamed: 0,beer1,beer2,overall_dist,aroma_dist,palate_dist,taste_dist
85,Guinness Draught,Bud Light,44.260592,42.520583,45.825757,44.452222
86,Guinness Draught,Fat Tire Amber Ale,23.958297,21.023796,25.014996,22.798026
87,Guinness Draught,Coors Light,41.237119,38.823318,43.50862,40.620192
88,Guinness Draught,Blue Moon Belgian White,27.147744,22.477767,25.806976,23.727621
89,Guinness Draught,60 Minute IPA,31.348844,32.310989,32.256782,34.510868


# Special Case - Customize Weights

In [21]:
def calc_distance(dists, beer1, beer2, weights):
    """
    Weighted sum of the four per-feature distances for one (beer1, beer2) row.

    Parameters
    ----------
    dists: DataFrame
        table with beer1, beer2, overall_dist, aroma_dist, palate_dist and
        taste_dist columns
    beer1, beer2: str
        values matched against the beer1 and beer2 columns
    weights: list
        one multiplier per distance column, in the order
        overall, aroma, palate, taste

    Returns
    -------
    float
        weighted total for the first matching row
    """
    distance_cols = ['overall_dist', 'aroma_dist', 'palate_dist', 'taste_dist']
    is_pair = (dists.beer1 == beer1) & (dists.beer2 == beer2)
    selected = dists.loc[is_pair, distance_cols]
    # weights are applied column by column, then each row is added up
    weighted = weights * selected
    totals = weighted.sum(axis=1)
    return totals.tolist()[0]

# Multipliers for overall, aroma, palate and taste: the overall rating counts double.
weights = [2, 1, 1, 1]
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(calc_distance(simple_distances, "Guinness Draught", "Blue Moon Belgian White", weights))
print(calc_distance(simple_distances, "Guinness Draught", "60 Minute IPA", weights))

126.307851472
161.776326287


# Check Recommendations

In [22]:
my_beer = "Coors Light"
# Collect (my_beer, other beer, weighted distance) for every other demo beer.
results = []
for b in beers:
    if b == my_beer:
        continue
    score = calc_distance(simple_distances, my_beer, b, weights)
    results.append((my_beer, b, score))
# Smallest weighted distance first.
sorted(results, key=lambda entry: entry[2])

[('Coors Light', 'Natural Light', 69.523201228303634),
 ('Coors Light', 'Michelob Ultra', 72.430376465589802),
 ('Coors Light', 'Bud Light', 100.45382254092895),
 ('Coors Light', 'Blue Moon Belgian White', 175.24657417286627),
 ('Coors Light', 'Fat Tire Amber Ale', 176.31863930228485),
 ('Coors Light', "Dale's Pale Ale", 181.20123311633913),
 ('Coors Light', 'Guinness Draught', 205.42636799646337),
 ('Coors Light', '60 Minute IPA', 233.40510433819486),
 ('Coors Light', 'Sierra Nevada Pale Ale', 254.78216241090442)]

# Solution 2 - Item Similarity Method
(calculates beers which are most similar to the beers provided)

Let's keep only the 250 most-reviewed beers

In [27]:
n = 250
# value_counts() orders names by number of reviews, so these are the n most-reviewed beers.
top_n = df.beer_name.value_counts().index[:n]
# NOTE: this rebinds `df` to the filtered table. Functions that read the global
# `df` (get_beer_reviews, calculate_similarity) only see these beers from here
# on; reload the CSV before re-running the Solution 1 cells.
df = df[df.beer_name.isin(top_n)]

# Single-argument print(...) gives the same output on Python 2 and Python 3.
print(top_n.shape)
# DataFrame.info() prints its own summary and returns None, so it is called
# bare; printing the return value adds a stray "None" line.
df.info()
df.head()

(250,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 355275 entries, 798 to 1586564
Data columns (total 13 columns):
brewery_id            355275 non-null int64
brewery_name          355275 non-null object
review_time           355275 non-null int64
review_overall        355275 non-null float64
review_aroma          355275 non-null float64
review_appearance     355275 non-null float64
review_profilename    355175 non-null object
beer_style            355275 non-null object
review_palate         355275 non-null float64
review_taste          355275 non-null float64
beer_name             355275 non-null object
beer_abv              353477 non-null float64
beer_beerid           355275 non-null int64
dtypes: float64(6), int64(3), object(4)
memory usage: 37.9+ MB
None


Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
798,1075,Caldera Brewing Company,1212201268,4.5,4.5,4,grumpy,American Double / Imperial Stout,4.0,4.5,Imperial Stout,,42964
1559,11715,Destiny Brewing Company,1137124057,4.0,3.5,4,blitheringidiot,American Pale Ale (APA),3.5,3.5,Pale Ale,4.5,26420
1560,11715,Destiny Brewing Company,1129504403,4.0,2.5,4,NeroFiddled,American Pale Ale (APA),4.0,3.5,Pale Ale,4.5,26420
1563,11715,Destiny Brewing Company,1137125989,3.5,3.0,4,blitheringidiot,American IPA,4.0,4.0,IPA,,26132
1564,11715,Destiny Brewing Company,1130936611,3.0,3.0,3,Gavage,American IPA,4.0,3.5,IPA,,26132


In [30]:
df_wide = pd.pivot_table(df, values=["review_overall"], index=["beer_name", "review_profilename"],aggfunc=np.mean).unstack()
df_wide.shape

(250, 22140)

In [31]:
# Beer/reviewer combinations with no review are NaN after the unstack; replace
# them with 0 so the matrix has no missing values.
# NOTE(review): after this a missing review is indistinguishable from a rating
# of 0 in the similarity/distance computations below; confirm that is intended.
df_wide = df_wide.fillna(0)

In [32]:
# Preview the first 5 rows and columns by position. .ix has been removed from
# pandas; .iloc is the explicit positional indexer.
df_wide.iloc[0:5, 0:5]

Unnamed: 0_level_0,review_overall,review_overall,review_overall,review_overall,review_overall
review_profilename,0110x011,02maxima,03SVTCobra,05Harley,0Naught0
beer_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
#9,0,0,0,0,0
120 Minute IPA,0,0,0,4,0
1554 Enlightened Black Ale,0,0,0,0,0
60 Minute IPA,0,0,0,0,0
90 Minute IPA,5,0,0,4,0


In [33]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
#from sklearn.metrics.pairwise import 

dists = cosine_similarity(df_wide)
dists

array([[ 1.        ,  0.27540494,  0.27410345, ...,  0.32928048,
         0.34805798,  0.31249922],
       [ 0.27540494,  1.        ,  0.25151873, ...,  0.2854835 ,
         0.23301356,  0.2802485 ],
       [ 0.27410345,  0.25151873,  1.        , ...,  0.31629515,
         0.22521858,  0.2737628 ],
       ..., 
       [ 0.32928048,  0.2854835 ,  0.31629515, ...,  1.        ,
         0.28025764,  0.34504013],
       [ 0.34805798,  0.23301356,  0.22521858, ...,  0.28025764,
         1.        ,  0.25526913],
       [ 0.31249922,  0.2802485 ,  0.2737628 , ...,  0.34504013,
         0.25526913,  1.        ]])

In [34]:
# Wrap the similarity matrix in a DataFrame and give both the indices
# (equivalent to rownames in R) and the columns the beer names from df_wide.
dists = pd.DataFrame(dists, index=df_wide.index, columns=df_wide.index)
# Positional 10x10 preview; .ix has been removed from pandas, .iloc replaces it.
dists.iloc[0:10, 0:10]

beer_name,#9,120 Minute IPA,1554 Enlightened Black Ale,60 Minute IPA,90 Minute IPA,Aecht Schlenkerla Rauchbier Märzen,AleSmith IPA,AleSmith Speedway Stout,Allagash White,Alpha King Pale Ale
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
#9,1.0,0.275405,0.274103,0.388364,0.365175,0.253841,0.228479,0.227612,0.340681,0.293315
120 Minute IPA,0.275405,1.0,0.251519,0.378258,0.410366,0.262425,0.315971,0.337541,0.282273,0.336796
1554 Enlightened Black Ale,0.274103,0.251519,1.0,0.319887,0.314028,0.252486,0.266866,0.261761,0.260275,0.307296
60 Minute IPA,0.388364,0.378258,0.319887,1.0,0.533042,0.316928,0.312343,0.307627,0.360975,0.385249
90 Minute IPA,0.365175,0.410366,0.314028,0.533042,1.0,0.312861,0.344218,0.358754,0.356804,0.418582
Aecht Schlenkerla Rauchbier Märzen,0.253841,0.262425,0.252486,0.316928,0.312861,1.0,0.24449,0.246063,0.297672,0.263248
AleSmith IPA,0.228479,0.315971,0.266866,0.312343,0.344218,0.24449,1.0,0.521889,0.277409,0.400741
AleSmith Speedway Stout,0.227612,0.337541,0.261761,0.307627,0.358754,0.246063,0.521889,1.0,0.27393,0.420247
Allagash White,0.340681,0.282273,0.260275,0.360975,0.356804,0.297672,0.277409,0.27393,1.0,0.295666
Alpha King Pale Ale,0.293315,0.336796,0.307296,0.385249,0.418582,0.263248,0.400741,0.420247,0.295666,1.0


In [35]:
beers_i_like = ['Sierra Nevada Pale Ale', '120 Minute IPA', 'Allagash White']
dists[beers_i_like].head()

beer_name,Sierra Nevada Pale Ale,120 Minute IPA,Allagash White
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#9,0.373968,0.275405,0.340681
120 Minute IPA,0.301693,1.0,0.282273
1554 Enlightened Black Ale,0.330033,0.251519,0.260275
60 Minute IPA,0.459641,0.378258,0.360975
90 Minute IPA,0.441189,0.410366,0.356804


In [36]:
# axis = 1 b/c we want 1 score per beer, which are rows.
# Each score is the sum of that beer's similarity to every beer in beers_i_like;
# the built-in row sum replaces a Python-level lambda applied row by row.
beers_summed = dists[beers_i_like].sum(axis=1)

In [37]:
beers_summed.head()

beer_name
#9                            0.990054
120 Minute IPA                1.583966
1554 Enlightened Black Ale    0.841827
60 Minute IPA                 1.198874
90 Minute IPA                 1.208359
dtype: float64

In [38]:
# Highest summed similarity first. Series.order() has been removed from pandas;
# sort_values() is its replacement.
beers_summed.sort_values(ascending=False)

beer_name
Sierra Nevada Pale Ale                        1.654205
Allagash White                                1.634784
120 Minute IPA                                1.583966
HopDevil Ale                                  1.224217
Sierra Nevada Celebration Ale                 1.215156
90 Minute IPA                                 1.208359
60 Minute IPA                                 1.198874
Stone Ruination IPA                           1.194210
Stone IPA (India Pale Ale)                    1.193193
Storm King Stout                              1.192405
Arrogant Bastard Ale                          1.189981
Sierra Nevada Bigfoot Barleywine Style Ale    1.178245
Prima Pils                                    1.178093
Brooklyn Black Chocolate Stout                1.156365
Ayinger Celebrator Doppelbock                 1.148356
Hennepin (Farmhouse Saison)                   1.147501
Samuel Adams Boston Lager                     1.146304
Hop Rod Rye                                   1.140271


In [39]:
# Drop the beers already in beers_i_like; the remaining names keep the order
# of beers_summed's index.
is_liked = beers_summed.index.isin(beers_i_like)
ranked_beers = beers_summed.index[~is_liked].tolist()
ranked_beers[:5]

['#9',
 '1554 Enlightened Black Ale',
 '60 Minute IPA',
 '90 Minute IPA',
 'Aecht Schlenkerla Rauchbier M\xc3\xa4rzen']

In [40]:
def get_similar(beers, n=None):
    """
    calculates which beers are most similar to the beers provided. Does not return
    the beers that were provided

    Reads the module-level `dists` table (beer names on both axes). A beer's
    score is the sum of its `dists` values against each provided beer, and
    beers are returned highest score first.

    Parameters
    ----------
    beers: list
        some beers! Names that are not columns of `dists` are ignored.
    n: int, optional
        maximum number of beers to return; None (the default) returns all of them

    Returns
    -------
    ranked_beers: list
        rank ordered beers; empty if none of the provided beers is in `dists`
    """
    known = [beer for beer in beers if beer in dists.columns]
    if not known:
        # With nothing to compare against every score would be 0 and the
        # "ranking" would just be the index order, so return no recommendations.
        return []
    # Row-wise sum: one score per beer in the index.
    beers_summed = dists[known].sum(axis=1)
    # Series.order() has been removed from pandas; sort_values() replaces it.
    beers_summed = beers_summed.sort_values(ascending=False)
    ranked_beers = beers_summed.index[~beers_summed.index.isin(known)].tolist()
    # Slicing with n=None returns the whole list.
    return ranked_beers[:n]

In [41]:
# Ten beers most similar to 120 Minute IPA, best match first.
for beer in get_similar(["120 Minute IPA"], 10):
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print(beer)

World Wide Stout
90 Minute IPA
Double Bastard Ale
Stone Ruination IPA
Stone Imperial Russian Stout
Storm King Stout
60 Minute IPA
Oaked Arrogant Bastard Ale
Sierra Nevada Bigfoot Barleywine Style Ale
Brooklyn Black Chocolate Stout


In [42]:
# Numbered top-10 list for a basket of light lagers.
for i, beer in enumerate(get_similar(["Coors Light", "Bud Light", "Amstel Light"], 10)):
    # Single-argument print(...) gives the same output on Python 2 and Python 3.
    print("%d) %s" % (i+1, beer))

1) Miller Lite
2) Budweiser
3) Corona Extra
4) Samuel Adams Boston Lager
5) Heineken Lager Beer
6) Blue Moon Belgian White
7) Guinness Draught
8) Miller High Life
9) Samuel Adams Summer Ale
10) Sierra Nevada Pale Ale


Quick check: do both of the above methods give the same answers?

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances
#from sklearn.metrics.pairwise import 

#dists = cosine_similarity(df_wide)
dists1 = euclidean_distances(df_wide)

In [44]:
# Wrap the Euclidean distance matrix in a DataFrame and give both the indices
# (equivalent to rownames in R) and the columns the beer names from df_wide.
dists1 = pd.DataFrame(dists1, index=df_wide.index, columns=df_wide.index)
# Positional 10x10 preview; .ix has been removed from pandas, .iloc replaces it.
dists1.iloc[0:10, 0:10]

beer_name,#9,120 Minute IPA,1554 Enlightened Black Ale,60 Minute IPA,90 Minute IPA,Aecht Schlenkerla Rauchbier Märzen,AleSmith IPA,AleSmith Speedway Stout,Allagash White,Alpha King Pale Ale
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
#9,0.0,174.229933,163.580213,198.889172,226.158606,154.688688,167.027568,174.317897,160.150862,182.979113
120 Minute IPA,174.229933,0.0,176.408262,205.519464,223.01051,165.243948,167.371136,170.681335,176.769447,185.167028
1554 Enlightened Black Ale,163.580213,176.408262,0.0,207.994591,233.043049,154.017247,162.097236,169.752618,168.938783,180.67188
60 Minute IPA,198.889172,205.519464,207.994591,0.0,216.17224,202.668294,208.132603,212.805744,204.677338,210.682114
90 Minute IPA,226.158606,223.01051,233.043049,216.17224,0.0,228.83564,228.252038,228.923871,228.818979,226.068608
Aecht Schlenkerla Rauchbier Märzen,154.688688,165.243948,154.017247,202.668294,228.83564,0.0,153.014546,160.880822,154.37171,177.353857
AleSmith IPA,167.027568,167.371136,162.097236,208.132603,228.252038,153.014546,0.0,135.57227,165.476815,167.44442
AleSmith Speedway Stout,174.317897,170.681335,169.752618,212.805744,228.923871,160.880822,135.57227,0.0,172.592286,169.450547
Allagash White,160.150862,176.769447,168.938783,204.677338,228.818979,154.37171,165.476815,172.592286,0.0,185.653281
Alpha King Pale Ale,182.979113,185.167028,180.67188,210.682114,226.068608,177.353857,167.44442,169.450547,185.653281,0.0


In [47]:
beers_i_like_mod = ["Dale's Pale Ale", "Fat Tire Amber Ale"]
dists1[beers_i_like_mod].head()

beer_name,Dale's Pale Ale,Fat Tire Amber Ale
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1
#9,170.051442,171.617489
120 Minute IPA,180.702237,185.033875
1554 Enlightened Black Ale,171.986696,152.189963
60 Minute IPA,202.158474,208.631437
90 Minute IPA,224.920048,233.61046


In [None]:
# axis = 1 b/c we want 1 score per beer, which are rows.
# This cell was a verbatim copy of the cosine-similarity version and summed
# dists[beers_i_like]; the quick check above builds the Euclidean table dists1
# and the list beers_i_like_mod, so those are what get summed here.
# NOTE: these are distances, so smaller totals mean more similar beers; sort
# ascending when ranking.
beers_summed = dists1[beers_i_like_mod].sum(axis=1)