In [1]:
# read .csv into python
import pandas as pd
import numpy as np
import operator
import os
import gzip
import re

In [5]:
with gzip.open('../Beeradvocate.txt.gz', 'r') as f:
  rb_file = f.readlines()


data = []
row_out = []

for i in rb_file:
    row = i.decode('utf-8', errors = 'replace')
    #print(row)
    if row == '\n':
      data.append(row_out)
      row_out = []
      continue
    cat, field = row.split(":", 1)
    #remove leading white spaces
    field = field.rstrip()
    row_out.append(field)

In [8]:
data = pd.DataFrame(data)

data.columns = ['beer_name', 'beer_beerId', 'beer_brewer', 'beer_ABV', 'beer_style', 
                'review_appearance', 'review_aroma', 'review_palate', 'review_taste', 
                'review_overall', 'review_time', 'review_profileName', 'review_text']

# keep 3 columns: user name, beer name, overall score
data2 = data[['beer_name', 'review_profileName', 'review_overall', 'review_time', 'beer_style']]

m = 33382 # number of users
n = 56855 # Number of items
# remove NA
data2 = data2[pd.notnull(data2.beer_name)]
data2 = data2[pd.notnull(data2.review_profileName)]
data2 = data2[pd.notnull(data2.review_overall)]
print(data2.shape)


(1586614, 5)


In [9]:
# keep top 1000/33382 frequent users
user = data2.review_profileName.value_counts()
user_list = user.keys()[:m].tolist()

# keep top 100/56855 most reviewed beer
beer = data2.beer_name.value_counts()[:n]
beer_list = beer.keys()[:n].tolist()

# keep (beer&user) pair in (user_list) and (beer_list)
subdata = data2[data2.beer_name.isin(beer_list)]
subdata = subdata[subdata.review_profileName.isin(user_list)]

# sort by user names
subdata = subdata.sort_values(by=['review_profileName','beer_name','review_time','beer_style'])

print(subdata.shape)

(1586606, 5)


In [40]:

#To be Pre calculated - TBD everyday/everyhour during service downtime.....
Beer_styles = list(set(subdata['beer_style']))
Popular = {}
for style in Beer_styles:
    new_data = subdata[subdata['beer_style'] == style]
    beers = list(set(new_data['beer_name']))
    tempdict = {}
    popdict = {}
    for beer in beers:
        beerdata = new_data[new_data['beer_name'] == beer]
        if len(beerdata) > 50:
            val = (pd.to_numeric(beerdata['review_overall']).sum())/len(beerdata)
            tempdict[beer] = val
            popdict[beer] = val
    newA = list(sorted(tempdict.items(), key=operator.itemgetter(1), reverse=True)[:10])
    Popular[style] = newA

Most_popular = list(sorted(popdict.items(), key=operator.itemgetter(1), reverse=True)[:10])

   


In [41]:
Most_popular

[(' Duvel', 4.3422448979591834),
 (' Unibroue 10', 4.286363636363636),
 (' Struise Tsjeeses Reserva', 4.2164179104477615),
 (' Southampton Grand Cru', 4.20979020979021),
 (' Unibroue 11', 4.205696202531645),
 (' Damnation', 4.156043956043956),
 (' La Chouffe', 4.13921568627451),
 (' De Proef Signature Les Deux Brasseurs Ale (w/Jason Perkins)',
  4.113970588235294),
 (' De Proef Flemish Primitive Wild Ale (Pig Nun)', 4.105263157894737),
 (' Don de Dieu', 4.10264598540146)]

In [52]:
def cold_start(name, flag, demand = ""):
    recommendation = [item[0] for item in Most_popular[:5]]
    if flag == 0:
        types = demand.split(",")
        if len(types) == 1:
            i = 0
            while((len(recommendation) < 10) and (len(Popular[Beer_styles[int(types[0])]]) < i)):
                if Popular[Beer_styles[int(types[0])]][i][0] not in recommendation:
                        recommendation.append(Popular[Beer_styles[int(types[0])]][i][0])
                i = i + 1

        if len(types) == 2:
            i = 0
            while((len(recommendation) < 8) and (len(Popular[Beer_styles[int(types[0])]]) < i)):
                if Popular[Beer_styles[int(types[0])]][i][0] not in recommendation:
                    recommendation.append(Popular[Beer_styles[int(types[0])]][i][0])
                i = i + 1
                
            while((len(recommendation) < 10) and (len(Popular[Beer_styles[int(types[0])]]) < i)):
                if Popular[Beer_styles[int(types[1])]][i][0] not in recommendation:
                    recommendation.append(Popular[Beer_styles[int(types[0])]][i][0])
                i = i + 1
        
    elif flag == 1: 
        new_data = subdata[subdata['review_profileName'] == name]
        main_beer_style = new_data.groupby("beer_style")['beer_name'].count().reset_index(name='count').sort_values(['count'], ascending=False).head(1)
        k = 1 #new_data['beer_name'].values
        print(type(k))
        print(k)
        i = 0
        while(len(recommendation) < 10):
            if Popular[main_beer_style.iloc[0]['beer_style']][i][0] not in recommendation and Popular[main_beer_style.iloc[0]['beer_style']][i][0] not in new_data['beer_name'].values:
                    recommendation.append(Popular[main_beer_style.iloc[0]['beer_style']][i][0])
            i = i + 1
            print(len(recommendation))
    
    i = 5
    while(len(recommendation) < 10):
        if Most_popular[i][0] not in recommendation:
            recommendation.append(Most_popular[i][0])
        i = i+ 1
    return recommendation


In [53]:
# for a new user
cold_start("Assdas", 0 , "11")


[' Duvel',
 ' Unibroue 10',
 ' Struise Tsjeeses Reserva',
 ' Southampton Grand Cru',
 ' Unibroue 11',
 ' Damnation',
 ' La Chouffe',
 ' De Proef Signature Les Deux Brasseurs Ale (w/Jason Perkins)',
 ' De Proef Flemish Primitive Wild Ale (Pig Nun)',
 ' Don de Dieu']

In [None]:
# for a old user ( Assuming he has less review)
cold_start(" Tilley4", 1)
