In [1]:
user_of_interest = 'tnt'

In [2]:
rand_state = 12

# Cleaning Pipeline

In [3]:
from BeerMe.Pipeline import *

In [4]:
df = IMPORT_CLEAN_STEP(db_path='data/beer.db')



1. NA Count...
ABV                923
global_rating        0
user_rating       3882
IBU              47035
dtype: int64


2. Finding IQR outliers...
FEATURE ABV
num of outliers = 3,421
% of outliers = 3.33%


NA Count...
ABV              0
global_rating    0
user_rating      0
IBU              0
dtype: int64


In [5]:
df = df[~df.duplicated()]

In [6]:
df = df[['username', 'beer_name', 'beer_description', 'brewery', 'ABV', 'IBU', 'user_rating']]

In [7]:
df[df.duplicated()]

Unnamed: 0,username,beer_name,beer_description,brewery,ABV,IBU,user_rating
107169,tsharp93,Firestone Lager,Lager - Helles,Firestone Walker Brewing Company,4.5,17.0,3.5
107174,tsharp93,Head High,IPA - American,Kane Brewing Company,6.6,80.0,3.5
107176,tsharp93,The Drongo,Sour - Other,Clown Shoes,6.5,44.1594,4.25
107177,tsharp93,Forbidden Planet,IPA - Imperial / Double,Raduga,8.4,115.0,4.5
107178,tsharp93,They Both Melt,Lager - Munich Dunkel,Bluejacket,5.6,44.1594,3.25


# 0. Baseline - Average 

In [8]:
global_mean = np.mean(df['user_rating'])
print("Global user_rating mean = {:.2f}".format(global_mean))

Global user_rating mean = 3.71


In [9]:
beer_list = list(df[df['username']=='tsharp93']['beer_name'])

In [10]:
estimated_rating_list = []
error_list = []
for beer in beer_list:
    try:
        estimated_rating = global_mean
        estimated_rating_list.append(estimated_rating)

        user_rating = df[(df['username']=='tsharp93') & (df['beer_name']==beer)]['user_rating'].iloc[0].astype(float)
        error_list.append(estimated_rating-user_rating)
    except IndexError:
        print(beer)

In [11]:
mse = np.mean(np.array(error_list)**2)
mae = np.absolute(error_list).mean()
quarter_error_perc = 100 * np.sum(np.absolute(error_list) < 0.25) / len(error_list)
half_error_perc = 100 * np.sum(np.absolute(error_list) < 0.50) / len(error_list)

print("MSE = {:.2f}".format(mse))
print("MAE = {:.2f}".format(mae))
print("Errors within 0.25 = {:.2f} %".format(quarter_error_perc))
print("Errors within 0.50 = {:.2f} %".format(half_error_perc))

MSE = 0.23
MAE = 0.40
Errors within 0.25 = 35.00 %
Errors within 0.50 = 67.50 %


# 1. Cold Start

### For use when a user does not have enough data to build a model for his/her preferences

In [12]:
df = COSINE_STEP(df, user_of_interest)

User of Reference for Cosine Sim = tnt
(102598, 7)
(102598, 8)


In [13]:
beer_list = list(df[df['username']=='tsharp93']['beer_name'])

In [14]:
estimated_rating_list = []
error_list = []
for beer in beer_list:
    try:
        estimated_rating = df[ (df.sort_values('nearest_neighbor_rank')['beer_name'] == beer) & (df['username']!=user_of_interest) ]['user_rating'].iloc[0]
        estimated_rating_list.append(estimated_rating)

        user_rating = df[(df['username']=='tsharp93') & (df['beer_name']==beer)]['user_rating'].iloc[0].astype(float)
        error_list.append(estimated_rating-user_rating)
    except IndexError:
        print(beer)

In [15]:
mse = np.mean(np.array(error_list)**2)
mae = np.absolute(error_list).mean()
quarter_error_perc = 100 * np.sum(np.absolute(error_list) < 0.25) / len(error_list)
half_error_perc = 100 * np.sum(np.absolute(error_list) < 0.50) / len(error_list)

print("MSE = {:.2f}".format(mse))
print("MAE = {:.2f}".format(mae))
print("Errors within 0.25 = {:.2f} %".format(quarter_error_perc))
print("Errors within 0.50 = {:.2f} %".format(half_error_perc))

MSE = 0.33
MAE = 0.38
Errors within 0.25 = 42.50 %
Errors within 0.50 = 55.00 %


#### Problems with this approach: 
#### (1) Need another user to have the beer I have selected. If I want to get my user_rating for a beer that no one has tried, I can't do that. 
#### (2) Not very accurate. Almost half the points are outside of 0.5 error - sometimes worse than baseline

# 2. Content Based Filtering 

### *Assumes the user of interest has enough data to build a model 

### A. Encoding Beer Description

In [16]:
user_df = df[df['username'] == user_of_interest]
len(user_df)

1684

In [17]:
user_df.head()

Unnamed: 0,nearest_neighbor_rank,username,beer_name,beer_description,brewery,ABV,IBU,user_rating
100914,,tnt,Society & Solitude #7,IPA - Imperial / Double,Hill Farmstead Brewery,8.0,44.1594,4.25
100915,,tnt,Riwaka Single Hop Pale Ale,Pale Ale - American,Hill Farmstead Brewery,5.2,44.1594,5.0
100916,,tnt,"Walk, Don't Run",Blonde Ale,Suarez Family Brewery,3.7,44.1594,4.5
100917,,tnt,Czech Two,Pilsner - Czech,Gravely Brewing Co,4.1,39.0,4.25
100918,,tnt,Spirit of Revolt,IPA - Imperial / Double,Revolution Brewing Company,8.2,52.0,4.0


#### Feature Engineering - Encode Categorical Beer Description

In [18]:
user_df = convert_categorical(user_df, ['beer_description'])

In [19]:
user_df.drop(['username', 'nearest_neighbor_rank', 'beer_description', 'beer_name', 'brewery'], axis=1, inplace=True)

In [20]:
user_df.shape

(1684, 135)

In [21]:
user_df.head()

Unnamed: 0,ABV,IBU,user_rating,beer_description_Altbier,beer_description_American Wild Ale,beer_description_Barleywine - American,beer_description_Barleywine - English,beer_description_Belgian Blonde,beer_description_Belgian Dubbel,beer_description_Belgian Quadrupel,...,beer_description_Stout - Russian Imperial,beer_description_Strong Ale - American,beer_description_Strong Ale - English,beer_description_Table Beer,beer_description_Traditional Ale,beer_description_Wheat Beer - American Pale Wheat,beer_description_Wheat Beer - Other,beer_description_Wheat Wine,beer_description_Winter Ale,beer_description_Witbier
100914,8.0,44.1594,4.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100915,5.2,44.1594,5.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100916,3.7,44.1594,4.5,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100917,4.1,39.0,4.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100918,8.2,52.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Build Model and Score

In [22]:
features = user_df.columns[list(user_df.columns != 'user_rating')]
target = 'user_rating'

In [23]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(user_df[features], user_df[target], random_state=rand_state)

from sklearn.linear_model import LassoCV
lassocv = LassoCV(fit_intercept=True, normalize=True, cv=5, random_state=rand_state)
lassocv.fit(X_train, y_train)

from sklearn.linear_model import Lasso
lasso = Lasso(alpha=lassocv.alpha_,fit_intercept=True, normalize=True, random_state=rand_state)
lasso.fit(X_train, y_train)

Lasso(alpha=0.000452166405093667, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=12, selection='cyclic', tol=0.0001, warm_start=False)

In [24]:
coef_dropped_perc = 100*np.sum(lasso.coef_ == 0) / len(lasso.coef_)
print("Percentage of estimators dropped = {:.2f} %".format(coef_dropped_perc))

features_dropped = list(features[lasso.coef_ == 0])
print("Features dropped: ", features_dropped)

features_kept = list(features[lasso.coef_ != 0])
print("Features kept: ", features_kept)

Percentage of estimators dropped = 62.69 %
Features dropped:  ['beer_description_Altbier', 'beer_description_Barleywine - American', 'beer_description_Barleywine - English', 'beer_description_Belgian Blonde', 'beer_description_Belgian Strong Golden Ale', 'beer_description_Belgian Tripel', 'beer_description_Bière de Champagne / Bière Brut', 'beer_description_Bière de Garde', 'beer_description_Bière de Mars', 'beer_description_Blonde Ale', 'beer_description_Bock - Doppelbock', 'beer_description_Bock - Hell / Maibock / Lentebock', 'beer_description_Brown Ale - Belgian', 'beer_description_Brown Ale - English', 'beer_description_Brown Ale - Imperial / Double', 'beer_description_California Common', 'beer_description_Cider - Traditional', 'beer_description_Dunkelweizen', 'beer_description_English Bitter', 'beer_description_Fruit Beer', 'beer_description_Golden Ale', 'beer_description_Grisette', 'beer_description_Gruit / Ancient Herbed Ale', 'beer_description_IPA - American', 'beer_description

In [25]:
preds = lasso.predict(X_test)
error_list = preds - y_test

mse = np.mean(np.array(error_list)**2)
mae = np.absolute(error_list).mean()
quarter_error_perc = 100 * np.sum(np.absolute(error_list) < 0.25) / len(error_list)
half_error_perc = 100 * np.sum(np.absolute(error_list) < 0.50) / len(error_list)

print("MSE = {:.2f}".format(mse))
print("MAE = {:.2f}".format(mae))
print("Errors within 0.25 = {:.2f} %".format(quarter_error_perc))
print("Errors within 0.50 = {:.2f} %".format(half_error_perc))

MSE = 0.24
MAE = 0.37
Errors within 0.25 = 44.42 %
Errors within 0.50 = 72.92 %


##### Improved Accuracy over both Cold Start and Baseline

### B. NLP (Count Vectorizer)

In [26]:
user_df = df[df['username'] == user_of_interest]

In [27]:
user_df.head()

Unnamed: 0,nearest_neighbor_rank,username,beer_name,beer_description,brewery,ABV,IBU,user_rating
100914,,tnt,Society & Solitude #7,IPA - Imperial / Double,Hill Farmstead Brewery,8.0,44.1594,4.25
100915,,tnt,Riwaka Single Hop Pale Ale,Pale Ale - American,Hill Farmstead Brewery,5.2,44.1594,5.0
100916,,tnt,"Walk, Don't Run",Blonde Ale,Suarez Family Brewery,3.7,44.1594,4.5
100917,,tnt,Czech Two,Pilsner - Czech,Gravely Brewing Co,4.1,39.0,4.25
100918,,tnt,Spirit of Revolt,IPA - Imperial / Double,Revolution Brewing Company,8.2,52.0,4.0


#### Feature Engineering - Count Vectorizer

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [29]:
X = vect.fit_transform(user_df['beer_description'])

tfidf_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names())
user_df = pd.concat([user_df.reset_index(drop=True), tfidf_df], axis=1)

In [30]:
user_df.drop(['username', 'nearest_neighbor_rank', 'beer_description', 'beer_name', 'brewery'], axis=1, inplace=True)

In [31]:
user_df.shape

(1684, 129)

In [32]:
user_df.head()

Unnamed: 0,ABV,IBU,user_rating,abbey,ale,altbier,amber,american,ancient,baltic,...,wee,weisse,wheat,white,wild,wine,winter,witbier,yam,zwickelbier
0,8.0,44.1594,4.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.2,44.1594,5.0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.7,44.1594,4.5,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.1,39.0,4.25,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8.2,52.0,4.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Build Model and Score

In [33]:
features = user_df.columns[list(user_df.columns != 'user_rating')]
target = 'user_rating'

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(user_df[features], user_df[target], random_state=rand_state)

from sklearn.linear_model import LassoCV
lassocv = LassoCV(fit_intercept=True, normalize=True, cv=5, random_state=rand_state)
lassocv.fit(X_train, y_train)

from sklearn.linear_model import Lasso
lasso = Lasso(alpha=lassocv.alpha_,fit_intercept=True, normalize=True, random_state=rand_state)
lasso.fit(X_train, y_train)

Lasso(alpha=0.0002774451133807317, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=12, selection='cyclic', tol=0.0001, warm_start=False)

In [35]:
coef_dropped_perc = 100*np.sum(lasso.coef_ == 0) / len(lasso.coef_)
print("Percentage of estimators dropped = {:.2f} %".format(coef_dropped_perc))

features_dropped = list(features[lasso.coef_ == 0])
print("Features dropped: ", features_dropped)

features_kept = list(features[lasso.coef_ != 0])
print("Features kept: ", features_kept)

Percentage of estimators dropped = 53.91 %
Features dropped:  ['abbey', 'ale', 'altbier', 'amber', 'american', 'ancient', 'belgian', 'berliner', 'bière', 'blonde', 'bock', 'braggot', 'bruin', 'brut', 'california', 'cascadian', 'champagne', 'common', 'czech', 'de', 'doppelbock', 'dortmunder', 'double', 'dunkelweizen', 'euro', 'export', 'flanders', 'framboise', 'garde', 'german', 'golden', 'gose', 'grape', 'gruit', 'heavy', 'hell', 'herbed', 'international', 'ipa', 'italian', 'kellerbier', 'kriek', 'kölsch', 'lambic', 'lentebock', 'maibock', 'mars', 'milk', 'oatmeal', 'old', 'oud', 'patersbier', 'pilsner', 'porter', 'pumpkin', 'pyment', 'roggenbier', 'schwarzbier', 'scotch', 'session', 'strong', 'triple', 'vienna', 'wee', 'weisse', 'wheat', 'winter', 'yam', 'zwickelbier']
Features kept:  ['ABV', 'IBU', 'baltic', 'barleywine', 'beer', 'bitter', 'black', 'brown', 'cider', 'coffee', 'cream', 'dark', 'dry', 'dubbel', 'england', 'english', 'extra', 'farmhouse', 'fruit', 'fruited', 'grisette',

In [36]:
preds = lasso.predict(X_test)
error_list = preds - y_test

mse = np.mean(np.array(error_list)**2)
mae = np.absolute(error_list).mean()
quarter_error_perc = 100 * np.sum(np.absolute(error_list) < 0.25) / len(error_list)
half_error_perc = 100 * np.sum(np.absolute(error_list) < 0.50) / len(error_list)

print("MSE = {:.2f}".format(mse))
print("MAE = {:.2f}".format(mae))
print("Errors within 0.25 = {:.2f} %".format(quarter_error_perc))
print("Errors within 0.50 = {:.2f} %".format(half_error_perc))

MSE = 0.24
MAE = 0.37
Errors within 0.25 = 47.03 %
Errors within 0.50 = 72.21 %


### C. NLP (TFIDF)

In [37]:
user_df = df[df['username'] == user_of_interest]

In [38]:
user_df.head()

Unnamed: 0,nearest_neighbor_rank,username,beer_name,beer_description,brewery,ABV,IBU,user_rating
100914,,tnt,Society & Solitude #7,IPA - Imperial / Double,Hill Farmstead Brewery,8.0,44.1594,4.25
100915,,tnt,Riwaka Single Hop Pale Ale,Pale Ale - American,Hill Farmstead Brewery,5.2,44.1594,5.0
100916,,tnt,"Walk, Don't Run",Blonde Ale,Suarez Family Brewery,3.7,44.1594,4.5
100917,,tnt,Czech Two,Pilsner - Czech,Gravely Brewing Co,4.1,39.0,4.25
100918,,tnt,Spirit of Revolt,IPA - Imperial / Double,Revolution Brewing Company,8.2,52.0,4.0


#### Feature Engineering - Count Vectorizer

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()

In [40]:
X = vect.fit_transform(user_df['beer_description'])

tfidf_df = pd.DataFrame(X.toarray(), columns=vect.get_feature_names())
user_df = pd.concat([user_df.reset_index(drop=True), tfidf_df], axis=1)

In [41]:
user_df.drop(['username', 'nearest_neighbor_rank', 'beer_description', 'beer_name', 'brewery'], axis=1, inplace=True)

In [42]:
user_df.shape

(1684, 129)

In [43]:
user_df.head()

Unnamed: 0,ABV,IBU,user_rating,abbey,ale,altbier,amber,american,ancient,baltic,...,wee,weisse,wheat,white,wild,wine,winter,witbier,yam,zwickelbier
0,8.0,44.1594,4.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5.2,44.1594,5.0,0.0,0.490558,0.0,0.0,0.472183,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.7,44.1594,4.5,0.0,0.346861,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.1,39.0,4.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.2,52.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Build Model and Score

In [44]:
features = user_df.columns[list(user_df.columns != 'user_rating')]
target = 'user_rating'

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(user_df[features], user_df[target], random_state=rand_state)

from sklearn.linear_model import LassoCV
lassocv = LassoCV(fit_intercept=True, normalize=True, cv=5, random_state=rand_state)
lassocv.fit(X_train, y_train)

from sklearn.linear_model import Lasso
lasso = Lasso(alpha=lassocv.alpha_,fit_intercept=True, normalize=True, random_state=rand_state)
lasso.fit(X_train, y_train)

Lasso(alpha=0.0002774451133807317, copy_X=True, fit_intercept=True,
   max_iter=1000, normalize=True, positive=False, precompute=False,
   random_state=12, selection='cyclic', tol=0.0001, warm_start=False)

In [46]:
coef_dropped_perc = 100*np.sum(lasso.coef_ == 0) / len(lasso.coef_)
print("Percentage of estimators dropped = {:.2f} %".format(coef_dropped_perc))

features_dropped = list(features[lasso.coef_ == 0])
print("Features dropped: ", features_dropped)

features_kept = list(features[lasso.coef_ != 0])
print("Features kept: ", features_kept)

Percentage of estimators dropped = 56.25 %
Features dropped:  ['abbey', 'ale', 'altbier', 'amber', 'american', 'ancient', 'belgian', 'berliner', 'bière', 'blonde', 'bock', 'braggot', 'bruin', 'brut', 'california', 'cascadian', 'champagne', 'cider', 'common', 'czech', 'de', 'doppelbock', 'dortmunder', 'double', 'dunkelweizen', 'euro', 'export', 'flanders', 'framboise', 'garde', 'german', 'golden', 'gose', 'grape', 'gruit', 'heavy', 'hell', 'herbed', 'international', 'ipa', 'ipl', 'italian', 'kellerbier', 'kriek', 'kölsch', 'lambic', 'lentebock', 'maibock', 'mars', 'milk', 'oatmeal', 'old', 'oud', 'patersbier', 'pilsner', 'porter', 'pumpkin', 'pyment', 'roggenbier', 'rye', 'schwarzbier', 'scotch', 'session', 'strong', 'triple', 'vienna', 'wee', 'weisse', 'wheat', 'winter', 'yam', 'zwickelbier']
Features kept:  ['ABV', 'IBU', 'baltic', 'barleywine', 'beer', 'bitter', 'black', 'brown', 'coffee', 'cream', 'dark', 'dry', 'dubbel', 'england', 'english', 'extra', 'farmhouse', 'fruit', 'fruited

In [47]:
preds = lasso.predict(X_test)
error_list = preds - y_test

mse = np.mean(np.array(error_list)**2)
mae = np.absolute(error_list).mean()
quarter_error_perc = 100 * np.sum(np.absolute(error_list) < 0.25) / len(error_list)
half_error_perc = 100 * np.sum(np.absolute(error_list) < 0.50) / len(error_list)

print("MSE = {:.2f}".format(mse))
print("MAE = {:.2f}".format(mae))
print("Errors within 0.25 = {:.2f} %".format(quarter_error_perc))
print("Errors within 0.50 = {:.2f} %".format(half_error_perc))

MSE = 0.24
MAE = 0.37
Errors within 0.25 = 46.79 %
Errors within 0.50 = 72.21 %


In [48]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, preds)

0.369173870352508

# Hybrid 