In [50]:
from __future__ import annotations
from typing import Tuple
import pandas as pd
import pickle
import numpy as np
from scipy import spatial
import sklearn as skl
import os
import json
import fasttext
import fasttext.util
import string
import math
from nltk.corpus import stopwords
import nltk
# Standard English stop words plus a few extra words; every token found in this
# set is dropped from the descriptions when they are cleaned.
stops = set(stopwords.words('english')) | {'notes', 'note', 'hint', 'hints'}

In [51]:
# First download the model from https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip
# and extract the .bin file. The download is VERY large (roughly 9 GB).

# Load the pretrained fastText model from the extracted .bin file; `wmd` is the
# handle the rest of the notebook uses to look up word vectors.
wmd = fasttext.load_model('../glove/wiki.en/wiki.en.bin')



In [3]:
# Uncomment the next line to reduce the dimensionality of the word vectors. In principle, larger vectors are more accurate; the model's default is 300 dimensions.
#fasttext.util.reduce_model(wmd, 100)

<fasttext.FastText._FastText at 0x7fa2f070e760>

In [94]:
# Load the beer data from the JSON file. The context manager closes the file
# handle once parsing is done, and the file is read as UTF-8 explicitly rather
# than relying on the platform's default encoding.
with open('../data/beer_foods_tastes.json', 'r', encoding='utf-8') as beer_file:
    beers = json.load(beer_file)

# Take the vector size from the loaded model instead of hard-coding 300, so the
# loop keeps working if the model was shrunk with fasttext.util.reduce_model.
vector_dim = wmd.get_dimension()

# NOTE: partition_words is defined in a later cell of this notebook, so that
# cell has to be executed before this one.
beer_taste_vectors = {}
for b, beer in beers.items():
    desc = beer['taste_desc']
    try:
        # extract the colour, texture and taste words
        col, feel, taste = partition_words(desc)
    except IndexError:
        # partition_words indexes the second comma-separated term, so a
        # description with fewer than two terms raises IndexError; such beers
        # are reported and skipped.
        print(f'index error for {b}')
        continue
    # sum the word vectors of all taste terms into a single vector
    tasteVec = np.zeros(vector_dim)
    for ta in taste:
        tasteVec += wmd.get_word_vector(ta)
    # store the summed vector representation in the beer_taste_vectors dict
    beer_taste_vectors[b] = tasteVec


index error for 904914
index error for 902532
index error for 921188


In [58]:
# These functions are not used.
# They were an attempt to guess whether a word describes taste, appearance or texture, but the results were not accurate enough.
#def identify_token(token: str) -> str:
#        label = measure_attributes(wmd.get_word_vector(token))
#        if label is not None:
#            return label
#        else:
#            raise ValueError(f'Labelling fail for {token} with label {label}')
#    
#def measure_attributes(v1: np.ndarray) -> tuple:
#    scores = {}
#    scores.clear()
#    scores['appearance'] = distance(v1, vAppearance)
#    scores['color'] = distance(v1, vColor)
#    scores['taste'] = distance(v1, vTaste)
#    scores['aroma'] = distance(v1, vAroma)
#    scores['mouthfeel'] = distance(v1, vMouthFeel)
#    scores['texture']  = distance(v1, vTexture)
#    print(scores)
#    return max(scores, key=scores.get)
#    
#
#def distance(v1: np.ndarray, v2: np.ndarray) -> float: 
#    return (1 - spatial.distance.cosine(v1, v2)) 

In [180]:
def nearest_beers(beer_id: str, n=25):
    """Rank beers by cosine similarity of their taste vectors to one beer.

    Reads the module-level ``beer_taste_vectors`` mapping (beer id -> vector)
    and prints how many comparisons were skipped and how many succeeded.

    Args:
        beer_id: key of the reference beer in ``beer_taste_vectors``.
        n: maximum number of ranked entries to return.

    Returns:
        A list of ``(beer_id, similarity)`` tuples, most similar first. The
        reference beer itself takes part in the ranking.

    Raises:
        KeyError: if ``beer_id`` is not a key of ``beer_taste_vectors``.
    """
    ranking = {}
    errors = 0
    success = 0
    groundVect = beer_taste_vectors[beer_id]
    # Cosine similarity is undefined when either vector is all zeros (zero
    # norm); the result would be NaN, and NaN values make sorting unreliable.
    ground_is_zero = not np.any(groundVect)
    for b, compareVect in beer_taste_vectors.items():
        if ground_is_zero or not np.any(compareVect):
            errors += 1
            continue
        try:
            similarity = 1 - spatial.distance.cosine(groundVect, compareVect)
        except ValueError:
            # e.g. the two vectors cannot be compared because their shapes differ
            errors += 1
            continue
        if not np.isfinite(similarity):
            # guard against NaN/inf entries inside the vectors themselves
            errors += 1
            continue
        ranking[b] = similarity
        success += 1
    print(f'{errors} errors')
    print(f'{success} done')
    # Ascending sort followed by a reverse keeps the original ordering of ties.
    sorted_list = sorted(ranking.items(), key=lambda item: item[1])
    sorted_list.reverse()
    return sorted_list[:n]
        

In [181]:
def partition_words(desc: str) -> tuple[str, str, list[str]]:
    """Split a taste description into colour, mouthfeel and taste terms.

    The description is cleaned with ``process`` and split on commas. The first
    term is returned as the colour, the second as the mouthfeel, and all
    remaining terms as the list of taste terms (possibly empty).

    Args:
        desc: raw comma-separated description string.

    Returns:
        A ``(color, mouthfeel, taste)`` tuple where ``taste`` is a list of
        strings.

    Raises:
        IndexError: if the cleaned description has fewer than two
            comma-separated terms.
    """
    terms = process(desc).split(',')
    # Index explicitly instead of star-unpacking: callers catch IndexError for
    # descriptions that are too short, and unpacking would raise ValueError.
    color = terms[0]
    mouthfeel = terms[1]
    taste = terms[2:]
    return color, mouthfeel, taste

def process(desc: str) -> str:
    """Normalise a comma-separated description and drop stop words.

    The text is lower-cased, stripped of surrounding whitespace, and hyphens
    are replaced by underscores. Each comma-separated chunk is then split on
    whitespace, tokens found in the module-level ``stops`` set are removed, and
    the remaining tokens are re-joined with single spaces. Chunks are joined
    back together with commas, so the number of chunks is unchanged.
    """
    normalised = desc.lower().strip().replace('-', '_')
    cleaned_chunks = [
        ' '.join(word for word in chunk.split() if word not in stops)
        for chunk in normalised.split(',')
    ]
    return ','.join(cleaned_chunks)

In [182]:
# Show the 25 beers ranked closest to beer 914782, one "id<TAB>similarity" row each.
closest = nearest_beers('914782', n=25)
for ranked_id, similarity in closest:
    print(ranked_id, similarity, sep='\t')

0 errors
1153 done
918895	1
914782	1
924208	0.9261185413166917
951272	0.9250384829635898
906362	0.9250384829635898
944094	0.9250384829635898
933365	0.9187346696817323
945748	0.9187346696817323
958154	0.9170359218713465
959684	0.9081323325230717
915645	0.9081323325230717
958438	0.9029874894647677
945454	0.894810640220399
936093	0.8907280581646628
946598	0.8880899557958529
904124	0.8880899557958529
911982	0.8837754604299842
910643	0.8837754604299842
923116	0.8837754604299842
956468	0.8837754604299842
953912	0.8837754604299842
905853	0.8837754604299842
912546	0.8827788698677748
933304	0.8827463540904674
923204	0.8827463540904674


In [169]:
# Print the cubed Euclidean distance between the word vectors of each country pair.
for first, second in (('finland', 'spain'), ('finland', 'sweden'), ('spain', 'sweden')):
    pair_distance = spatial.distance.euclidean(
        wmd.get_word_vector(first), wmd.get_word_vector(second)
    )
    print(pair_distance ** 3)

100.26931648301252
33.901781855939504
80.08023513648725
