In [90]:
from __future__ import annotations
from typing import Tuple
import pandas as pd
import pickle
import numpy as np
from scipy import spatial
import sklearn as skl
import os
import json
import fasttext
import fasttext.util
import string
import math
from nltk.corpus import stopwords
import nltk
stops = set(stopwords.words('english'))
stops.add('notes')
stops.add('note')
stops.add('hint')
stops.add('hints')

In [6]:
wmd = fasttext.load_model('../glove/wiki.en/wiki.en.bin')



In [7]:
fasttext.util.reduce_model(wmd, 100)

<fasttext.FastText._FastText at 0x7f3b2f358c40>

In [52]:
beers = json.load(open('../data/beer_foods_tastes.json','r'))
beer_taste_vectors = {}
for b in beers.keys():
    desc = beers[b]['taste_desc']
    beer_taste_vectors[b] = np.array([wmd.get_word_vector(d) for d in desc.split()][0]).flatten()
print(len(beer_taste_vectors))
vAppearance = wmd.get_word_vector('appearance')
vColor = wmd.get_word_vector('color')
vTaste = wmd.get_word_vector('taste')
vAroma = wmd.get_word_vector('aroma')
vMouthFeel = wmd.get_word_vector('mouthfeel')
vTexture = wmd.get_word_vector('texture')

1156


In [58]:
def identify_token(token: str) -> str:
        label = measure_attributes(wmd.get_word_vector(token))
        if label is not None:
            return label
        else:
            raise ValueError(f'Labelling fail for {token} with label {label}')
    
def measure_attributes(v1: np.ndarray) -> tuple:
    scores = {}
    scores.clear()
    scores['appearance'] = distance(v1, vAppearance)
    scores['color'] = distance(v1, vColor)
    scores['taste'] = distance(v1, vTaste)
    scores['aroma'] = distance(v1, vAroma)
    scores['mouthfeel'] = distance(v1, vMouthFeel)
    scores['texture']  = distance(v1, vTexture)
    print(scores)
    return max(scores, key=scores.get)
    

def distance(v1: np.ndarray, v2: np.ndarray) -> float: 
    return (1 - spatial.distance.cosine(v1, v2)) 

In [56]:
def nearest_beer(beer_id: str):
    ranking = {}
    errors = 0
    success = 0
    groundVect = beer_taste_vectors[beer_id]
    for b in beer_taste_vectors.keys():
        try:
            compareVect = beer_taste_vectors[b]
            #print(compareVect.shape, groundVect.shape)
            ranking[b] = math.sqrt((1 - spatial.distance.cosine(groundVect, compareVect)))
            success += 1
        except ValueError as e:
            #print(f'skipped processing for {b}')
            #print(e)
            errors += 1
    print(f'{errors} errors')
    print(f'{success} done')
    return dict(sorted(ranking.items(), key=lambda item: item[1]))
        

In [175]:
def partition_words(str: desc) -> tuple[str]:
    terms = process(desc).split(',')
    #print(terms)
    color = terms[0]
    #print(color)
    mouthfeel = terms[1]
    #print(mouthfeel)
    taste = terms[2:]
    #print(taste)
    return color, mouthfeel, taste

def process(desc: str) -> str:
    tempTerms = []
    desc = desc.lower().strip()
    mapping = desc.maketrans({'-': '_'})
    desc = desc.translate(mapping)
    for terms in desc.split(','):
        #print(terms)
        term = ' '.join([ t for t in terms.split() if (t not in stops)])
        #print(term)
        tempTerms.append(term)
    #print(tempTerms)
    return ','.join(tempTerms)

In [176]:
col, textu, taste = partition_words("Golden-yellow, full-bodied, cloudy, very strongly hopped, apricot notes, fruity \n")

In [177]:
spatial.distance.euclidean( wmd.get_word_vector('toasted malt'), wmd.get_word_vector('toasted_malt'))

3.138782501220703

In [178]:
col, feel, taste = partition_words("Golden-yellow, full-bodied, cloudy, very strongly hopped, apricot notes, fruity \n")

In [179]:
col

'cinnamon_brown'

In [180]:
feel

'extra full_bodied'

In [181]:
taste

['cloudy', 'mildly hopped', 'toasted malt', 'grainy', 'aromatic']