In [6]:
from __future__ import annotations
from typing import Tuple
import pandas as pd
import pickle
import numpy as np
from scipy import spatial
import sklearn as skl
import os
import json
import fasttext
import fasttext.util
import string
import math
from nltk.corpus import stopwords
import nltk
import functools
import operator
# English stop words from NLTK, extended with filler words ("note(s)",
# "hint(s)") that are filtered out of the tasting notes by `process`.
stops = set(stopwords.words('english')) | {'notes', 'note', 'hint', 'hints'}


In [None]:
# Load the fastText model from its binary file. The resulting `wmd` object is
# the module-level source of word vectors (`wmd.get_word_vector`) used by the
# compute*Vect helpers defined below.
wmd = fasttext.load_model('../glove/wiki.en/wiki.en.bin')

In [101]:
# Build a Finnish -> English country-name lookup from two line-aligned files:
# line i of countriesFI is the Finnish name of the country on line i of
# countriesEN. English names are lowercased.
# `with` guarantees both files are closed once their lines have been read.
with open('../data/countriesEN', 'r') as eng_file:
    eng = eng_file.readlines()
with open('../data/countriesFI', 'r') as fin_file:
    fin = fin_file.readlines()

# NOTE(review): zip() stops at the shorter file, so a length mismatch between
# the two files would silently drop the trailing entries.
country_dict = {f.strip(): e.strip().lower() for e, f in zip(eng, fin)}

print(country_dict)


{'Argentiina': 'argentina', 'Australia': 'australia', 'Bolivia': 'bolivia', 'Bulgaria': 'bulgaria', 'Chile': 'chile', 'Englanti': 'english', 'Espanja': 'spain', 'Etelä-Afrikka': 'south africa', 'Euroopan unioni': 'european union', 'Georgia': 'georgia', 'Intia': 'india', 'Israel': 'israel', 'Italia': 'italy', 'Itävalta': 'austria', 'Kanada': 'canada', 'Kiina': 'china', 'Kreikka': 'greece', 'Kroatia': 'croatia', 'Libanon': 'lebanon', 'Luxemburg': 'luxembourg', 'Meksiko': 'mexico', 'Moldova': 'moldova', 'Montenegro': 'montenegro', 'Muu alkuperämaa': 'other country of origin', 'Peru': 'peru', 'Portugali': 'portugal', 'Ranska': 'france', 'Romania': 'romania', 'Saksa': 'germany', 'Serbia': 'serbia', 'Slovakia': 'slovakia', 'Slovenia': 'slovenia', 'Sveitsi': 'switzerland', 'Tsekki': 'czech republic', 'Turkki': 'turkey', 'Unkari': 'hungary', 'Uruguay': 'uruguay', 'Uusi-Seelanti': 'new zealand', 'Venäjä': 'russia', 'Yhdysvallat': 'united states', 'Japani': 'japan', 'Kypros': 'cyprus', 'Pohjois-

In [84]:
def partition_words(desc: str) -> tuple[list[str], list[str], list[str]]:
    """Split a comma-separated tasting note into colour, mouthfeel and taste terms.

    The description is first normalised with ``process``. The first
    comma-separated term is treated as the colour, the second as the
    mouthfeel, and every remaining term as a taste descriptor.

    Args:
        desc: Raw tasting note, e.g.
            ``"Golden-yellow, acidic, ripe apricot notes"``.

    Returns:
        A 3-tuple ``(color, mouthfeel, taste)`` in which every element is a
        list of strings. ``color`` and ``mouthfeel`` hold exactly one term
        each; ``taste`` holds zero or more. All three lists are empty when
        the description contains fewer than two comma-separated terms.
    """
    terms = process(desc).split(',')
    # A single term cannot be split into colour and mouthfeel. Return empty
    # lists (not empty strings) so every slot has the same type on all paths.
    if len(terms) < 2:
        return [], [], []
    color, mouthfeel, *taste = terms
    return [color], [mouthfeel], taste
def process(desc: str) -> str:
    """Normalise a comma-separated description.

    The text is lowercased and stripped, hyphens and underscores are turned
    into spaces, and within each comma-separated chunk every word found in
    the module-level ``stops`` set is removed. Chunks are re-joined with
    single commas; words inside a chunk are re-joined with single spaces.

    Args:
        desc: Raw comma-separated description.

    Returns:
        The cleaned description, still comma-separated, with the same number
        of chunks as the input (a chunk may become an empty string).
    """
    separators_to_space = str.maketrans({'-': ' ', '_': ' '})
    normalized = desc.lower().strip().translate(separators_to_space)
    cleaned_chunks = [
        ' '.join(word for word in chunk.split() if word not in stops)
        for chunk in normalized.split(',')
    ]
    return ','.join(cleaned_chunks)

def _sum_word_vectors(desc: tuple, position: int) -> np.ndarray:
    """Sum the word vectors of every token in one slot of a partitioned description.

    Args:
        desc: Partitioned description; ``desc[position]`` is handed to
            ``extractTokens`` to obtain the individual words.
        position: Index of the slot to embed.

    Returns:
        The element-wise sum of ``wmd.get_word_vector(token)`` over all
        tokens of the slot (an all-zero vector when the slot has no tokens),
        or ``np.nan`` when ``desc`` has no element at ``position``.
    """
    try:
        phrases = desc[position]
    except IndexError:
        return np.nan
    # Size the accumulator from the loaded model instead of hard-coding 300,
    # so a model of a different dimension does not cause a shape mismatch.
    total = np.zeros(wmd.get_dimension())
    for token in extractTokens(phrases):
        total = np.add(total, wmd.get_word_vector(token))
    return total


def computeTasteVect(desc: tuple) -> np.ndarray:
    """Return the summed word vector of the taste terms (``desc[2]``).

    Returns ``np.nan`` when ``desc`` has fewer than three elements.
    """
    return _sum_word_vectors(desc, 2)


def computeColorVect(desc: tuple) -> np.ndarray:
    """Return the summed word vector of the colour terms (``desc[0]``).

    Returns ``np.nan`` when ``desc`` is empty.
    """
    return _sum_word_vectors(desc, 0)


def computeFeelVect(desc: tuple) -> np.ndarray:
    """Return the summed word vector of the mouthfeel terms (``desc[1]``).

    Returns ``np.nan`` when ``desc`` has fewer than two elements.
    """
    return _sum_word_vectors(desc, 1)

def extractTokens(l: list) -> list[str]:
    """Flatten a list of phrases into a flat list of whitespace-separated words.

    Args:
        l: List of phrase strings, e.g. ``['golden yellow', 'acidic']``.
            A bare string is treated as a single phrase rather than being
            iterated character by character.

    Returns:
        All words of all phrases, in order. Empty phrases and repeated
        spaces contribute no tokens, so the result never contains ``''``.
    """
    if isinstance(l, str):
        l = [l]
    # str.split() without an argument drops empty strings, unlike split(' ').
    return [word for phrase in l for word in phrase.split()]

def computeCountryVect(maa: str) -> np.ndarray:
    """Look up the word vector of a country name in the loaded model.

    Args:
        maa: Country name; it is passed to ``wmd.get_word_vector`` unchanged.
            NOTE(review): multi-word names are therefore looked up as one
            string, not word by word - confirm this is intended.

    Returns:
        Whatever ``wmd.get_word_vector`` returns for ``maa``.
    """
    country_vector = wmd.get_word_vector(maa)
    return country_vector

In [85]:
# Load the beer data from JSON. With orient='index' each top-level key of the
# file becomes a row label of the resulting frame.
# The `with` block closes the file handle as soon as pandas has parsed it.
with open('../data/beer_foods_tastes.json', 'r') as beer_file:
    bdf = pd.read_json(beer_file, orient='index')
bdf.head()

Unnamed: 0,foods,taste_desc
718897,"[Tapas_ja_antipasti, Seurustelujuoma, Miedot_j...","Golden-yellow, acidic, ripe apricot notes, tea..."
715894,"[Grilliruoka, Kana_kalkkuna, Miedot_juustot, P...","Amber-yellow, full-bodied, cloudy, with a rich..."
758594,"[Rasvainen_kala, Miedot_juustot, Nautiskelujuo...","Caramel-brown, full-bodied, cloudy, with a ric..."
730097,"[Voimakkaat_juustot, Riistalinnut, Nautiskeluj...","Mahogany-brown, full-bodied, cloudy, with a ri..."
919855,"[Ayriaiset, Vaharasvainen_kala, Nautiskelujuoma]","Straw-yellow, medium-bodied, cloudy, mildly ho..."


In [86]:
# Split every tasting note into its (colour, mouthfeel, taste) partition and
# keep the result in a new `taste_desc_partition` column.
bdf = bdf.assign(taste_desc_partition=bdf['taste_desc'].apply(partition_words))
bdf.head()

Unnamed: 0,foods,taste_desc,taste_desc_partition
718897,"[Tapas_ja_antipasti, Seurustelujuoma, Miedot_j...","Golden-yellow, acidic, ripe apricot notes, tea...","([golden yellow], [acidic], [ripe apricot, tea..."
715894,"[Grilliruoka, Kana_kalkkuna, Miedot_juustot, P...","Amber-yellow, full-bodied, cloudy, with a rich...","([amber yellow], [full bodied], [cloudy, rich ..."
758594,"[Rasvainen_kala, Miedot_juustot, Nautiskelujuo...","Caramel-brown, full-bodied, cloudy, with a ric...","([caramel brown], [full bodied], [cloudy, rich..."
730097,"[Voimakkaat_juustot, Riistalinnut, Nautiskeluj...","Mahogany-brown, full-bodied, cloudy, with a ri...","([mahogany brown], [full bodied], [cloudy, ric..."
919855,"[Ayriaiset, Vaharasvainen_kala, Nautiskelujuoma]","Straw-yellow, medium-bodied, cloudy, mildly ho...","([straw yellow], [medium bodied], [cloudy, mil..."


In [93]:
# Embed each slot of the partitioned description as a summed word vector and
# name the index 'Numero' so it can be matched against the Alko price list.
bdf = bdf.assign(
    taste_vect=bdf['taste_desc_partition'].apply(computeTasteVect),
    col_vect=bdf['taste_desc_partition'].apply(computeColorVect),
    feel_vect=bdf['taste_desc_partition'].apply(computeFeelVect),
).rename_axis('Numero')
bdf.head()

Unnamed: 0_level_0,foods,taste_desc,taste_desc_partition,taste_vect,col_vect,feel_vect
Numero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
718897,"[Tapas_ja_antipasti, Seurustelujuoma, Miedot_j...","Golden-yellow, acidic, ripe apricot notes, tea...","([golden yellow], [acidic], [ripe apricot, tea...","[0.07987281028181314, 0.43386150151491165, -0....","[-0.9887842833995819, -0.05810368980746716, 0....","[-0.2133956402540207, 0.22910979390144348, -0...."
715894,"[Grilliruoka, Kana_kalkkuna, Miedot_juustot, P...","Amber-yellow, full-bodied, cloudy, with a rich...","([amber yellow], [full bodied], [cloudy, rich ...","[-1.7081294995732605, 0.27396272122859955, -1....","[-0.14614155888557434, 0.19808457244653255, 0....","[-0.1657096636481583, 0.44968753308057785, -0...."
758594,"[Rasvainen_kala, Miedot_juustot, Nautiskelujuo...","Caramel-brown, full-bodied, cloudy, with a ric...","([caramel brown], [full bodied], [cloudy, rich...","[-1.6897689891047776, 0.5013146325945854, -2.0...","[-0.3954937756061554, -0.3169432431459427, -0....","[-0.1657096636481583, 0.44968753308057785, -0...."
730097,"[Voimakkaat_juustot, Riistalinnut, Nautiskeluj...","Mahogany-brown, full-bodied, cloudy, with a ri...","([mahogany brown], [full bodied], [cloudy, ric...","[-1.7472486239857972, 0.26350243110209703, -1....","[-0.27681805193424225, -0.19199614971876144, 0...","[-0.1657096636481583, 0.44968753308057785, -0...."
919855,"[Ayriaiset, Vaharasvainen_kala, Nautiskelujuoma]","Straw-yellow, medium-bodied, cloudy, mildly ho...","([straw yellow], [medium bodied], [cloudy, mil...","[-0.9882284207269549, 0.47043924778699875, -1....","[-0.757635623216629, -0.03374277602415532, -0....","[-0.09927538456395268, 0.6873573958873749, 0.1..."


In [102]:
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
# The `with` block closes the file handle once the object has been loaded.
with open('../data/country_vect_df.bin', 'rb') as country_vect_file:
    asdf = pickle.load(country_vect_file)

In [103]:
# Preview the first rows of the object unpickled from country_vect_df.bin.
asdf.head()

Unnamed: 0,country_FI,country_EN,vect
0,Argentiina,Argentina,"[-0.1663632, 0.07589441, 0.32345265, -0.123166..."
1,Australia,Australia,"[-0.38573915, 0.08130928, -0.13291593, 0.13398..."
2,Bolivia,Bolivia,"[-0.25193572, -0.09912944, 0.1734743, -0.03175..."
3,Bulgaria,Bulgaria,"[-0.3960979, 0.10610346, -0.17874336, 0.508475..."
4,Chile,Chile,"[-0.027834663, -0.043989502, 0.10476164, 0.285..."


In [107]:
# Load the Alko price list, indexed by the product number column 'Numero'.
# The `with` block closes the file handle as soon as pandas has parsed it.
with open('../data/alkon-hinnasto-tekstitiedostona.csv', 'r') as price_list_file:
    alkodf = pd.read_csv(price_list_file, sep=',', index_col='Numero')
alkodf.head()

Unnamed: 0_level_0,Nimi,Valmistaja,Pullokoko,Hinta,Litrahinta,Uutuus,Hinnastojärjestyskoodi,Tyyppi,Alatyyppi,Erityisryhmä,...,Suljentatyyppi,Alkoholi-%,Hapot g/l,Sokeri g/l,Kantavierrep-%,Väri EBC,Katkerot EBU,Energia kcal/100 ml,Valikoima,EAN
Numero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
945096,Adrianna Vineyard Fortuna Terrae Malbec 2015,Catena Zapata,"0,75 l",84.96,113.28,,110,punaviinit,Mehevä & Hilloinen,,...,luonnonkorkki,14.0,5.6,2.0,,,,80.0,tilausvalikoima,7794450005274
935635,Adrianna Vineyard River Stones Malbec 2017,Catena Zapata,"0,75 l",124.91,166.55,,110,punaviinit,Mehevä & Hilloinen,,...,luonnonkorkki,13.5,5.0,2.0,,,,80.0,tilausvalikoima,7794450005304
440567,Alamos Malbec Organic 2020,Alamos,"0,75 l",10.99,14.52,,110,punaviinit,Mehevä & Hilloinen,luomu,...,metallinen kierrekapseli,13.5,5.1,,,,,80.0,vakiovalikoima,7794450004871
424107,Alamos Tempranillo 2016,Alamos,"0,75 l",10.99,14.52,,110,punaviinit,Mehevä & Hilloinen,,...,metallinen kierrekapseli,12.5,4.6,2.0,,,,70.0,vakiovalikoima,7794450092410
939055,Altos Las Hormigas Clásico Malbec 2018,Altos Las Hormigas,"0,75 l",16.99,22.65,,110,punaviinit,Mehevä & Hilloinen,ympäristövastuullinen pakkaus,...,luonnonkorkki,13.5,5.1,3.0,,,,80.0,tilausvalikoima,7798051950032


In [108]:
# Attach selected price-list columns to each beer, matching the beer frame's
# 'Numero' index level against the index of the price list.
alko_columns = ['Kantavierrep-%', 'Väri EBC', 'Katkerot EBU', 'Litrahinta', 'Valmistusmaa']
bdf = bdf.join(alkodf[alko_columns], on='Numero')
bdf.head()

Unnamed: 0_level_0,foods,taste_desc,taste_desc_partition,taste_vect,col_vect,feel_vect,Kantavierrep-%,Väri EBC,Katkerot EBU,Litrahinta,Valmistusmaa
Numero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
718897,"[Tapas_ja_antipasti, Seurustelujuoma, Miedot_j...","Golden-yellow, acidic, ripe apricot notes, tea...","([golden yellow], [acidic], [ripe apricot, tea...","[0.07987281028181314, 0.43386150151491165, -0....","[-0.9887842833995819, -0.05810368980746716, 0....","[-0.2133956402540207, 0.22910979390144348, -0....",26.6,,,23.92,Australia
715894,"[Grilliruoka, Kana_kalkkuna, Miedot_juustot, P...","Amber-yellow, full-bodied, cloudy, with a rich...","([amber yellow], [full bodied], [cloudy, rich ...","[-1.7081294995732605, 0.27396272122859955, -1....","[-0.14614155888557434, 0.19808457244653255, 0....","[-0.1657096636481583, 0.44968753308057785, -0....",17.8,15.9,23.0,15.18,Belgia
758594,"[Rasvainen_kala, Miedot_juustot, Nautiskelujuo...","Caramel-brown, full-bodied, cloudy, with a ric...","([caramel brown], [full bodied], [cloudy, rich...","[-1.6897689891047776, 0.5013146325945854, -2.0...","[-0.3954937756061554, -0.3169432431459427, -0....","[-0.1657096636481583, 0.44968753308057785, -0....",18.5,41.8,24.0,16.18,Belgia
730097,"[Voimakkaat_juustot, Riistalinnut, Nautiskeluj...","Mahogany-brown, full-bodied, cloudy, with a ri...","([mahogany brown], [full bodied], [cloudy, ric...","[-1.7472486239857972, 0.26350243110209703, -1....","[-0.27681805193424225, -0.19199614971876144, 0...","[-0.1657096636481583, 0.44968753308057785, -0....",19.4,70.5,18.0,20.68,Belgia
919855,"[Ayriaiset, Vaharasvainen_kala, Nautiskelujuoma]","Straw-yellow, medium-bodied, cloudy, mildly ho...","([straw yellow], [medium bodied], [cloudy, mil...","[-0.9882284207269549, 0.47043924778699875, -1....","[-0.757635623216629, -0.03374277602415532, -0....","[-0.09927538456395268, 0.6873573958873749, 0.1...",17.6,,7.0,16.42,Belgia


In [118]:
# The join above adds the country column under its Alko name 'Valmistusmaa';
# rename it to 'country_FI' before it is read below. Without this step the
# cell raises KeyError on a fresh Restart & Run All. The default
# errors='ignore' makes the rename a no-op when the cell is re-run.
bdf = bdf.rename(columns={'Valmistusmaa': 'country_FI'})
# Translate the Finnish country name to its lowercase English name.
bdf['country_EN'] = bdf['country_FI'].apply(lambda x: country_dict[x].lower())

In [119]:
# Embed the English country name of every beer as a word vector.
bdf = bdf.assign(country_vect=bdf['country_EN'].apply(computeCountryVect))
bdf.head()

Unnamed: 0_level_0,foods,taste_desc,taste_desc_partition,taste_vect,col_vect,feel_vect,Kantavierrep-%,Väri EBC,Katkerot EBU,Litrahinta,country_FI,country_EN,country_vect
Numero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
718897,"[Tapas_ja_antipasti, Seurustelujuoma, Miedot_j...","Golden-yellow, acidic, ripe apricot notes, tea...","([golden yellow], [acidic], [ripe apricot, tea...","[0.07987281028181314, 0.43386150151491165, -0....","[-0.9887842833995819, -0.05810368980746716, 0....","[-0.2133956402540207, 0.22910979390144348, -0....",26.6,,,23.92,Australia,australia,"[-0.2850604, -0.03134475, -0.22727826, 0.54968..."
715894,"[Grilliruoka, Kana_kalkkuna, Miedot_juustot, P...","Amber-yellow, full-bodied, cloudy, with a rich...","([amber yellow], [full bodied], [cloudy, rich ...","[-1.7081294995732605, 0.27396272122859955, -1....","[-0.14614155888557434, 0.19808457244653255, 0....","[-0.1657096636481583, 0.44968753308057785, -0....",17.8,15.9,23.0,15.18,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."
758594,"[Rasvainen_kala, Miedot_juustot, Nautiskelujuo...","Caramel-brown, full-bodied, cloudy, with a ric...","([caramel brown], [full bodied], [cloudy, rich...","[-1.6897689891047776, 0.5013146325945854, -2.0...","[-0.3954937756061554, -0.3169432431459427, -0....","[-0.1657096636481583, 0.44968753308057785, -0....",18.5,41.8,24.0,16.18,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."
730097,"[Voimakkaat_juustot, Riistalinnut, Nautiskeluj...","Mahogany-brown, full-bodied, cloudy, with a ri...","([mahogany brown], [full bodied], [cloudy, ric...","[-1.7472486239857972, 0.26350243110209703, -1....","[-0.27681805193424225, -0.19199614971876144, 0...","[-0.1657096636481583, 0.44968753308057785, -0....",19.4,70.5,18.0,20.68,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."
919855,"[Ayriaiset, Vaharasvainen_kala, Nautiskelujuoma]","Straw-yellow, medium-bodied, cloudy, mildly ho...","([straw yellow], [medium bodied], [cloudy, mil...","[-0.9882284207269549, 0.47043924778699875, -1....","[-0.757635623216629, -0.03374277602415532, -0....","[-0.09927538456395268, 0.6873573958873749, 0.1...",17.6,,7.0,16.42,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."


In [120]:
# Display the first rows of the assembled frame again.
bdf.head()

Unnamed: 0_level_0,foods,taste_desc,taste_desc_partition,taste_vect,col_vect,feel_vect,Kantavierrep-%,Väri EBC,Katkerot EBU,Litrahinta,country_FI,country_EN,country_vect
Numero,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
718897,"[Tapas_ja_antipasti, Seurustelujuoma, Miedot_j...","Golden-yellow, acidic, ripe apricot notes, tea...","([golden yellow], [acidic], [ripe apricot, tea...","[0.07987281028181314, 0.43386150151491165, -0....","[-0.9887842833995819, -0.05810368980746716, 0....","[-0.2133956402540207, 0.22910979390144348, -0....",26.6,,,23.92,Australia,australia,"[-0.2850604, -0.03134475, -0.22727826, 0.54968..."
715894,"[Grilliruoka, Kana_kalkkuna, Miedot_juustot, P...","Amber-yellow, full-bodied, cloudy, with a rich...","([amber yellow], [full bodied], [cloudy, rich ...","[-1.7081294995732605, 0.27396272122859955, -1....","[-0.14614155888557434, 0.19808457244653255, 0....","[-0.1657096636481583, 0.44968753308057785, -0....",17.8,15.9,23.0,15.18,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."
758594,"[Rasvainen_kala, Miedot_juustot, Nautiskelujuo...","Caramel-brown, full-bodied, cloudy, with a ric...","([caramel brown], [full bodied], [cloudy, rich...","[-1.6897689891047776, 0.5013146325945854, -2.0...","[-0.3954937756061554, -0.3169432431459427, -0....","[-0.1657096636481583, 0.44968753308057785, -0....",18.5,41.8,24.0,16.18,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."
730097,"[Voimakkaat_juustot, Riistalinnut, Nautiskeluj...","Mahogany-brown, full-bodied, cloudy, with a ri...","([mahogany brown], [full bodied], [cloudy, ric...","[-1.7472486239857972, 0.26350243110209703, -1....","[-0.27681805193424225, -0.19199614971876144, 0...","[-0.1657096636481583, 0.44968753308057785, -0....",19.4,70.5,18.0,20.68,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."
919855,"[Ayriaiset, Vaharasvainen_kala, Nautiskelujuoma]","Straw-yellow, medium-bodied, cloudy, mildly ho...","([straw yellow], [medium bodied], [cloudy, mil...","[-0.9882284207269549, 0.47043924778699875, -1....","[-0.757635623216629, -0.03374277602415532, -0....","[-0.09927538456395268, 0.6873573958873749, 0.1...",17.6,,7.0,16.42,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698..."


In [121]:
# Dimensions of the assembled frame as (rows, columns).
bdf.shape

(1156, 13)

In [122]:
# Column labels of the assembled frame.
bdf.columns

Index(['foods', 'taste_desc', 'taste_desc_partition', 'taste_vect', 'col_vect',
       'feel_vect', 'Kantavierrep-%', 'Väri EBC', 'Katkerot EBU', 'Litrahinta',
       'country_FI', 'country_EN', 'country_vect'],
      dtype='object')

In [None]:
# Persist the assembled frame. The `with` block flushes and closes the file,
# so the pickle is completely written to disk when the cell finishes.
with open('../dataframe/model_df.bin', 'wb') as model_file:
    pickle.dump(bdf, model_file)