In [1]:
import pandas as pd
import pickle
import numpy as np
import fasttext
import fasttext.util
import json
from scipy import spatial
from nltk.corpus import stopwords
import nltk
# NLTK's English stop words, extended with a few extra words
# ('note(s)', 'hint(s)') that this notebook also treats as stop words.
stops = set(stopwords.words('english'))
stops.update({'notes', 'note', 'hint', 'hints'})

# Load the food lookup inside a context manager so the file handle is closed
# deterministically instead of being left for the garbage collector.
with open('../data/food_dict.json', 'r') as food_file:
    food_dict = json.load(food_file)

In [2]:
# Load the pickled model frame; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../dataframe/model_df.bin', 'rb') as model_file:
    df = pickle.load(model_file)

In [3]:
# Read the price-list CSV (presumably Alko's price list, judging by the file
# name). The file is still opened with `open` so the text encoding stays the
# same as before, but the handle is now closed once parsing completes.
with open('../data/alkon-hinnasto-tekstitiedostona.csv', 'r') as price_list_file:
    alko_csv = pd.read_csv(price_list_file)

In [4]:
# Attach the 'EAN' column from the price list by joining on the shared
# 'Numero' key (pandas' default inner join, so unmatched rows are dropped).
# NOTE(review): re-running this cell without reloading `df` merges 'EAN' a
# second time and produces suffixed duplicate columns.
df = pd.merge(df, alko_csv[['Numero', 'EAN']], on='Numero')

In [5]:
# Persist the EAN-augmented frame. Closing the handle via `with` guarantees
# the data is flushed to disk before the file is read back.
with open('../dataframe/model_df_v2.bin', 'wb') as out_file:
    pickle.dump(df, out_file)

In [6]:
# Reload the v2 frame from disk; the context manager closes the file handle
# as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../dataframe/model_df_v2.bin', 'rb') as v2_file:
    bin_df = pickle.load(v2_file)

In [7]:
# Load the fastText model from the local wiki.en binary. `wmd` is the model
# that compute_food_vectors queries for word vectors.
wmd = fasttext.load_model('../glove/wiki.en/wiki.en.bin')



In [8]:
def translate_foods(foods: list, mapping=None):
    """Translate food keys through a lookup table.

    Args:
        foods: Iterable of food keys to look up.
        mapping: Optional lookup whose values are indexable sequences; the
            first element of each value is taken as the translation.
            Defaults to the notebook-level ``food_dict``.

    Returns:
        A list with one translated entry per input key, in input order.

    Raises:
        KeyError: If a key is not present in the lookup.
    """
    if mapping is None:
        # Resolved at call time so the default always follows the current
        # notebook-level food_dict.
        mapping = food_dict
    return [mapping[food][0] for food in foods]

In [9]:
# Replace every row's list of food keys with its translated counterpart.
# NOTE(review): this overwrites 'foods' in place, so re-running the cell feeds
# already-translated values back into translate_foods.
bin_df['foods'] = bin_df['foods'].apply(translate_foods)


In [10]:
# Show the tasting description of the first row whose Numero is 919855.
bin_df.loc[bin_df['Numero'] == 919855, 'taste_desc'].values[0]

'Straw-yellow, medium-bodied, cloudy, mildly hopped, aromatic, sour apple, citrus notes \n'

In [23]:
def compute_food_vectors(food_tags, model=None):
    """Sum the word vectors of every word found in a list of food tags.

    Args:
        food_tags: Iterable of tag strings, e.g. ``['mild cheese', 'seafood']``.
        model: Optional object exposing ``get_word_vector(word)`` and
            ``get_dimension()``. Defaults to the notebook-level ``wmd`` model.

    Returns:
        A 1-D float array holding the element-wise sum of the vectors of all
        words; all zeros when ``food_tags`` contains no words.
    """
    if model is None:
        # Resolved at call time so the default follows the loaded model.
        model = wmd

    words = []
    for tag in food_tags:
        # Treat hyphens and underscores as word separators. The previous
        # `replace(r'-_', ' ')` only matched the literal two-character
        # sequence "-_", so neither character was ever replaced on its own.
        cleaned = tag.replace('-', ' ').replace('_', ' ')
        # split() with no argument trims the ends and collapses repeated
        # whitespace, so no empty-string tokens reach the model.
        words.extend(cleaned.split())

    # Size the accumulator from the model instead of hard-coding 300.
    vect = np.zeros(model.get_dimension())
    for word in words:
        vect = np.add(vect, model.get_word_vector(word))
    return vect

In [25]:
# Store one summed word vector per row, computed from that row's 'foods' list.
bin_df['foods_vect'] = bin_df['foods'].apply(compute_food_vectors)

In [26]:
# Inspect the column names to confirm 'foods_vect' was added.
bin_df.columns

Index(['Numero', 'foods', 'taste_desc', 'taste_desc_partition', 'taste_vect',
       'col_vect', 'feel_vect', 'Kantavierrep-%', 'Väri EBC', 'Katkerot EBU',
       'Litrahinta', 'country_FI', 'country_EN', 'country_vect', 'EAN',
       'foods_vect'],
      dtype='object')

In [27]:
# Read the price-list CSV again (same file as loaded into `alko_csv` above)
# to pick up the product names. Opening via `with` closes the handle once
# parsing completes.
with open('../data/alkon-hinnasto-tekstitiedostona.csv', 'r') as price_list_file:
    alkodf = pd.read_csv(price_list_file, sep=',')
alkodf.columns

Index(['Numero', 'Nimi', 'Valmistaja', 'Pullokoko', 'Hinta', 'Litrahinta',
       'Uutuus', 'Hinnastojärjestyskoodi', 'Tyyppi', 'Alatyyppi',
       'Erityisryhmä', 'Oluttyyppi', 'Valmistusmaa', 'Alue', 'Vuosikerta',
       'Etikettimerkintöjä', 'Huomautus', 'Rypäleet', 'Luonnehdinta',
       'Pakkaustyyppi', 'Suljentatyyppi', 'Alkoholi-%', 'Hapot g/l',
       'Sokeri g/l', 'Kantavierrep-%', 'Väri EBC', 'Katkerot EBU',
       'Energia kcal/100 ml', 'Valikoima', 'EAN'],
      dtype='object')

In [28]:
# Attach the product name ('Nimi') by joining on the shared 'Numero' key
# (pandas' default inner join, so unmatched rows are dropped).
# NOTE(review): re-running this cell without reloading `bin_df` merges 'Nimi'
# a second time and produces suffixed duplicate columns.
bin_df = pd.merge(bin_df, alkodf[['Numero', 'Nimi']], on='Numero')

In [30]:
# Display the merged frame (the notebook renders a truncated preview).
bin_df

Unnamed: 0,Numero,foods,taste_desc,taste_desc_partition,taste_vect,col_vect,feel_vect,Kantavierrep-%,Väri EBC,Katkerot EBU,Litrahinta,country_FI,country_EN,country_vect,EAN,foods_vect,Nimi
0,718897,"[tapas and antipasti, party wine, mild cheese,...","Golden-yellow, acidic, ripe apricot notes, tea...","([golden yellow], [acidic], [ripe apricot, tea...","[0.07987281028181314, 0.43386150151491165, -0....","[-0.9887842833995819, -0.05810368980746716, 0....","[-0.2133956402540207, 0.22910979390144348, -0....",26.6,,,23.92,Australia,australia,"[-0.2850604, -0.03134475, -0.22727826, 0.54968...",9317705000107,"[0.664138131774962, 0.3014734573662281, -1.479...",Maxwell Honey Mead
1,715894,"[grilled food, chicken, turkey, mild cheese, p...","Amber-yellow, full-bodied, cloudy, with a rich...","([amber yellow], [full bodied], [cloudy, rich ...","[-1.7081294995732605, 0.27396272122859955, -1....","[-0.14614155888557434, 0.19808457244653255, 0....","[-0.1657096636481583, 0.44968753308057785, -0....",17.8,15.9,23.0,15.18,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698...",5425007658828,"[-0.9217289164662361, 0.48395066149532795, -0....",Achel Blond
2,758594,"[fatty fish, mild cheese, meditation beverage,...","Caramel-brown, full-bodied, cloudy, with a ric...","([caramel brown], [full bodied], [cloudy, rich...","[-1.6897689891047776, 0.5013146325945854, -2.0...","[-0.3954937756061554, -0.3169432431459427, -0....","[-0.1657096636481583, 0.44968753308057785, -0....",18.5,41.8,24.0,16.18,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698...",5425007658880,"[0.5714964913204312, 0.6492249220609665, -1.59...",Achel Bruin
3,730097,"[strong cheeses, game birds, meditation bevera...","Mahogany-brown, full-bodied, cloudy, with a ri...","([mahogany brown], [full bodied], [cloudy, ric...","[-1.7472486239857972, 0.26350243110209703, -1....","[-0.27681805193424225, -0.19199614971876144, 0...","[-0.1657096636481583, 0.44968753308057785, -0....",19.4,70.5,18.0,20.68,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698...",5425007658859,"[-0.7808311134576797, 0.035434480756521225, -1...",Achel Extra Bruin
4,919855,"[seafood, lean fish, meditation beverage]","Straw-yellow, medium-bodied, cloudy, mildly ho...","([straw yellow], [medium bodied], [cloudy, mil...","[-0.9882284207269549, 0.47043924778699875, -1....","[-0.757635623216629, -0.03374277602415532, -0....","[-0.09927538456395268, 0.6873573958873749, 0.1...",17.6,,7.0,16.42,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698...",5425031890355,"[-0.41338769625872374, 0.730044960975647, -0.9...",Alvinne Phi Blond Sour Ale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151,911916,"[party wine, meditation beverage]","Mahogany-brown, full-bodied, strongly hopped, ...","([mahogany brown], [full bodied], [strongly ho...","[-1.485752671957016, -0.10107788443565369, -1....","[-0.27681805193424225, -0.19199614971876144, 0...","[-0.1657096636481583, 0.44968753308057785, -0....",26.5,84.5,55.0,45.26,Yhdysvallat,united states,"[0.045622084, -0.35862827, 0.1916659, 0.238043...",636251830341,"[0.11925933137536049, -0.3152843862771988, -0....",Stone Southern Charred 2015
1152,935551,"[game, meditation beverage, chocolate deserts]","Black, extra full-bodied, medium hopped, espre...","([black], [extra full bodied], [medium hopped,...","[-1.7364337434992194, 0.47930875420570374, -1....","[-0.3380114436149597, -0.0010778139112517238, ...","[-0.2778867376036942, 0.6079885140061378, -0.5...",26.0,,49.0,40.86,Yhdysvallat,united states,"[0.045622084, -0.35862827, 0.1916659, 0.238043...",856467003074,"[-0.5057903826236725, -0.6452207937836647, -1....",Westbrook Mexican Cake Imperial Stout
1153,906362,"[berries and fruits, mild cheese, meditation b...","Yellowy brown, light-bodied, mildly hopped, ci...","([yellowy brown], [light bodied], [mildly hopp...","[-0.2375907376408577, -0.07339736819267273, -1...","[-0.6801424920558929, -0.04885224997997284, -0...","[-0.2531137359328568, 0.38831834215670824, 0.2...",13.1,,22.0,15.27,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698...",5425018070893,"[-0.31398187205195427, -0.21874704211950302, -...",Oud Beersel Jasmine Flower Infused Lambic hana...
1154,951272,"[berries and fruits, party wine, mild cheese, ...","Yellowy brown, light-bodied, mildly hopped, fr...","([yellowy brown], [light bodied], [mildly hopp...","[-0.2375907376408577, -0.07339736819267273, -1...","[-0.6801424920558929, -0.04885224997997284, -0...","[-0.2531137359328568, 0.38831834215670824, 0.2...",12.9,,17.0,15.27,Belgia,belgium,"[-0.19977358, -0.12839371, -0.2806692, 0.11698...",5425018070626,"[-0.10079468041658401, -0.7185215502977371, -0...",Oud Beersel Rozenlambiek hanapakkaus


In [31]:
# Persist the final frame, overwriting the model_df_v2.bin written earlier in
# this notebook. The context manager guarantees the data is flushed and the
# handle closed.
with open('../dataframe/model_df_v2.bin', 'wb') as out_file:
    pickle.dump(bin_df, out_file)