# Tag generation
In this notebook I will try to extract keywords or tags that describe well each wine review. Firstly, I will try a TFIDF approach with a noun pos tagging filtering and then I will search for other methods that might work.

In [2]:
import pandas as pd
import numpy as np
import spacy
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nlp = spacy.load('en')

In [3]:
os.chdir('/home/fykos/Documents/workspace/wine_recommendation_system/')

In [3]:
wines = pd.read_csv('data/raw/winemag-data-130k-v2.csv')

In [4]:
wines.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


# Normalizing and preprocessing the wine reviews

In [4]:
def normalize(review):
    review_letters = re.sub('[^a-zA-Z]', ' ', str(review))
    review_letters = review_letters.lower()
    return (" ".join(review_letters.split()))

In [5]:
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    ls = [word for word in review.split() if word not in stop_words]
    txt = " ".join(ls)
    return (txt)

In [6]:
def lemmatizing(review):
    words = TextBlob(review).words.singularize()
    return (" ".join(words))

In [7]:
def noun_extractor(review):
    doc = nlp(review)
    return " ".join(token.text for token in doc if token.tag_ == 'NN')

In [10]:
wines['noun_words'] = wines['description'].map(noun_extractor)

In [22]:
wines.head(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,noun_words
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,fruit broom brimstone herb palate apple citrus...
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,fruity wine Firm berry acidity
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,lime flesh rind dominate pineapple acidity win...
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,rind lemon pith blossom palate bit honey guava...
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,bottling country wine companion winter stew
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,raspberry whiff case mouth tomatoey acidity pl...
6,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,,Kerin O’Keefe,@kerinokeefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,red berry pepper savory herb palate acidity
7,7,France,This dry and restrained wine offers spice in p...,,87,24.0,Alsace,Alsace,,Roger Voss,@vossroger,Trimbach 2012 Gewurztraminer (Alsace),Gewürztraminer,Trimbach,wine spice profusion acidity texture food
8,8,Germany,Savory dried thyme notes accent sunnier flavor...,Shine,87,12.0,Rheinhessen,,,Anna Lee C. Iijima,,Heinz Eifel 2013 Shine Gewürztraminer (Rheinhe...,Gewürztraminer,Heinz Eifel,Savory thyme sunnier peach wine fruity footprint
9,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,depth flavor apple pear touch spice acidity te...


In [23]:
wines.to_csv('data/modified/wines_with_nouns.csv')

# Finding important words with TFIDF

In [4]:
wines = pd.read_csv('data/modified/wines_with_nouns.csv')

In [5]:
wines.dropna(axis=0, inplace=True)
wines.reset_index(inplace=True)

In [6]:
stops = ['wine', 'accessibility', 'picture', 'value', 'end', 'structure']
tfidf_vectorizer = TfidfVectorizer(stop_words=stops)
tfidf_matrix = tfidf_vectorizer.fit_transform(wines['noun_words'])

In [7]:
# features holds a list of all the words in the tfidf's vocabulary in the same order as the column in the matrix
features = tfidf_vectorizer.get_feature_names()
weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term':features, 'weights':weights})
weights_df = weights_df.sort_values(by='weights', ascending=False)

In [8]:
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [13]:
def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    ''' Top tfidf features in specific document (matrix row) '''
    row = np.squeeze(Xtr[row_id].toarray())
    return top_tfidf_feats(row, features, top_n)

In [14]:
top_feats_in_doc(tfidf_matrix, features, 0)

Unnamed: 0,feature,tfidf
0,winter,0.534875
1,stew,0.519907
2,country,0.464839
3,companion,0.420021
4,bottling,0.226081
5,filtering,0.0
6,film,0.0
7,filling,0.0
8,fill,0.0
9,filet,0.0


In [46]:
# call the foods database 
food_db = pd.read_csv('data/raw/8b. AUSNUT 2011-13 AHS Food Nutrient Database.csv')
# process foods
test = set(word.strip().lower() for ls in list(map(lambda x:x.split(',') ,food_db['Food Name'].tolist())) for word in ls)

In [49]:
wines['description'][0]

"Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew."

In [47]:
# pickout the foods from the wine list
terms = weights_df[weights_df['weights'] > 0.001]
foods = []
for term in terms['term']:
    if term in test:
        print(term)
        foods.append(term)

fruit
cherry
spice
vanilla
plum
pepper
apple
chocolate
blackberry
raspberry
lemon
pear
berry
coffee
strawberry
citrus
peach
licorice
leather
cinnamon
lime
light
cranberry
currant
tea
blueberry
grape
meat
grapefruit
caramel
melon
olive
orange
sugar
pomegranate
pineapple
mocha
mint
juice
honey
pie
butter
thyme
jam
sage
beef
firm
apricot
red
skin
coconut
milk
fennel
flower
cream
fig
syrup
watermelon
table
almond
rock
dill
cardamom
salt
jasmine
white
tongue
mushroom
sea
bacon
bean
sauce
ginger
butterscotch
flesh
soy
tart
heart
tomato
bread
banana
guava
oil
pink
pine
steak
rose
salmon
root
mulberry
custard
gold
liqueur
cracker
pork
truffle
mango
yeast
drink
cake
roast
energy
beer
fat
raisin
maple
water
nectarine
pastry
gingerbread
powder
cheese
lychee
marshmallow
seafood
lamb
hazelnut
chicken
extract
creamy
garden
marzipan
chamomile
ice
