# Extracting wine flavours
In this notebook we are extracting each individual flavour from the wine descriptions and we are implementing a co-occurence matrix to figure out which flavours go together.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import nltk
import re
import pickle
import spacy
import en_core_web_sm

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import Counter
from nltk import word_tokenize
from nltk import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

nlp = en_core_web_sm.load()
%matplotlib inline

In [2]:
os.chdir('/home/fykos/Documents/workspace/wine_recommendation_system//')

In [3]:
wine = pd.read_csv('data/raw/winemag-data-130k-v2.csv')

# Extracting the flavours from the descriptions
Here I will use spacy to extract all the nouns and noun phrases which is the type of the fruit flavours and I will filter this list better with another list I have from external sources.

In [4]:
def normalize(review):
    review_letters = re.sub('[^a-zA-Z]', ' ', str(review))
    review_letters = review_letters.lower()
    return (" ".join(review_letters.split()))

In [5]:
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    ls = [word for word in review.split() if word not in stop_words]
    txt = " ".join(ls)
    return (txt)

In [6]:
def noun_finder(review):
    blob = nlp(normalize(review))
    return (" ".join([token.text for token in blob if token.tag_ == 'NN']))

In [9]:
noun_list = wine['description'].map(noun_finder).values

In [None]:
stops = ['wine', 'pasta', 'whole', 'character', 'cabernet', 'wood', 'spicy', 'tannins', 'crisp', 'juicy', 'fruits', 'blend', 'sauvignon', 'structure', 'fruity', 'aromas', 'flavors', 'ripe', 'syrup', 'cake', 'cheese', 'cream', 'bean', 'hard', 'milk', 'sauce', 'barbecue', 'steak', 'rock', 'powder', 'ruby', 'oil', 'salt', 'pastry', 'flesh', 'bitter', 'sugar', 'leather', 'herbal', 'creamy', 'table', 'brown', 'golden', 'gold', 'extract', 'broad', 'natural', 'salmon', 'tongue', 'dry', 'pure', 'root', 'sea', 'port', 'chewy', 'solid', 'blue', 'pink', 'ground', 'beef', 'purple', 'spring', 'lean', 'raw', 'red', 'black', 'white', 'yellow', 'mature', 'tropical', 'meat', 'wild', 'new', 'juice', 'firm', 'sweet', 'fresh', 'light', 'flower', 'green', 'soft', 'skin', 'spice', 'dark', 'herb', 'palate', 'valley', 'finish', 'drink', 'flavor', 'fruit', 'aroma', 'note', 'texture', 'thi', 'acidity']
tfidf_vectorizer = TfidfVectorizer(stop_words=stops)
tfidf_matrix = tfidf_vectorizer.fit_transform(noun_list)

In [None]:
# features holds a list of all the words in the tfidf's vocabulary in the same order as the column in the matrix
features = tfidf_vectorizer.get_feature_names()
weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term':features, 'weights':weights})
weights_df = weights_df.sort_values(by='weights', ascending=False)

In [28]:
# call the foods database 
food_db = pd.read_csv('data/raw/8b. AUSNUT 2011-13 AHS Food Nutrient Database.csv')

In [29]:
# process foods
test = set(word.strip().lower() for ls in list(map(lambda x:x.split(',') ,food_db['Food Name'].tolist())) for word in ls)

In [None]:
# pickout the foods from the wine list
terms = weights_df[weights_df['weights'] > 0.001]
foods = []
for term in terms['term']:
    if term in test:
        foods.append(term)

In [22]:
print(foods)

['cherry', 'berry', 'plum', 'apple', 'blackberry', 'vanilla', 'pepper', 'citrus', 'lemon', 'raspberry', 'peach', 'pear', 'chocolate', 'currant', 'licorice', 'lime', 'coffee', 'melon', 'grapefruit', 'honey', 'apricot', 'pineapple', 'strawberry', 'cinnamon', 'almond', 'mocha', 'mint', 'orange', 'jam', 'grape', 'blueberry', 'tea', 'sage', 'pie', 'caramel', 'cranberry', 'raisin', 'olive', 'tomato', 'coconut', 'butter', 'bacon', 'fig', 'mango', 'banana', 'thyme', 'prune', 'mushroom', 'pomegranate', 'butterscotch', 'ginger', 'lychee', 'mousse', 'bread', 'nut', 'truffle', 'yeast', 'jasmine', 'nectarine', 'hazelnut', 'fennel', 'liqueur', 'tart', 'herbs', 'quince', 'dill', 'watermelon', 'heart', 'lamb', 'pork', 'chicken', 'guava', 'seafood', 'maple', 'custard', 'energy', 'soy', 'beer', 'cooking', 'coating']


# Co-occurence matrix

In [13]:
wine_flavors = ['cherry', 'berry', 'plum', 'apple', 'blackberry', 'vanilla', 'pepper', 'citrus', 'lemon', 'raspberry', 'peach', 'pear', 'chocolate', 'currant', 'licorice', 'lime', 'coffee', 'melon', 'grapefruit', 'honey', 'apricot', 'pineapple', 'strawberry', 'cinnamon', 'almond', 'mocha', 'mint', 'orange', 'jam', 'grape', 'blueberry', 'tea', 'sage', 'pie', 'caramel', 'cranberry', 'raisin', 'olive', 'tomato', 'coconut', 'butter', 'bacon', 'fig', 'mango', 'banana', 'thyme', 'prune', 'mushroom', 'pomegranate', 'butterscotch', 'ginger', 'lychee', 'mousse', 'bread', 'nut', 'truffle', 'yeast', 'jasmine', 'nectarine', 'hazelnut', 'fennel', 'liqueur', 'tart', 'herbs', 'quince', 'dill', 'watermelon', 'heart', 'lamb', 'pork', 'chicken', 'guava', 'seafood', 'maple', 'custard', 'energy', 'soy', 'beer', 'cooking', 'coating']

In [19]:
wine_test = pd.read_csv('data/modified/wines_with_nouns.csv')

In [17]:
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [37]:
def noun_preprocess(review):
    tokenized_review = tokenize(review)
    tokens = [token for token in tokenized_review if token in test]
    return tokens

In [18]:
import collections
from nltk.corpus import stopwords
import string
 
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via']

In [23]:
from collections import defaultdict
# remember to include the other import from the previous post
 
com = defaultdict(lambda : defaultdict(int))

for review in wine_test['noun_words']:
    terms_only = [term for term in preprocess(str(review)) if term not in stop]
    
    # Build co-occurrence matrix
    for i in range(len(terms_only)-1):            
        for j in range(i+1, len(terms_only)):
            w1, w2 = sorted([terms_only[i], terms_only[j]])                
            if w1 != w2:
                com[w1][w2] += 1

In [24]:
import operator
com_max = []
# For each term, look for the most common co-occurrent terms
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
# Get the most frequent co-occurrences
terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)
print(terms_max[:100])

[(('fruit', 'wine'), 31321), (('acidity', 'wine'), 25873), (('fruit', 'palate'), 13636), (('cherry', 'wine'), 13475), (('finish', 'palate'), 12968), (('finish', 'fruit'), 12773), (('finish', 'wine'), 12636), (('palate', 'wine'), 12347), (('acidity', 'fruit'), 11951), (('nose', 'palate'), 11692), (('cherry', 'palate'), 11460), (('texture', 'wine'), 11027), (('spice', 'wine'), 10787), (('cherry', 'fruit'), 9600), (('acidity', 'palate'), 9064), (('character', 'wine'), 8692), (('oak', 'wine'), 8604), (('fruity', 'wine'), 8448), (('fruit', 'spice'), 8173), (('berry', 'wine'), 7800), (('cherry', 'finish'), 7694), (('fruit', 'oak'), 7391), (('wine', 'wood'), 7301), (('structure', 'wine'), 7239), (('berry', 'palate'), 7051), (('palate', 'spice'), 6917), (('acidity', 'finish'), 6875), (('cherry', 'spice'), 6663), (('palate', 'plum'), 6565), (('apple', 'wine'), 6531), (('plum', 'wine'), 6435), (('blend', 'wine'), 6306), (('fruit', 'nose'), 6271), (('berry', 'finish'), 5994), (('acidity', 'cherry

In [26]:
search_word = 'vanilla'
count_search = collections.Counter()
for review in wine['description']:
    terms_only = [term for term in preprocess(review) 
                  if term not in stop]
    if search_word in terms_only:
        count_search.update(terms_only)
print("Co-occurrence for %s:" % search_word)
print(count_search.most_common(100))

Co-occurrence for vanilla:
[('vanilla', 10454), ('flavors', 5658), ('wine', 4473), ('The', 4310), ('aromas', 3603), ('palate', 3576), ('fruit', 3287), ('finish', 3225), ('This', 2972), ('oak', 2862), ('cherry', 2829), ('tannins', 2213), ('black', 2131), ('acidity', 2076), ('spice', 2005), ('notes', 1901), ('ripe', 1737), ('A', 1659), ("It's", 1583), ('plum', 1508), ('nose', 1507), ('Drink', 1476), ('sweet', 1435), ('rich', 1333), ('It', 1238), ('red', 1225), ('berry', 1189), ('offers', 1175), ('toast', 1115), ('blend', 1107), ('blackberry', 1097), ('raspberry', 931), ('Cabernet', 929), ('soft', 904), ('apple', 897), ('chocolate', 878), ('shows', 872), ('Chardonnay', 868), ('creamy', 831), ('texture', 797), ('white', 774), ('crisp', 760), ('dark', 735), ('fresh', 711), ('Sauvignon', 705), ('good', 690), ('dry', 683), ('touch', 679), ('dried', 655), ('coffee', 652), ('full', 649), ('pepper', 644), ('lead', 641), ('lemon', 628), ('fruits', 618), ('peach', 615), ('alongside', 607), ('well'

In [53]:
stop_list = ['wine', 'fruit', 'light', 'bean', 'red', 'firm', 'extract', 
             'skin', 'sauce', 'white', 'drink', 'gold', 'tongue', 'spring']
flavor_cooccurences = {}
for flavor in wine_flavors:
    count_search = collections.Counter()
    for review in wine_test['noun_words']:
        terms_only = [term for term in noun_preprocess(str(review)) if term not in stop_list]
        if flavor in terms_only:
            terms_only.remove(flavor)
            count_search.update(terms_only)
#     print("Co-occurrence for %s:" % search_word)
#     print(count_search.most_common(20))
    flavor_cooccurences[flavor] = [item[0] for item in count_search.most_common(20)]

In [69]:
flavor_cooccurences

{'almond': ['apple',
  'peach',
  'citrus',
  'pear',
  'cherry',
  'flower',
  'lemon',
  'spice',
  'honey',
  'apricot',
  'berry',
  'vanilla',
  'melon',
  'paste',
  'nut',
  'grapefruit',
  'lime',
  'raspberry',
  'butter',
  'pineapple'],
 'apple': ['pear',
  'lemon',
  'citrus',
  'peach',
  'lime',
  'apple',
  'melon',
  'spice',
  'vanilla',
  'grapefruit',
  'almond',
  'honey',
  'flower',
  'pineapple',
  'apricot',
  'orange',
  'mousse',
  'juice',
  'butter',
  'bread'],
 'apricot': ['peach',
  'honey',
  'lemon',
  'citrus',
  'pear',
  'apple',
  'vanilla',
  'spice',
  'pineapple',
  'orange',
  'almond',
  'lime',
  'melon',
  'grapefruit',
  'flower',
  'jam',
  'mango',
  'sugar',
  'butterscotch',
  'grape'],
 'bacon': ['cherry',
  'blackberry',
  'pepper',
  'spice',
  'berry',
  'plum',
  'chocolate',
  'licorice',
  'fat',
  'currant',
  'meat',
  'raspberry',
  'vanilla',
  'jam',
  'blueberry',
  'leather',
  'mocha',
  'beef',
  'coffee',
  'cinnamon'],


In [55]:
def save_obj(obj, name ):
    with open('data/modified/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

In [56]:
save_obj(flavor_cooccurences, 'flavor_combinations')

# Creating the elasticsearch mapping

In [92]:
print(flavor_cooccurences)

{'cherry': ['spice', 'berry', 'plum', 'raspberry', 'pepper', 'blackberry', 'vanilla', 'licorice', 'chocolate', 'leather', 'cherry', 'cinnamon', 'currant', 'coffee', 'mocha', 'mint', 'flower', 'strawberry', 'pie', 'sage'], 'berry': ['cherry', 'spice', 'plum', 'pepper', 'raspberry', 'berry', 'leather', 'licorice', 'chocolate', 'vanilla', 'blackberry', 'coffee', 'currant', 'mint', 'cinnamon', 'flower', 'mocha', 'tomato', 'olive', 'sage'], 'plum': ['cherry', 'berry', 'spice', 'blackberry', 'vanilla', 'pepper', 'raspberry', 'chocolate', 'licorice', 'plum', 'leather', 'currant', 'coffee', 'mocha', 'cinnamon', 'tomato', 'blueberry', 'mint', 'olive', 'meat'], 'apple': ['pear', 'lemon', 'citrus', 'peach', 'lime', 'apple', 'melon', 'spice', 'vanilla', 'grapefruit', 'almond', 'honey', 'flower', 'pineapple', 'apricot', 'orange', 'mousse', 'juice', 'butter', 'bread'], 'blackberry': ['cherry', 'spice', 'plum', 'chocolate', 'pepper', 'licorice', 'currant', 'berry', 'vanilla', 'jam', 'leather', 'coffe

In [60]:
from elasticsearch import Elasticsearch
client = Elasticsearch('localhost')

In [138]:
mappings = {
    'mappings':{
        'flavours':{
            'properties':{
                'flavor': {'type': 'text', 'analyzer': 'english'},
                'flavor_combinations': {'type': 'text'}
            }
        }
    
}}
client.indices.create(index='findmyflavors', body=mappings)

{'acknowledged': True, 'index': 'findmyflavors', 'shards_acknowledged': True}

In [139]:
for doc_id, flavor in enumerate(flavor_cooccurences):
    client.create(index = 'findmyflavors', id=doc_id, doc_type='flavours', 
                  body={'flavor': flavor, 'flavor_combinations': flavor_cooccurences[flavor]})

In [144]:
query = {
            "query": {
                "match": {
                    "flavor": 'coffee'
                }
            }
        }

result = client.search(index = 'findmyflavors', body=query)

In [145]:
result

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '16',
    '_index': 'findmyflavors',
    '_score': 2.3025851,
    '_source': {'flavor': 'coffee',
     'flavor_combinations': ['cherry',
      'plum',
      'chocolate',
      'berry',
      'spice',
      'vanilla',
      'blackberry',
      'licorice',
      'pepper',
      'mocha',
      'leather',
      'raspberry',
      'currant',
      'sage',
      'coconut',
      'blueberry',
      'olive',
      'meat',
      'coffee',
      'mint']},
    '_type': 'flavours'}],
  'max_score': 2.3025851,
  'total': 1},
 'timed_out': False,
 'took': 3}