In [1]:
import pandas as pd
import numpy as np
import os

from elasticsearch import Elasticsearch
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from textblob import TextBlob
from textblob import Word
from collections import Counter
from nltk import word_tokenize
from nltk import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
os.chdir('/home/fykos/Documents/workspace/wine_recommendation_system/')

In [17]:
wine = pd.read_csv('data/raw/winemag-data-130k-v2.csv')

In [18]:
wine.drop(['Unnamed: 0', 'taster_name', 'taster_twitter_handle', 'region_2'], axis=1, inplace=True)

In [19]:
wine.drop_duplicates(inplace=True)

In [20]:
wine.dropna(inplace=True)

In [21]:
wine['id'] = range(0, len(wine))

In [22]:
wine.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,title,variety,winery,id
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,0
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,1
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2
6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16.0,Sicily & Sardinia,Vittoria,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo,3
9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27.0,Alsace,Alsace,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam,4


In [9]:
wine.shape

(64767, 11)

# Loading data to ElasticSearch

In [2]:
client = Elasticsearch('localhost')

In [14]:
mappings = {
    'mappings':{
        'wines':{
            'properties':{
                'id': {'type': 'text'},
                'country': {'type':'text'},
                'description': {'type': 'text', 'analyzer': 'english'},
                'designation': {'type': 'text'},
                'points': {'type': 'float'},
                'price': {'type': 'float'},
                'province': {'type': 'text'},
                'region_1': {'type': 'text'},
                'title': {'type': 'text', 'analyzer': 'english'},
                'variety': {'type': 'text'},
                'winery': {'type': 'text'}
            }
        }
    
}}
client.indices.create(index='findmywine', body=mappings)

{'acknowledged': True, 'index': 'findmywine', 'shards_acknowledged': True}

In [23]:
docs = wine.to_dict(orient='records')

for doc_id, doc in enumerate(docs):
    client.create(index = 'findmywine', id=doc_id, doc_type='wines', body=doc)

# Word2Vec stuff

In [33]:
char_splitter = re.compile("[.,;!:()-]")

In [47]:
def generate_candidate_phrases(text, stopwords):
    """ generate phrases using phrase boundary markers """
 
    # generate approximate phrases with punctation
    coarse_candidates = char_splitter.split(text.lower())
 
    candidate_phrases = []
 
    for coarse_phrase in coarse_candidates:
 
        words = re.split("\\s+", coarse_phrase)
        previous_stop = False
 
        # examine each word to determine if it is a phrase boundary marker or part of a phrase or lone ranger
        for w in words:
 
            if w in stopwords and not previous_stop:
                # phrase boundary encountered, so put a hard indicator
                candidate_phrases.append(";")
                previous_stop = True
            elif w not in stopwords and len(w) > 3:
                # keep adding words to list until a phrase boundary is detected
                candidate_phrases.append(w.strip())
                previous_stop = False
 
    # get a list of candidate phrases without boundary demarcation
    phrases = re.split(";+", ' '.join(candidate_phrases))
 
    return phrases

In [48]:
generate_candidate_phrases(wine['description'], stopwords.words('english'))

['',
 ' deep color matches ',
 ' bold grapy flavors ',
 ' touch ',
 ' sweetness soft texture ',
 ' syrupy balance ',
 ' chase ',
 ' spicy burrito ',
 ' kung chicken']

In [49]:
wine['description']

'The deep color matches the bold grapy flavors. With a touch of sweetness, soft texture and syrupy balance, this will chase down a spicy burrito or kung pao chicken.'

In [8]:
def normalize(review):
    review_letters = re.sub('[^a-zA-Z]', ' ', str(review))
    review_letters = review_letters.lower()
    return (" ".join(review_letters.split()))

In [9]:
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    ls = [word for word in review.split() if word not in stop_words]
    txt = " ".join(ls)
    return (txt)

In [10]:
# this list is to be given as input to word2vec because is word tokenized
wine_processed_reviews = []
for review in wine['description']:
    wine_processed_reviews.append(word_tokenize(remove_stopwords(normalize(review))))

In [19]:
from gensim.models import Word2Vec

## Word2Vec needs a large corpus to train in order to perform well
cbow_model = Word2Vec(wine_processed_reviews, min_count = 4, size = 300, window = 8, sg=1)

In [15]:
similar_pork = cbow_model.wv.most_similar('pork', topn=200)
foods_similar_to_pork = [food[0] for food in similar_pork]

In [20]:
cbow_model.wv.most_similar('vanilla', topn=10)

[('macaroon', 0.6385142207145691),
 ('copious', 0.6314226984977722),
 ('flake', 0.619184672832489),
 ('infusion', 0.6168062090873718),
 ('frosting', 0.6045944690704346),
 ('padded', 0.6019116640090942),
 ('vanillin', 0.5998919010162354),
 ('crumbs', 0.59284907579422),
 ('folding', 0.592144250869751),
 ('latte', 0.5910093784332275)]

In [29]:
from gensim.models.phrases import Phrases

test = Phrases(wine_processed_reviews)

In [31]:
test.vocab

defaultdict(int,
            {b'pineapple': 1724,
             b'rind': 461,
             b'pineapple_rind': 6,
             b'lemon': 4401,
             b'rind_lemon': 3,
             b'pith': 344,
             b'lemon_pith': 67,
             b'orange': 2956,
             b'pith_orange': 2,
             b'blossom': 764,
             b'orange_blossom': 190,
             b'start': 337,
             b'blossom_start': 1,
             b'aromas': 20924,
             b'start_aromas': 5,
             b'palate': 19756,
             b'aromas_palate': 423,
             b'bit': 3162,
             b'palate_bit': 81,
             b'opulent': 636,
             b'bit_opulent': 1,
             b'notes': 9368,
             b'opulent_notes': 7,
             b'honey': 2085,
             b'notes_honey': 33,
             b'drizzled': 38,
             b'honey_drizzled': 8,
             b'guava': 190,
             b'drizzled_guava': 1,
             b'mango': 535,
             b'guava_mango': 9,
             

# Quering Elasticsearch

In [3]:
query = {
    'query':{
        'match_all':{}
    }
}

result = client.search(index = 'findmywine', body=query)

In [4]:
result

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '40',
    '_index': 'findmywine',
    '_score': 1.0,
    '_source': {'country': 'US',
     'description': 'This is made from equal parts Cabernet Franc and Merlot, with small additions of Malbec. Juicy, it reveals a raisiny quality of ripeness around integrated oak and generous tannin.',
     'designation': 'The Cypher',
     'id': 40,
     'points': 86,
     'price': 75.0,
     'province': 'California',
     'region_1': 'Napa Valley',
     'title': 'Mulvane Wine Co. 2013 The Cypher Red (Napa Valley)',
     'variety': 'Bordeaux-style Red Blend',
     'winery': 'Mulvane Wine Co.'},
    '_type': 'wines'},
   {'_id': '41',
    '_index': 'findmywine',
    '_score': 1.0,
    '_source': {'country': 'Australia',
     'description': 'This medium-bodied Chardonnay features aromas of pineapple and roasted cashew. Similar notes of ripe pineapple and nuts mark the palate, then ease to a rather soft, ea

In [37]:
wines = []
for hit in result['hits']['hits']:
    wine = hit['_source']
    wine['id'] = hit['_id']
    wines.append(wine)

In [7]:
query = {
    'query':{
            'match':{
                'region_1': 'Sonoma Valley'
            }
    }
}

result = client.search(index = 'findmywine', body=query)

In [8]:
result

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
 'hits': {'hits': [{'_id': '101',
    '_index': 'findmywine',
    '_score': 4.8111386,
    '_source': {'country': 'US',
     'description': "Black as a moonless night, dense in structure, tannic, bone dry and super-peppery, this Syrah isn't offering much relief right now. In all likelihood it will emerge from its cocoon in a few years to offer a wealth of ripe blackberry, currant and carob fruit.",
     'designation': 'Jack London Vineyard',
     'id': 101,
     'points': 88,
     'price': 25.0,
     'province': 'California',
     'region_1': 'Sonoma Valley',
     'title': 'Kenwood 2005 Jack London Vineyard Syrah (Sonoma Valley)',
     'variety': 'Syrah',
     'winery': 'Kenwood'},
    '_type': 'wines'},
   {'_id': '738',
    '_index': 'findmywine',
    '_score': 4.8111386,
    '_source': {'country': 'US',
     'description': "There's a minty, ammonia streak of greenness that can charitably be described as gooseberry

In [21]:
wines = []
i = 0
for hit in result['hits']['hits']:
    if i < 30:
        wine = hit['_source']
        print (wine['description'], ' ---> ', [word for word in foods_similar_to_pork if word in wine['description']])
        print()
        i+=1
    else:
        break

Unique aromas of chamomile, dried orange peel, hay and smoked chicken show on the nose of this bottling. The palate is also a wash of intrigue, with smoked lemon peels, roasting chicken, oregano and other dried herbs.  --->  ['roast', 'chicken', 'ham', 'roasting', 'smoked', 'tri']

The grapes come from the Chicken Dinner vineyard, which says it all. Pop this tart, tangy wine, with its Zin-like red berry flavors and hints of white pepper, and bring on the fried chicken. —P.G.  --->  ['chicken', 'fried']

A rugged, simple wine, with raspberry, cherry and spice flavors. Drink it with everyday fare, like burgers or chicken tacos.  --->  ['chicken', 'burgers', 'tacos', 'fare']

A Chardonnay with noticeable barrel influence, this is creamy and thick, with a baked apple flavor. Pair it with chicken in cream sauce or lobster and butter.  --->  ['chicken', 'sauce', 'lobster']

This fresh and easy Sicilian white is a blend of Insolia, Grecanico and Catarratto with easy citrus and stone fruit aro

In [17]:
query = {
    'query':{
            'terms':{
                'description': foods_similar_to_pork
            }
    }
}

result = client.search(index = 'findmywine', body=query)

In [18]:
wines = []
i = 0
for hit in result['hits']['hits']:
    if i < 30:
        wine = hit['_source']
        print (wine['description'], ' ---> ', [word for word in foods_similar_to_pork if word in wine['description']])
        print()
        i+=1
    else:
        break

Black plum, cola and chocolate define this textbook Malbec. The wine has body, heft and lots of fruit, but also some complexity and character. Flavors of black cherry, plum and cassis are layered and fairly boisterous, while the mouthfeel is firm and just slightly tannic. Just right for red meats and pastas.  --->  ['meats', 'pasta', 'pastas', 'jus', 'meat']

This blend of Incrocio Manzoni and Vermentino opens with bold floral intensity, plus solid background tones of peach and lemon mousse. It offers a creamy, soft mouthfeel that could stand up to white meat or shellfish.  --->  ['fish', 'shellfish', 'meat']

Great now with lamb, this Pinot is softly complex and forward in fruit, with a silky texture. It shows cherry, raspberry, mocha, anise and smoky oak flavors. Best now and for a few more years.  --->  ['lamb']

More tannic, drier and more acidic than most fine coastal Syrahs, this one from the Sierra Foothills is marked by its firm structure.The black currant, licorice, beef jerky