### This notebook is to build an algorithm for recommending similar wines based on online review and inherent wine knowledge

In [1]:
import pandas as pd
import numpy as np
import string
from collections import Counter
import scipy as sp
from pandas import ExcelWriter
from pandas import ExcelFile
import wikipedia as wk
import re
import random

In [2]:
#NLP word learning packages
from glove import Corpus, Glove
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zijun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zijun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
import pickle
def save_obj(obj, name ):
    with open('./'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('./' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [4]:
#load the models
#information from the wine folly nots with "Name,Places,type,traits(body,sweetness, etc),grapes,flavors + wiki on all the flavors"
glove = load_obj("pkl_data/glove_note_wiki")
corpus = load_obj("pkl_data/corpus_note_wiki")

In [4]:
path = "./wine-reviews"
filename = "winemag-data-130k-v2.csv"
df= pd.read_csv(path+"/"+filename)

In [6]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [7]:
df_new = df[['title','variety','country','description']]

In [8]:
len(df_new)

129971

In [9]:
#remove duplicates
df_new = df_new.drop_duplicates()

In [10]:
len(df_new)

119988

In [11]:
df_new["variety"]=df_new["variety"].str.replace(" ","_")

In [12]:
df_new["title"] = df_new["title"].str.replace("(","")
df_new["title"] = df_new["title"].str.replace(")","")
df_new["description"] = df_new["description"].str.replace(","," ")

In [13]:
col_names = df_new.columns

In [14]:
#make sure every cell is string
df_new["title"] = df_new["title"].astype(str)
df_new["description"] = df_new["description"].astype(str)
df_new["variety"] = df_new["variety"].astype(str)
df_new["country"] = df_new["country"].astype(str)

In [15]:
df_new["lines"] = 0
df_new.head(5)

Unnamed: 0,title,variety,country,description,lines
0,Nicosia 2013 Vulkà Bianco Etna,White_Blend,Italy,Aromas include tropical fruit broom brimston...,0
1,Quinta dos Avidagos 2011 Avidagos Red Douro,Portuguese_Red,Portugal,This is ripe and fruity a wine that is smooth...,0
2,Rainstorm 2013 Pinot Gris Willamette Valley,Pinot_Gris,US,Tart and snappy the flavors of lime flesh and...,0
3,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,US,Pineapple rind lemon pith and orange blossom ...,0
4,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot_Noir,US,Much like the regular bottling from 2012 this...,0


In [16]:
#this cell takes some time
df_new["lines"] = 0
for index,row in df_new.iterrows():
    df_new.loc[index,"lines"] = ",".join(row[col_names]).lower()

In [17]:
#clean the punctuations
df_new["lines"] = df_new["lines"].str.replace(","," ")
df_new["lines"] = df_new["lines"].str.replace(".","")
df_new["lines"] = df_new["lines"].str.replace("'","")

In [19]:
len(df_new["lines"])

119988

In [20]:
save_obj(df_new,"pkl_data/df_online_review")

In [16]:
df_new = load_obj("pkl_data/df_online_review")

In [21]:
#tokenize
lines = [str(x) if isinstance(x,float) else word_tokenize(x) for x in df_new["lines"]]
stop_words = set(stopwords.words('english')) 

In [22]:
#get rid of english stopwords
online_lines = []
for l in lines:
    temp = 0
    if l == 'nan':
        temp = ['nan']
    else:
        temp = [w for w in l if (w not in stop_words)]                  
    online_lines.append(temp)

In [23]:
#save
save_obj(online_lines,"pkl_data/online_lines_notes")

In [17]:
online_lines = load_obj("pkl_data/online_lines_notes")

In [7]:
len(online_lines)

119988

In [8]:
#check most common words
flatten_lines =  [item for sublist in online_lines for item in sublist]
word_counter = Counter(flatten_lines)
word_counter.most_common(10)

[('wine', 73222),
 ('flavors', 57955),
 ('us', 50579),
 ('fruit', 41578),
 ('aromas', 36243),
 ('palate', 34726),
 ('finish', 32124),
 ('acidity', 32101),
 ('red', 31215),
 ('valley', 29443)]

In [None]:
#add the words to the original model which only included wine-folly and details on flavors from wikipedia
corpus.fit(online_lines, window=10)
#word embedding of 50
glove = Glove(no_components=50, learning_rate=0.01)
glove.fit(corpus.matrix, epochs=200, no_threads=4, verbose=True)

In [28]:
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
#save the glove model
save_obj(glove,"pkl_data/glove_online")
save_obj(corpus,"pkl_data/corpus_online")

In [18]:
#load
glove = load_obj("pkl_data/glove_online")
corpus = load_obj("pkl_data/corpus_online")

In [19]:
#calculate the vectors
sentence_vectors = []
real_vectors = []
embed_n = 50
for each_line in online_lines:
    temp = 0
    sum_temp = 0
    if (each_line == 'nan') or (len(each_line) == 0):
        #the vector is 0 if the name is "nan"
        temp = [0]*embed_n
    else:
        #2 vectors are being generated as 2 useful inputs later in the analysis
        #1: mean of the word vectors -> sentence vector
        #2: the word vectors themselves
        temp = np.mean([glove.word_vectors[glove.dictionary[w]] for w in each_line],axis=0)    
        temp_realv = [glove.word_vectors[glove.dictionary[w]] for w in each_line]
    sentence_vectors.append(temp) 
    real_vectors.append(temp_realv)

In [67]:
#pick one of the wines
target_index = random.randint(1,len(online_lines)+1)
target = sentence_vectors[target_index]
Euc_distance= []
Cos_distance = []
temp = 0

for x in sentence_vectors:
    temp = np.linalg.norm(target - np.array(x))
    Euc_distance.append(temp)
    
for x in sentence_vectors:
    temp = sp.spatial.distance.cosine(target,np.array(x))
    Cos_distance.append(temp)

In [68]:
Euc_distance = np.array(Euc_distance)
Cos_distance= np.array(Cos_distance)

In [69]:
#the closest neighbor
index_Euc = Euc_distance.argsort()[1]
print(index_Euc)

34631


In [70]:
#the closest neighbor
index_Cos = Cos_distance.argsort()[1]
print(index_Cos)

53956


In [71]:
df_new.iloc[target_index]

title          Elyse 2012 Morisoli Vineyard Zinfandel Rutherford
variety                                                Zinfandel
country                                                       US
description    Thick in cherry-vanilla  big oak and tannin  t...
lines          elyse 2012 morisoli vineyard zinfandel rutherf...
Name: 31603, dtype: object

In [72]:
#get the index to see the details on description
original_target_index = df_new.index.values[target_index]

In [73]:
df.loc[original_target_index]

Unnamed: 0                                                           31603
country                                                                 US
description              Thick in cherry-vanilla, big oak and tannin, t...
designation                                              Morisoli Vineyard
points                                                                  90
price                                                                   37
province                                                        California
region_1                                                        Rutherford
region_2                                                              Napa
taster_name                                                 Virginie Boone
taster_twitter_handle                                              @vboone
title                    Elyse 2012 Morisoli Vineyard Zinfandel (Ruther...
variety                                                          Zinfandel
winery                   

In [74]:
df.loc[original_target_index,"title"]

'Elyse 2012 Morisoli Vineyard Zinfandel (Rutherford)'

In [75]:
df_new.loc[original_target_index,"description"]

'Thick in cherry-vanilla  big oak and tannin  this vineyard-designate contains handfuls of Petite Sirah and mixed blacks that are  as the producer puts it  too long to list. Given 10 months in American oak  it offers a balance of fruit and leathery power  from raspberry and blackberry to Sunday morning bacon. Drink now through 2020.'

In [76]:
df_new.iloc[index_Euc]

title          John's Blend 2003 Individual Selection Caberne...
variety                                       Cabernet_Sauvignon
country                                                Australia
description    From veteran winemaker John Glaetzer  this Cab...
lines          johns blend 2003 individual selection cabernet...
Name: 35341, dtype: object

In [77]:
#get the original index
original_Euc_index = df_new.index.values[index_Euc]
original_Euc_index

35341

In [78]:
df.loc[original_Euc_index]

Unnamed: 0                                                           35341
country                                                          Australia
description              From veteran winemaker John Glaetzer, this Cab...
designation                                           Individual Selection
points                                                                  93
price                                                                   50
province                                                   South Australia
region_1                                                   Langhorne Creek
region_2                                                               NaN
taster_name                                                 Joe Czerwinski
taster_twitter_handle                                               @JoeCz
title                    John's Blend 2003 Individual Selection Caberne...
variety                                                 Cabernet Sauvignon
winery                   

In [79]:
df_new.loc[original_Euc_index,"description"]

"From veteran winemaker John Glaetzer  this Cabernet from Langhorne Creek is something of a throwback  spending some 28 months in American oak hogsheads. Yet there's clearly enough cherry and cassis fruit to support all of the vanilla and toasted coconut  and the wine finishes long and balanced. Drink 2010–2020."

In [80]:
df_new.iloc[index_Cos]

title          Stonestreet 2006 Christopher's Cabernet Sauvig...
variety                                       Cabernet_Sauvignon
country                                                       US
description    Hard mountain tannins characterize this Cab  w...
lines          stonestreet 2006 christophers cabernet sauvign...
Name: 55725, dtype: object

In [81]:
#get the original index
original_Cos_index = df_new.index.values[index_Cos]
original_Cos_index

55725

In [82]:
df.loc[original_Cos_index]

Unnamed: 0                                                           55725
country                                                                 US
description              Hard mountain tannins characterize this Cab, w...
designation                                                  Christopher's
points                                                                  96
price                                                                  100
province                                                        California
region_1                                                  Alexander Valley
region_2                                                            Sonoma
taster_name                                                            NaN
taster_twitter_handle                                                  NaN
title                    Stonestreet 2006 Christopher's Cabernet Sauvig...
variety                                                 Cabernet Sauvignon
winery                   

In [83]:
df_new.loc[original_Cos_index,"description"]

"Hard mountain tannins characterize this Cab  which was grown above 2 000 feet in the Mayacamas. It's hugely deep in blackberries and black currants  with notes of dark chocolate and violets. Meanwhile  100% new French oak adds an elaborate layer of smoky caramel and char that's entirely in keeping with the wine's volume. Impressive and flashy  but immature  this wine requires cellaring. Best 2012–2018."

In [84]:
#check how many tasters there are
len(df["taster_name"].unique())

20

In [85]:
#check the most similar 5 wines based on cosine distance
index_Cos = Cos_distance.argsort()[1:6]
print(index_Cos)

[ 53956  96001 100304  48296  37816]


In [86]:
#get the original index
original_Cos_index = df_new.index.values[index_Cos]
original_Cos_index

array([ 55725, 102208, 107041,  49707,  38682], dtype=int64)

In [87]:
similar_titles = df.loc[original_Cos_index,"title"]

In [88]:
for t in similar_titles:   
    print(t)

Stonestreet 2006 Christopher's Cabernet Sauvignon (Alexander Valley)
Aquinas 2014 Cabernet Sauvignon (Napa County-Sonoma County-Lake County)
Cakebread 2012 Benchland Select Cabernet Sauvignon (Napa Valley)
Kenwood 2010 Artist Series Cabernet Sauvignon (Sonoma County)
Elyse 2012 Korte Ranch Vineyard Zinfandel (St. Helena)
