In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import nltk
import re
import pickle
from scipy import sparse

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from nltk.corpus import stopwords
from nltk.stem.porter import *
from textblob import TextBlob
from textblob import Word
from collections import Counter
from nltk import word_tokenize
from nltk import sent_tokenize

%matplotlib inline

In [2]:
os.chdir('/home/fykos/Documents/workspace/wine_beer_exploration/')

In [3]:
wine = pd.read_csv('data/raw/winemag-data-130k-v2.csv')

In [4]:
# drop all duplicates and check how many left
new_wine = wine.drop_duplicates()
new_wine.shape

(129971, 14)

In [5]:
# check again for missing values
data = []
for col in new_wine.columns:
    data.append([col, new_wine[col].isnull().sum(), '{:.2f}%'.format((new_wine[col].isnull().sum())/len(new_wine[col])*100) ])
df = pd.DataFrame(data, columns=['columns', 'missing value count', 'missing value percentage'])
df

Unnamed: 0,columns,missing value count,missing value percentage
0,Unnamed: 0,0,0.00%
1,country,63,0.05%
2,description,0,0.00%
3,designation,37465,28.83%
4,points,0,0.00%
5,price,8996,6.92%
6,province,63,0.05%
7,region_1,21247,16.35%
8,region_2,79460,61.14%
9,taster_name,26244,20.19%


In [6]:
# I will remove all the rows that have missing values in the country and province column
# I will still have plenty to work with
new_wine = new_wine.dropna(subset=['country', 'designation', 'price', 'region_1'])
new_wine.shape

(70175, 14)

In [7]:
new_wine.drop(['Unnamed: 0', 'region_2', 'taster_name', 'taster_twitter_handle', 'designation', 'title'], axis = 1, inplace = True)

In [8]:
new_wine.reset_index(inplace=True)

In [9]:
new_wine.drop('index', axis = 1, inplace = True)

In [10]:
new_wine.head(3)

Unnamed: 0,country,description,points,price,province,region_1,variety,winery
0,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,Riesling,St. Julian
1,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Pinot Noir,Sweet Cheeks
2,Spain,Blackberry and raspberry aromas show a typical...,87,15.0,Northern Spain,Navarra,Tempranillo-Merlot,Tandem


# Processing the descriptions to use in nearest neighbors

In [11]:
def normalize(review):
    review_letters = re.sub('[^a-zA-Z]', ' ', str(review))
    review_letters = review_letters.lower()
    return (" ".join(review_letters.split()))

In [12]:
def remove_stopwords(review):
    stop_words = set(stopwords.words('english'))
    ls = [word for word in review.split() if word not in stop_words]
    txt = " ".join(ls)
    return (txt)

In [13]:
def lemmatizing(review):
    words = TextBlob(review).words.singularize()
    return (" ".join(words))

In [None]:
# this list is to be given as input to word2vec because is word tokenized
wine_processed_reviews = []
for review in new_wine['description']:
    wine_processed_reviews.append(lemmatizing(remove_stopwords(normalize(review))))

In [69]:
# stops = ['wine', 'syrup', 'cake', 'cheese', 'cream', 'bean', 'hard', 'milk', 'sauce', 'barbecue', 'steak', 'rock', 'powder', 'ruby', 'oil', 'salt', 'pastry', 'flesh', 'bitter', 'sugar', 'leather', 'herbal', 'creamy', 'table', 'brown', 'golden', 'gold', 'extract', 'broad', 'natural', 'salmon', 'tongue', 'dry', 'pure', 'root', 'sea', 'port', 'chewy', 'solid', 'blue', 'pink', 'ground', 'beef', 'purple', 'spring', 'lean', 'raw', 'red', 'black', 'white', 'yellow', 'mature', 'tropical', 'meat', 'wild', 'new', 'juice', 'firm', 'sweet', 'fresh', 'light', 'flower', 'green', 'soft', 'skin', 'spice', 'dark', 'herb', 'palate', 'valley', 'finish', 'drink', 'flavor', 'fruit', 'aroma', 'note', 'texture', 'thi', 'acidity']
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_matrix = tfidf_vectorizer.fit_transform(wine_processed_reviews)

In [70]:
# features holds a list of all the words in the tfidf's vocabulary in the same order as the column in the matrix
features = tfidf_vectorizer.get_feature_names()
weights = np.asarray(tfidf_matrix.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term':features, 'weights':weights})
weights_df = weights_df.sort_values(by='weights', ascending=False)

In [71]:
new_wine.drop('description', axis = 1, inplace = True)

In [72]:
new_wine['country'] = new_wine['country'].astype('category').cat.codes
new_wine['province'] = new_wine['province'].astype('category').cat.codes
new_wine['region_1'] = new_wine['region_1'].astype('category').cat.codes
new_wine['variety'] = new_wine['variety'].astype('category').cat.codes
new_wine['winery'] = new_wine['winery'].astype('category').cat.codes

In [73]:
new_wine.head(3)

Unnamed: 0,country,points,price,province,region_1,variety,winery
0,6,87,13.0,30,486,325,8549
1,6,87,65.0,43,1097,300,8684
2,5,87,15.0,39,673,401,8738


In [74]:
X = new_wine.as_matrix()

In [75]:
X.shape

(70175, 7)

In [76]:
tfidf_matrix.shape

(70175, 513695)

In [77]:
test = sparse.hstack((X, tfidf_matrix))

In [78]:
test = test.tocsr()

In [98]:
test.shape

(70175, 513702)

# Finding the nearest neighbor

In [101]:
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=31, algorithm='auto', metric='euclidean').fit(test)
distances, indices = nbrs.kneighbors(test[11])

In [102]:
print(distances)
print(indices)

[[   0.            0.            7.21040938   56.17803892   64.02331604
    64.21057431   67.25310906   67.25315928   70.24948202   75.21299592
    75.2129964    75.2129964    78.4601155    89.17955639   92.03251124
    93.46649662   97.43201542   97.47818044   98.18342816   98.20373283
   100.5683239   100.64292389  104.19688982  104.37422133  108.44806191
   108.44806191  108.8668912   109.48938027  109.50337424  112.22741486
   112.41439458]]
[[55463    11 51053 52908 61037 17469 68448 49462  4641 23890 11122 53031
    847 27258 57804 29796 57706 58153 12468 24648 38390 21119 56390 61266
  30269 15818 19347 60495 56935 19518 42356]]


In [105]:
wine.iloc[11, :]

Unnamed: 0                                                              11
country                                                             France
description              This is a dry wine, very spicy, with a tight, ...
designation                                                            NaN
points                                                                  87
price                                                                   30
province                                                            Alsace
region_1                                                            Alsace
region_2                                                               NaN
taster_name                                                     Roger Voss
taster_twitter_handle                                           @vossroger
title                              Leon Beyer 2012 Gewurztraminer (Alsace)
variety                                                     Gewürztraminer
winery                   

In [104]:
wine.iloc[55463, :]

Unnamed: 0                                                           55463
country                                                                 US
description              This has light scents of peach, golden apple, ...
designation                                                     Frosty Dog
points                                                                  84
price                                                                   20
province                                                          Virginia
region_1                                                          Virginia
region_2                                                               NaN
taster_name                                                            NaN
taster_twitter_handle                                                  NaN
title                    Chateau Morrisette 2011 Frosty Dog White (Virg...
variety                                                        White Blend
winery                   

In [106]:
wine.iloc[51053, :]

Unnamed: 0                                                           51053
country                                                                 US
description              This wine is mostly Cabernet Sauvignon (71%), ...
designation                                               Taptiel Vineyard
points                                                                  92
price                                                                   45
province                                                        Washington
region_1                                                      Red Mountain
region_2                                                   Columbia Valley
taster_name                                               Sean P. Sullivan
taster_twitter_handle                                        @wawinereport
title                     Cadence 2013 Taptiel Vineyard Red (Red Mountain)
variety                                           Bordeaux-style Red Blend
winery                   

In [107]:
wine.iloc[11, :]['description']

"This is a dry wine, very spicy, with a tight, taut texture and strongly mineral character layered with citrus as well as pepper. It's a food wine with its almost crisp aftertaste."

In [109]:
wine.iloc[51053, :]['description']

'This wine is mostly Cabernet Sauvignon (71%), with the rest Merlot (18%) and Cabernet Franc. Very pure aromas of black cherry and black currant are followed by cranberry and tart fruit flavors surrounded by a firm structure. It shows a lovely sense of restraint and balance, especially considering the warm region and vintage.'

In [84]:
wine.iloc[64126, :]

Unnamed: 0                                                           64126
country                                                              Italy
description              Mario Schiopetto and his sons, Carlo and Giorg...
designation                                                            NaN
points                                                                  90
price                                                                   35
province                                                Northeastern Italy
region_1                                                            Collio
region_2                                                               NaN
taster_name                                                     Roger Voss
taster_twitter_handle                                           @vossroger
title                          Mario Schiopetto 2001 Pinot Bianco (Collio)
variety                                                       Pinot Bianco
winery                   

In [93]:
for index in indices:
    print(wine.iloc[index, :]['price'])

0          NaN
44059     24.0
4467      22.0
43036     35.0
29884    111.0
50855     15.0
23790     20.0
12361     11.0
53202     17.0
59848     18.0
31300     20.0
15204     53.0
34762     31.0
26342     22.0
64081     40.0
28747     18.0
28809     19.0
22393     36.0
68106     18.0
37426     42.0
16058     79.0
10037     14.0
16113      NaN
70026     11.0
1974      50.0
56615     60.0
9310      70.0
44060     18.0
32811      NaN
11590     20.0
57842     15.0
Name: price, dtype: float64
