# Cleaning Data

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.metrics.pairwise import pairwise_kernels
import pickle
import os.path

In [2]:
# Read in CSV file
wine_file = 'Resources/final_data.csv'

In [3]:
df = pd.read_csv(wine_file)

In [5]:
df.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
1,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine¬†,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
2,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
3,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16,Sicily & Sardinia,Vittoria,,Kerin O‚ÄôKeefe,@kerinokeefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
4,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [4]:
df.head()

Unnamed: 0,id,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
1,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine¬†,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
2,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
3,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16,Sicily & Sardinia,Vittoria,,Kerin O‚ÄôKeefe,@kerinokeefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
4,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [7]:
# Drop rows without a price
df = df[df.price.notnull()]

In [8]:
df.count()

id                       40999
country                  40999
description              40999
designation              40999
points                   40999
price                    40999
province                 40999
region_1                 40999
region_2                 19752
taster_name              31458
taster_twitter_handle    30259
title                    40999
variety                  40999
winery                   40999
dtype: int64

In [9]:
# Drop rows without a designation
df = df[df.designation.notnull()]

In [10]:
df.count()

id                       40999
country                  40999
description              40999
designation              40999
points                   40999
price                    40999
province                 40999
region_1                 40999
region_2                 19752
taster_name              31458
taster_twitter_handle    30259
title                    40999
variety                  40999
winery                   40999
dtype: int64

In [11]:
df = df[df.region_1.notnull()]

In [12]:
df.count()

id                       40999
country                  40999
description              40999
designation              40999
points                   40999
price                    40999
province                 40999
region_1                 40999
region_2                 19752
taster_name              31458
taster_twitter_handle    30259
title                    40999
variety                  40999
winery                   40999
dtype: int64

In [13]:
len(df)

40999

In [None]:
# Export file as a CSV
df.to_csv("Output/clean_final.csv")

# Recommender System with Test Data

In [6]:
# Define the TD-IDF Vectorizer Object and remove english stop words
tfidf = TfidfVectorizer(stop_words='english')

In [7]:
# Construct the matrix
tfidf = tfidf.fit_transform(df['description'])

In [8]:
# Shape of matrix; 70,175 wines described by 24,253 different words
tfidf.shape

(40999, 20595)

In [9]:
# Compute the cosine similarity matrix
# tfidf shows rate of when words appear in entire dataset
cosine_sim = pairwise_kernels(tfidf, tfidf, metric='cosine', n_jobs=-1)

In [10]:
# Construct a reverse map of indices and wine titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()


In [11]:
# Function that takes wine title as an input and outputs most similar wines within our dataset
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get index of wine that matches title
    idx = indices[title]
    # Get the pairwise similarity scores of all wines with the input wine
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort wines based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 5 most similar wines
    sim_scores = sim_scores[1:6]
    # Get the wine indices
    wine_indices = [i[0] for i in sim_scores]
    # Return the top 5 most similar wines
    # Figure out how to return price and points
    return df['title'].iloc[wine_indices]


In [12]:
get_recommendations('Pier 2007 Vila Riserva  (Barbaresco)')

23003    Poggio Verrano 2009 Dr√≤mos L'Altro  (Maremma ...
3964     Dante Rivetti 2012 Bricco di Neive Riserva  (B...
11402    Dei 2011 Bossona Riserva  (Vino Nobile di Mont...
17813     Camigliano 2011 Gualto  (Brunello di Montalcino)
8762     Tenuta La Fuga 2009 Due Sorelle Riserva  (Brun...
Name: title, dtype: object

In [15]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if taster exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [17]:
features = ['taster_name', 'variety', 'province']

for feature in features:
    df[feature] = df[feature].apply(clean_data)

In [18]:
def create_soup(x):
    return ' '.join(x['taster_name']) + ' ' + ' '.join(x['variety']) + ' '.join(x['province'])

In [20]:
# Create a new soup feature
df['soup'] = df.apply(create_soup, axis=1)

In [22]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df['soup'])

In [23]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [25]:
# Reset index of your main DataFrame and construct reverse mapping as before
df = df.reset_index()
indices = pd.Series(df.index, index=df['title'])

In [26]:
get_recommendations('Pier 2007 Vila Riserva  (Barbaresco)', cosine_sim2)

77      Ceretto 2003 Bricco Rocche Brunate  (Barolo)
79             Giacomo Ascheri 2001 Sorano  (Barolo)
83     Giacomo Ascheri 2003 Vigna dei Pola  (Barolo)
172          Cascina Adelaide 2005 4 Vigne  (Barolo)
195        Cavallotto 2010 Vignolo Riserva  (Barolo)
Name: title, dtype: object

# Saving Model

In [25]:
# Serialize data using pickle
# Because pickle cannot pickle bytes larger than 4 GB, data needs to be cut in pieces
n_bytes = 2**31
max_bytes = 2**31 - 1
data = bytearray(n_bytes)

bytes_out = pickle.dumps(data)
with open('wine_cosine.pk', 'wb') as f_out:
    for idx in range(0, len(bytes_out), max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])

In [None]:
# Serialize indices data using pickle
with open('wine_data.pk1', 'wb') as pickle_file:
    pickle.dump(indices, pickle_file)
    

In [26]:
# Serialize dataframe using pickle
df.to_pickle('df.pk')