# Cleaning Data

In [1]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.metrics.pairwise import pairwise_kernels

In [141]:
# Path to the wine-review CSV file
wine_file = 'Resources/final_data.csv'

In [142]:
# Load the file at `wine_file` into a DataFrame
df = pd.read_csv(wine_file)

In [143]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
1,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine¬†,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
2,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem
3,6,Italy,"Here's a bright, informal red that opens with ...",Belsito,87,16,Sicily & Sardinia,Vittoria,,Kerin O‚ÄôKeefe,@kerinokeefe,Terre di Giurfo 2013 Belsito Frappato (Vittoria),Frappato,Terre di Giurfo
4,9,France,This has great depth of flavor with its fresh ...,Les Natures,87,27,Alsace,Alsace,,Roger Voss,@vossroger,Jean-Baptiste Adam 2012 Les Natures Pinot Gris...,Pinot Gris,Jean-Baptiste Adam


In [144]:
# Keep only the rows whose price is present
df = df.dropna(subset=['price'])

In [145]:
# Per-column non-null counts after the price filter
df.count()

Unnamed: 0               40999
country                  40999
description              40999
designation              40999
points                   40999
price                    40999
province                 40999
region_1                 40999
region_2                 19752
taster_name              31458
taster_twitter_handle    30259
title                    40999
variety                  40999
winery                   40999
dtype: int64

In [146]:
# Keep only the rows whose designation is present
df = df.dropna(subset=['designation'])

In [147]:
# Per-column non-null counts after the designation filter
df.count()

Unnamed: 0               40999
country                  40999
description              40999
designation              40999
points                   40999
price                    40999
province                 40999
region_1                 40999
region_2                 19752
taster_name              31458
taster_twitter_handle    30259
title                    40999
variety                  40999
winery                   40999
dtype: int64

In [148]:
# Keep only the rows whose region_1 is present
df = df.dropna(subset=['region_1'])

In [149]:
# Per-column non-null counts after the region_1 filter
df.count()

Unnamed: 0               40999
country                  40999
description              40999
designation              40999
points                   40999
price                    40999
province                 40999
region_1                 40999
region_2                 19752
taster_name              31458
taster_twitter_handle    30259
title                    40999
variety                  40999
winery                   40999
dtype: int64

In [150]:
# Export file as a CSV.
# index=False keeps the row index out of the file; otherwise it is written as
# an extra unnamed column that comes back as "Unnamed: 0" when the CSV is
# read again with pd.read_csv.
df.to_csv("Output/clean_final.csv", index=False)

# Recommender System with Test Data

In [129]:
# Define the TF-IDF vectorizer object and remove English stop words
tfidf = TfidfVectorizer(stop_words='english')

In [130]:
# Construct the TF-IDF matrix from the wine descriptions.
# A fresh vectorizer is built inside this cell because the cell rebinds `tfidf`
# to the resulting matrix: calling `tfidf.fit_transform(...)` on a second run
# would fail, since `tfidf` would no longer be a vectorizer.
tfidf = TfidfVectorizer(stop_words='english').fit_transform(df['description'])

In [131]:
# Shape of matrix: (number of wines, number of distinct terms);
# in the recorded run, 40,999 wines described by 20,595 different words
tfidf.shape

(40999, 20595)

In [132]:
# Compute the cosine similarity of every description against every other
# description (the matrix is passed as both X and Y); n_jobs=-1 requests
# all available cores.
# NOTE(review): with the 40,999 rows in the recorded shape, a dense float64
# 40,999 x 40,999 result would need roughly 13 GB — confirm the output is
# dense and that it fits in memory.
cosine_sim = pairwise_kernels(tfidf, tfidf, metric='cosine', n_jobs=-1)

In [133]:
# Construct a reverse map from wine title to row *position* in `df`.
# Positions (not index labels) are stored because the similarity matrix and the
# later `.iloc` lookup are both positional, while `df.index` keeps its original
# labels after the null-filtering steps and can therefore contain gaps.
indices = pd.Series(range(len(df)), index=df['title'])
# Keep the first occurrence of each title so a lookup yields a single position
# (Series.drop_duplicates would compare the values, not the titles).
indices = indices[~indices.index.duplicated(keep='first')]

In [134]:
# Function that takes wine title as an input and outputs most similar wines within our dataset
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the five wines most similar to ``title``.

    Parameters
    ----------
    title : str
        Wine title to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of wine ``i``
        against every wine. Defaults to the module-level ``cosine_sim`` as it
        was bound when this cell was run.

    Returns
    -------
    pandas.Series
        Up to five entries of ``df['title']``, most similar first. The query
        wine itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get index of wine that matches title
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series;
    # use its first occurrence so `idx` always addresses a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    # Similarity of the input wine against all wines, labelled by row position
    sim_scores = pd.Series(cosine_sim[idx])
    # Remove the input wine explicitly rather than assuming it sorts first:
    # another wine with an identical description also scores 1.0 and could
    # otherwise push the input wine into the results.
    sim_scores = sim_scores.drop(idx)
    # Five highest scores, in descending order (ties keep the earlier row)
    top_matches = sim_scores.nlargest(5)
    # Return the top 5 most similar wines
    return df['title'].iloc[top_matches.index.tolist()]


In [135]:
# Example lookup: wines most similar to one specific title from the dataset
get_recommendations('Pradorey 2010 Vendimia Seleccionada Finca Valdelayegua Single Vineyard Crianza  (Ribera del Duero)')

12927    Coto de Hayas 2013 Centenaria Garnacha (Campo ...
21026    Coto de Hayas 2015 Roble Tempranillo-Cabernet ...
19613             Trapiche 2012 Extravaganza Red (Mendoza)
38137    Emilio Moro 2010 Malleolus de Sanchomart√≠n  (...
20286    Balbas 2012 Ardal Reserva Selecci√≥n Especial ...
Name: title, dtype: object

# Refinement

In [None]:
# 3 most important features: taster, price, and points
features = ['taster_name', 'price', 'points']


def _parse_literal(value):
    """Parse ``value`` as a Python literal when possible, else return it unchanged."""
    # literal_eval only accepts strings (or AST nodes); numeric prices/points
    # and missing values are passed through untouched instead of raising.
    if not isinstance(value, str):
        return value
    try:
        return literal_eval(value)
    except (ValueError, TypeError, SyntaxError):
        # Plain text such as a taster's name is not a Python literal; keep it.
        return value


for feature in features:
    df[feature] = df[feature].apply(_parse_literal)