# Wine recommendation
This is a wine tasting dataset with scores and descriptions of the wines, as well as the corresponding tasters


In [47]:
import pandas as pd 
import matplotlib.pyplot as plt

In [48]:
#load dataset
df=pd.read_csv('data/winemag-data-130k-v2.csv')

### Basic Analysis of the Data Set

In [54]:
unique_countries = df['country'].unique()
num_countries = len(unique_countries)
print("Number of unique countries:", num_countries)

Number of unique countries: 42


In [55]:
unique_provinces = df['province'].unique()
num_provinces = len(unique_provinces)
print("Number of unique provinces:", num_provinces)

Number of unique provinces: 355


In [56]:
unique_titles = df['title'].unique()
num_titles = len(unique_titles)
print("Number of unique titles:", num_titles)

Number of unique titles: 39344


In [57]:
unique_taster_names = df['taster_name'].unique().tolist()
num_taster_names = len(unique_taster_names)
print("Number of unique taster names:", num_taster_names)

Number of unique taster names: 20


# ***In the first part***

### Recommended wines from different countries（To avoid small probability events, I need to have requirements for the number of entries）

In [49]:
country_points = df[['country', 'points']]

country_counts = country_points['country'].value_counts()

valid_countries = country_counts[country_counts >= 1000].index
valid_country_points = country_points[country_points['country'].isin(valid_countries)]
average_points_by_country = valid_country_points.groupby('country')['points'].mean()

top_countries = average_points_by_country.sort_values(ascending=False).head(10)

In [50]:
top_countries

country
Austria      90.149140
France       88.847597
US           88.577100
Italy        88.492023
Portugal     88.190793
Spain        87.290839
Argentina    86.709544
Chile        86.628998
Name: points, dtype: float64

### Recommended wines from different province（To avoid small probability events, I need to have requirements for the number of entries）

In [41]:
province_points = df[['province', 'points']]

province_counts = province_points['province'].value_counts()

valid_province = province_counts[province_counts >= 200].index
valid_province_points = province_points[province_points['province'].isin(valid_province)]
average_points_by_province = valid_province_points.groupby('province')['points'].mean()

top_province = average_points_by_province.sort_values(ascending=False).head(10)

In [42]:
top_province

province
Champagne          90.243043
Mosel              90.075163
Alsace             89.552113
Burgundy           89.442478
Piedmont           89.360533
South Australia    89.283251
Rhône Valley       89.094937
Douro              89.060386
Oregon             89.058288
Tuscany            88.909432
Name: points, dtype: float64

### Knowing that Champagne is the highest rated province, I would like to know the top 10 variety here

In [64]:
province = 'Champagne' #input the province what you want
subset = df[df['province'] == province]

average_scores = subset.groupby('variety')['points'].mean()
sorted_varieties = average_scores.sort_values(ascending=False)

top_10_varieties = sorted_varieties.head(10)

In [67]:
top_10_varieties

variety
Chardonnay         91.510000
Pinot Noir         90.047619
Pinot Blanc        90.000000
Champagne Blend    89.951456
Pinot Meunier      89.750000
Name: points, dtype: float64

Knowing that Chardonnay is the highest rated variety,I would like to know the wine of Chardonnay

In [81]:
top_variety_wines = df[df['variety'] == 'Chardonnay'] #input the variety what you want
top_variety_wines = top_variety_wines.dropna(subset=['price'])
sorted_top_variety_wines = top_variety_wines.sort_values('price', ascending=False)
selected_columns = ['title','points', 'price']
top_variety_ratings_prices = sorted_top_variety_wines[selected_columns]

Here we can make recommendations according to price

In [82]:
top_variety_ratings_prices

Unnamed: 0,title,points,price
30110,Olivier Leflaive 2014 Montrachet,97,886.0
36529,Krug 2002 Clos du Mesnil Brut Blanc de Blancs ...,99,800.0
30131,Olivier Leflaive 2014 Chevalier-Montrachet,95,710.0
353,Louis Latour 2014 Le Montrachet (Montrachet),96,630.0
30121,Olivier Leflaive 2014 Bâtard-Montrachet,95,569.0
...,...,...,...
32942,Pine & Post 2006 Chardonnay (Washington),87,6.0
30465,Gallo Family Vineyards 2005 Twin Valley Chardo...,83,5.0
8428,Earth's Harvest 2014 Organic Grapes Chardonnay...,85,5.0
37951,Earth's Harvest 2014 Organic Grapes Chardonnay...,85,5.0


### Recommended wines from different varieties（To avoid small probability events, I need to have requirements for the number of entries）

In [35]:
variety_points = df[['variety', 'points']]

variety_counts = variety_points['variety'].value_counts()

valid_variety = variety_counts[variety_counts >= 5].index
valid_variety_points = variety_points[variety_points['variety'].isin(valid_variety)]
average_points_by_variety = valid_variety_points.groupby('variety')['points'].mean()

top_variety = average_points_by_variety.sort_values(ascending=False).head(10)

In [36]:
top_variety

variety
Picolit                   92.416667
Furmint                   91.285714
Savagnin                  91.285714
Tokaji                    91.000000
Neuburger                 90.666667
Roter Veltliner           90.666667
Gros and Petit Manseng    90.583333
Alsace white blend        90.555556
Austrian white blend      90.529412
Scheurebe                 90.500000
Name: points, dtype: float64

find wines with the variety "Picolit" and obtain the corresponding title, price, and points, sorted by score in descending order

In [78]:
wanted_wines = df[df['variety'] == 'Picolit']
wanted_wines = wanted_wines.dropna(subset=['price', 'points'])
sorted_wanted_wines = wanted_wines.sort_values('points', ascending=False)
selected_columns = ['title', 'price', 'points']
wanted_data = sorted_wanted_wines[selected_columns]

In [83]:
wanted_data

Unnamed: 0,title,price,points
33842,Livio Felluga 2007 Picolit Picolit (Colli Orie...,90.0,97
33843,Livio Felluga 2006 Picolit Picolit (Colli Orie...,90.0,96
15613,Livio Felluga 2004 Picolit Picolit (Colli Orie...,100.0,95
36498,Rocca Bernarda 2004 Picolit (Colli Orientali d...,50.0,94
25960,Comelli 2011 Eoos Picolit (Colli Orientali del...,45.0,93
36501,Valchiarò 2004 Picolit (Colli Orientali del Fr...,90.0,93
23380,Jacùss 2007 Picolit (Colli Orientali del Friuli),55.0,92
18510,Comelli 2009 Eoos Picolit (Colli Orientali del...,20.0,90
23862,Marco Cecchini 2005 Picolit (Colli Orientali d...,30.0,90
23864,Conte d'Attimis-Maniago 2004 Picolit (Colli Or...,60.0,90


# ***In the second part（TF-IDF）***

Studying code from week 2.1 Content Based Filtering 

I would like to use the description of the wine to achieve the recommendation of its following purchase after the purchase of a wine.

In [7]:
df.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery'],
      dtype='object')

In [28]:
taster_name_ids = df["taster_name"].unique().tolist()
title_ids = df["title"].unique().tolist()

In [29]:
#Make a dictionary mapping ids (keys) to indexes (values)
taster_name_id_to_index = {x: i for i, x in enumerate(taster_name_ids)}
title_id_to_index = {x: i for i, x in enumerate(title_ids)}

In [12]:
taster_counts = df['taster_name'].value_counts()
taster_names = taster_counts.index.tolist()

print("Number of taster names:", len(taster_names))
print("List of taster names:")
print(taster_names)


Number of taster names: 19
List of taster names:
['Roger Voss', 'Michael Schachner', 'Kerin O’Keefe', 'Paul Gregutt', 'Virginie Boone', 'Matt Kettmann', 'Joe Czerwinski', 'Sean P. Sullivan', 'Anna Lee C. Iijima', 'Jim Gordon', 'Anne Krebiehl\xa0MW', 'Lauren Buzzeo', 'Susan Kostrzewa', 'Jeff Jenssen', 'Mike DeSimone', 'Alexander Peartree', 'Carrie Dykes', 'Fiona Adams', 'Christina Pickard']


In [78]:
print("Number of wines:", len(title_id_to_index))

Number of wines: 39344


In [75]:
num_titles = len(df['title'])
print("Total number of titles:", num_titles)

Total number of titles: 40430


In [8]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm


In [9]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
df['description'] = df['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['description'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(40430, 20234)

Use cosine similarity 

In [10]:
# Import cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [11]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [12]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['title'].iloc[movie_indices]

In [13]:
get_recommendations('Nicosia 2013 Vulkà Bianco  (Etna)')

2000     Feudi del Pisciotto 2013 Baglio del Sole Inzol...
9976     Contrada Santo Spirito di Passopisciaro 2012 A...
36459    La Staffa 2014  Verdicchio dei Castelli di Jes...
9969         Terrazze dell'Etna 2009 Cirneco Rosso  (Etna)
28898                    Vignamaggio 2013 Merlot (Toscana)
16249    Marilena Barbera 2013 Coste al Vento Grillo (S...
29722    Poggiobello 2014 Friulano (Friuli Colli Orient...
908                      Cascina Bruciata 2013  Barbaresco
5991     Barone Sergio 2015 Alègre Grillo (Terre Sicili...
29250    Mastroberardino 2008 Vintage  (Fiano di Avellino)
Name: title, dtype: object

In [14]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the wine that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores of all wines with that wine
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the wines based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar wines
    sim_scores = sim_scores[1:11]

    # Get the wine indices
    wine_indices = [i[0] for i in sim_scores]

    # Get the titles and points of the top 10 most similar wines
    recommended_wines = df.iloc[wine_indices][['title','points']]

    return recommended_wines


In [15]:
title = "Feudi del Pisciotto 2013 Baglio del Sole Inzolia (Sicilia)" 
recommendations = get_recommendations(title)
print(recommendations)


                                                   title  points
16249  Marilena Barbera 2013 Coste al Vento Grillo (S...      86
5991   Barone Sergio 2015 Alègre Grillo (Terre Sicili...      87
9271     Trerose 2014 Salterio  (Rosso di Montepulciano)      86
9969       Terrazze dell'Etna 2009 Cirneco Rosso  (Etna)      87
23715              Ada Nada 2012 Valeirano  (Barbaresco)      88
18353        Casale Daviddi 2012  Rosso di Montepulciano      87
15915  Barone Sergio 2016 Alègre Grillo (Terre Sicili...      88
0                      Nicosia 2013 Vulkà Bianco  (Etna)      87
34212              Montresor 2014 Gran Guardia  (Lugana)      86
20304                    Gino Fasoli NV Brut  (Prosecco)      86


In [13]:
# Filter the DataFrame for reviews by Roger Voss
score_by_taster = df[df['taster_name'] == 'Roger Voss']

# Sort the reviews by descending points
sorted_score = score_by_taster.sort_values(by='points', ascending=False)

# Get the titles of the top 10 reviews
top_10_titles = sorted_score.head(10)['title']

# Print the titles
for i, title in enumerate(top_10_titles):
    print(f"{i+1}. {title}")


1. Krug 2002 Brut  (Champagne)
2. Krug 2002 Clos du Mesnil Brut Blanc de Blancs Chardonnay (Champagne)
3. Blandy's 1969 Bual (Madeira)
4. Louis Jadot 2014  Bâtard-Montrachet
5. Château Margaux 2009  Margaux
6. Château Palmer 2009  Margaux
7. Château Haut-Brion 2007  Pessac-Léognan
8. Salon 2004 Le Mesnil Blanc de Blancs Brut Chardonnay (Champagne)
9. Château Pontet-Canet 2012 Barrel Sample  (Pauillac)
10. Château Mouton Rothschild 2012 Barrel Sample  (Pauillac)


In [14]:
sorted_score.head(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
36528,36528,France,This is a fabulous wine from the greatest Cham...,Brut,100,259.0,Champagne,Champagne,,Roger Voss,@vossroger,Krug 2002 Brut (Champagne),Champagne Blend,Krug
36529,36529,France,Pure Chardonnay from the walled vineyard in th...,Clos du Mesnil Brut Blanc de Blancs,99,800.0,Champagne,Champagne,,Roger Voss,@vossroger,Krug 2002 Clos du Mesnil Brut Blanc de Blancs ...,Chardonnay,Krug
36196,36196,Portugal,A superb wine that brings together all the gre...,,98,230.0,Madeira,,,Roger Voss,@vossroger,Blandy's 1969 Bual (Madeira),Bual,Blandy's
27591,27591,France,This is a major wine from a great white wine v...,,98,367.0,Burgundy,Bâtard-Montrachet,,Roger Voss,@vossroger,Louis Jadot 2014 Bâtard-Montrachet,Chardonnay,Louis Jadot
1558,1558,France,"A massive wine for Margaux, packed with tannin...",,98,1900.0,Bordeaux,Margaux,,Roger Voss,@vossroger,Château Margaux 2009 Margaux,Bordeaux-style Red Blend,Château Margaux
1559,1559,France,"A beautiful wine, with the firmest tannins sur...",,98,380.0,Bordeaux,Margaux,,Roger Voss,@vossroger,Château Palmer 2009 Margaux,Bordeaux-style Red Blend,Château Palmer
39288,39288,France,"The palate opens slowly, offering an initial c...",,98,800.0,Bordeaux,Pessac-Léognan,,Roger Voss,@vossroger,Château Haut-Brion 2007 Pessac-Léognan,Bordeaux-style White Blend,Château Haut-Brion
36530,36530,France,Salon's releases are rare and signify a great ...,Le Mesnil Blanc de Blancs Brut,98,520.0,Champagne,Champagne,,Roger Voss,@vossroger,Salon 2004 Le Mesnil Blanc de Blancs Brut Char...,Chardonnay,Salon
16107,16107,France,"96–98. Barrel sample. This is a structured, ri...",Barrel Sample,97,,Bordeaux,Pauillac,,Roger Voss,@vossroger,Château Pontet-Canet 2012 Barrel Sample (Paui...,Bordeaux-style Red Blend,Château Pontet-Canet
16106,16106,France,"96–98. Barrel sample. This powerful, impressiv...",Barrel Sample,97,,Bordeaux,Pauillac,,Roger Voss,@vossroger,Château Mouton Rothschild 2012 Barrel Sample ...,Bordeaux-style Red Blend,Château Mouton Rothschild


In [15]:
# Filter the DataFrame for reviews by Roger Voss
score_by_taster = df[df['taster_name'] == 'Kerin O’Keefe']

# Sort the reviews by descending points
sorted_score = score_by_taster.sort_values(by='points', ascending=False)

# Get the titles of the top 10 reviews
top_10_titles = sorted_score.head(10)['title']

# Print the titles
for i, title in enumerate(top_10_titles):
    print(f"{i+1}. {title}")


1. Tenuta San Guido 2012 Sassicaia  (Bolgheri Sassicaia)
2. Tenuta San Guido 2013  Bolgheri Sassicaia
3. Brezza 2013 Cannubi  (Barolo)
4. Comm. G. B. Burlotto 2013 Monvigliero  (Barolo)
5. Massolino 2011 Vigna Rionda Riserva  (Barolo)
6. Giuseppe Rinaldi 2013 Brunate  (Barolo)
7. Marchesi Antinori 2012 Solaia Red (Toscana)
8. Germano Ettore 2013 Prapò  (Barolo)
9. Cavallotto 2010 Vignolo Riserva  (Barolo)
10. Elvio Cogno 2013 Ravera  (Barolo)


In [16]:
sorted_score.head(10)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
35517,35517,Italy,"One of Italy's most iconic bottlings, the 2012...",Sassicaia,99,235.0,Tuscany,Bolgheri Sassicaia,,Kerin O’Keefe,@kerinokeefe,Tenuta San Guido 2012 Sassicaia (Bolgheri Sas...,Red Blend,Tenuta San Guido
26889,26889,Italy,"Red berry, cedar and light spice aromas lead t...",,98,235.0,Tuscany,Bolgheri Sassicaia,,Kerin O’Keefe,@kerinokeefe,Tenuta San Guido 2013 Bolgheri Sassicaia,Red Blend,Tenuta San Guido
16770,16770,Italy,One of the best expressions from the classic C...,Cannubi,98,60.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Brezza 2013 Cannubi (Barolo),Nebbiolo,Brezza
16771,16771,Italy,Always the firm's showstopper and one of the b...,Monvigliero,98,70.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Comm. G. B. Burlotto 2013 Monvigliero (Barolo),Nebbiolo,Comm. G. B. Burlotto
16772,16772,Italy,From one of the most celebrated vineyards in t...,Vigna Rionda Riserva,98,151.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Massolino 2011 Vigna Rionda Riserva (Barolo),Nebbiolo,Massolino
16773,16773,Italy,"Classic Nebbiolo aromas of new leather, mentho...",Brunate,98,300.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Giuseppe Rinaldi 2013 Brunate (Barolo),Nebbiolo,Giuseppe Rinaldi
35519,35519,Italy,This stunning expression of Solaia opens with ...,Solaia,97,325.0,Tuscany,Toscana,,Kerin O’Keefe,@kerinokeefe,Marchesi Antinori 2012 Solaia Red (Toscana),Red Blend,Marchesi Antinori
16523,16523,Italy,"Polished and well structured, this vibrant, sa...",Prapò,97,75.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Germano Ettore 2013 Prapò (Barolo),Nebbiolo,Germano Ettore
350,350,Italy,"After a few minutes in the glass, this stunnin...",Vignolo Riserva,97,150.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Cavallotto 2010 Vignolo Riserva (Barolo),Nebbiolo,Cavallotto
16774,16774,Italy,"Wild berry, iris, rose and a potpourri of culi...",Ravera,97,82.0,Piedmont,Barolo,,Kerin O’Keefe,@kerinokeefe,Elvio Cogno 2013 Ravera (Barolo),Nebbiolo,Elvio Cogno


### Find top 10 cost-effective wines with points above 95 and prices below 100 with price tags

In [18]:
# Filter the DataFrame for wines with points above 95 and prices below 100
cost_effective_wines = df[(df['points'] > 95) & (df['price'] < 100)]

# Sort the wines by ascending price
sorted_wines = cost_effective_wines.sort_values(by='price')

# Get the top 10 cost-effective wines
top_10_wines = sorted_wines.head(10)

# Print the titles, points, and prices
for i, row in top_10_wines.iterrows():
    title = row['title']
    points = row['points']
    price = row['price']
    print(f"{i+1}. {title} - {points} points, ${price}")


40310. Isole e Olena 2010  Chianti Classico - 96 points, $27.0
9902. Domaines Schlumberger 2014 Saering Grand Cru Riesling (Alsace) - 96 points, $29.0
16529. Trisaetum 2016 Ribbon Ridge Estate Dry Riesling (Ribbon Ridge) - 96 points, $32.0
34506. Williams Selyem 2007 Late Harvest Muscat (Russian River Valley) - 96 points, $40.0
15852. Stolo 2014 Estate Syrah (San Luis Obispo County) - 96 points, $40.0
16525. Taylor Fladgate NV 325 Anniversary  (Port) - 97 points, $40.0
9905. Kuentz-Bas 2015 Geisberg Grand Cru Riesling (Alsace) - 96 points, $42.0
33846. Samsara 2008 Las Hermanas Vineyard Pinot Noir (Sta. Rita Hills) - 96 points, $44.0
33845. Woodward Canyon 2009 Chardonnay (Washington) - 96 points, $44.0
26893. Iron Horse 2012 Wedding Cuvée Estate Bottled Sparkling (Green Valley) - 96 points, $44.0


### Find top 10 low-point and high-priced wines with points below 90 and prices above 200

In [19]:
# Filter the DataFrame for wines with points below 90 and prices above 500
low_rated_high_priced_wines = df[(df['points'] < 90) & (df['price'] > 200)]

# Sort the wines by descending price
sorted_wines = low_rated_high_priced_wines.sort_values(by='price', ascending=False)

# Get the top 10 low-rated and high-priced wines
top_10_wines = sorted_wines.head(10)

# Print the titles, points, and prices
for i, row in top_10_wines.iterrows():
    title = row['title']
    points = row['points']
    price = row['price']
    print(f"{i+1}. {title} - {points} points, ${price}")


27519. Vega Sicilia 2008 Unico  (Ribera del Duero) - 89 points, $500.0
26152. Armand de Brignac NV Brut Rosé  (Champagne) - 89 points, $450.0
26865. Armand de Brignac NV Brut Rosé  (Champagne) - 89 points, $450.0
31711. Matarromera 2000 Prestigio Pago de las Solanas  (Ribera del Duero) - 88 points, $325.0
26206. Capichera 2011 Albori di Lampata Red (Isola dei Nuraghi) - 88 points, $320.0
34029. Villa Canestrari 2005 10 Anni Riserva  (Amarone della Valpolicella) - 87 points, $300.0
6906. Nathaniel Rose 2012 Left Bank Abigail's Vineyard Domaine Barrien Cabernet Sauvignon (Lake Michigan Shore) - 87 points, $250.0
20838. Domaine Sophie Cinier 2012  Saint-Véran - 89 points, $250.0
38330. Domaine du Pegau 2015 Cuvée à Tempo White (Châteauneuf-du-Pape) - 87 points, $250.0
30715. Buglioni 2007 Riserva  (Amarone della Valpolicella Classico) - 88 points, $249.0


### find out top 10 wine of every taster

In [20]:
# Group the DataFrame by taster_name
grouped_by_taster = df.groupby('taster_name')

# Iterate over each taster group
for taster_name, group_df in grouped_by_taster:
    # Sort the group by descending points
    sorted_group = group_df.sort_values(by='points', ascending=False)
    
    # Get the top 10 wines for the taster
    top_10_wines = sorted_group.head(10)
    
    # Print the taster's name and their top 10 wines
    print(f"Taster: {taster_name}")
    for i, row in top_10_wines.iterrows():
        title = row['title']
        points = row['points']
        print(f"{i+1}. {title} - {points} points")
    print()  # Print an empty line for separation between tasters


Taster: Alexander Peartree
4900. Lovingston 2012 Josie's Knoll Merlot (Monticello) - 91 points
31279. Bel Lago 2013 Chardonnay (Leelanau Peninsula) - 91 points
31598. King Family 2012 Meritage (Monticello) - 90 points
7935. Canyon Wind 2012 Clone 4 Cabernet Sauvignon (Grand Valley) - 90 points
31581. The Infinite Monkey Theorem 2013 Cabernet Franc (Grand Valley) - 90 points
26819. CrossKeys 2013 Touriga (Virginia) - 90 points
21794. Snowy Peaks 2011 Malbec (Grand Valley) - 89 points
21920. Michael Shaps 2014 Viognier (Virginia) - 89 points
23803. Gill's Pier 2012 Cabernet Franc-Merlot (Leelanau Peninsula) - 89 points
36967. Fabbioli Cellars 2012 Tre Sorélle Red (Virginia) - 88 points

Taster: Anna Lee C. Iijima
16523. Robert Weil 2015 Kiedrich Gräfenberg Trockenbeerenauslese Riesling (Rheingau) - 98 points
348. Robert Weil 2014 Kiedrich Gräfenberg Trockenbeerenauslese Riesling (Rheingau) - 97 points
16526. Domdechant Werner 2015 Hochheimer Domdechaney Trockenbeerenauslese Grosse Lage R

# ***In the second part（Embeddings）***

Studying code from week 6.1 Embeddings

I want to get recommendations by the score of the wine with the wine and the corresponding taster.

In [25]:
df["points"].describe()

count    40430.000000
mean        88.438610
std          3.011842
min         80.000000
25%         86.000000
50%         88.000000
75%         91.000000
max        100.000000
Name: points, dtype: float64

In [26]:
from sklearn.preprocessing import MinMaxScaler
##Pick the range
df["points"] = MinMaxScaler().fit_transform(df["points"].values.reshape(-1, 1))

In [27]:
df["points"].describe()

count    40430.000000
mean         0.421930
std          0.150592
min          0.000000
25%          0.300000
50%          0.400000
75%          0.550000
max          1.000000
Name: points, dtype: float64

In [57]:
taster_name_id_to_index

{'Kerin O’Keefe': 0,
 'Roger Voss': 1,
 'Paul Gregutt': 2,
 'Alexander Peartree': 3,
 'Michael Schachner': 4,
 'Anna Lee C. Iijima': 5,
 'Virginie Boone': 6,
 'Matt Kettmann': 7,
 nan: 8,
 'Sean P. Sullivan': 9,
 'Jim Gordon': 10,
 'Joe Czerwinski': 11,
 'Anne Krebiehl\xa0MW': 12,
 'Lauren Buzzeo': 13,
 'Mike DeSimone': 14,
 'Jeff Jenssen': 15,
 'Susan Kostrzewa': 16,
 'Carrie Dykes': 17,
 'Fiona Adams': 18,
 'Christina Pickard': 19}

In [59]:
len(title_id_to_index)

39344

In [61]:
num_titles = len(df['title'])
print("Total number of titles:", num_titles)

Total number of titles: 40430


In [62]:
duplicate_titles = df[df['title'].duplicated()]
print("Duplicate titles:")
print(duplicate_titles['title'])


Duplicate titles:
959                 Jacquart NV Brut Mosaïque  (Champagne)
1176     Spagnol NV Col del Sas Extra Dry  (Prosecco di...
2360     Domaines Devillard 2011 Château de Chamirey  (...
2408               Souverain 2010 Chardonnay (North Coast)
2409     Tasca d'Almerita 2011 Sallier de la Tour Grill...
                               ...                        
40161                      Campbells NV Tokay (Rutherglen)
40299    Segura Viudas NV Aria Estate Extra Dry Sparkli...
40300    J. Garcia Carrion NV Opera Prima Sparkling Mos...
40303               Freixenet NV Spumante Sparkling (Cava)
40381    Louis Latour 2011 Morgeot Premier Cru  (Chassa...
Name: title, Length: 1086, dtype: object


In [65]:
duplicate_titles = df[df['title'].duplicated()]
duplicate_titles_with_taster = duplicate_titles[['title', 'taster_name']]
print(duplicate_titles_with_taster)


                                                   title        taster_name
959               Jacquart NV Brut Mosaïque  (Champagne)         Roger Voss
1176   Spagnol NV Col del Sas Extra Dry  (Prosecco di...                NaN
2360   Domaines Devillard 2011 Château de Chamirey  (...         Roger Voss
2408             Souverain 2010 Chardonnay (North Coast)     Virginie Boone
2409   Tasca d'Almerita 2011 Sallier de la Tour Grill...                NaN
...                                                  ...                ...
40161                    Campbells NV Tokay (Rutherglen)     Joe Czerwinski
40299  Segura Viudas NV Aria Estate Extra Dry Sparkli...  Michael Schachner
40300  J. Garcia Carrion NV Opera Prima Sparkling Mos...  Michael Schachner
40303             Freixenet NV Spumante Sparkling (Cava)  Michael Schachner
40381  Louis Latour 2011 Morgeot Premier Cru  (Chassa...         Roger Voss

[1086 rows x 2 columns]


In [30]:
#Make a new column in the dataframe which contains the appropriate index for each user and movie
df["taster_name_index"] = [taster_name_id_to_index[i] for i in df["taster_name"]]
df["title_index"] = [title_id_to_index[i] for i in df["title"]]

In [31]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,taster_name_index,title_index
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,0.35,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,0,0
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,0.35,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,1,1
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,0.35,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2,2
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,0.35,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,3,3
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,0.35,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2,4


In [32]:
from sklearn.model_selection import train_test_split
#Inputs
x = df[["taster_name_index", "title_index"]]
#Outputs
y = df["points"]
#Get train-test split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

In [33]:
#import library
import torch

In [34]:
#Define class and subclass torch.nn.Module
class DairunNet(torch.nn.Module):
    
    #Override __init__()
    def __init__(self):
        super().__init__()
        print("__init__ called")
    
    #Override forward()
    def forward(self, inputs):
        print("\nforwards pass (new batch)")
        print(inputs,"\n")
        #return the output (its just the input, unchanged)
        return inputs

#Make a new instance of LouisNet    
dairunNet = DairunNet()
loss_fn = torch.nn.MSELoss() 

#Fake dataset
x = torch.FloatTensor([[1],[2],[3],[4]])
y = torch.FloatTensor([[2],[3],[4],[5]])

#Do a forwards pass
prediction = dairunNet(x)
loss = loss_fn(prediction, y)

__init__ called

forwards pass (new batch)
tensor([[1.],
        [2.],
        [3.],
        [4.]]) 



In [35]:
class RecommenderNet(torch.nn.Module):
    def __init__(self, num_taster_name, num_title, embedding_size=20):
        super().__init__()
        self.taster_name_embedding = torch.nn.Embedding(num_taster_name, embedding_size)
        self.taster_name_bias = torch.nn.Embedding(num_taster_name, 1)
        self.title_embedding = torch.nn.Embedding(num_title, embedding_size)
        self.title_bias = torch.nn.Embedding(num_title, 1)
        self.sig = torch.nn.Sigmoid()

    def forward(self, inputs):
        #Split out indexes 
        taster_name_indexes = inputs[:, 0]
        title_indexes = inputs[:, 1]
        #Forward pass on embedding layer
        taster_name_vector = self.taster_name_embedding(taster_name_indexes)
        taster_name_bias = self.taster_name_bias(taster_name_indexes).flatten()
        title_vector = self.title_embedding(title_indexes)
        title_bias = self.title_bias(title_indexes).flatten()
        #Dot product
        dot = (taster_name_vector * title_vector).sum(1)
        with_bias = dot + taster_name_bias + title_bias
        #Activation function
        output = self.sig(with_bias)
        return output

In [97]:
#Pick Embedding size
EMBEDDING_SIZE = 16
#Make new object (calls __init__())
num_taster_name = len(taster_name_ids)
num_title = len(title_ids)
model = RecommenderNet(num_taster_name, num_title, EMBEDDING_SIZE)

In [37]:
print(num_taster_name)

20


In [38]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

#Make a subclass to hold our dataset (movie - user pairs (input) and a rating (label))
class MoviesDataset(Dataset):
    def __init__(self, X,y):
        self.X = torch.IntTensor(X)
        self.y = torch.FloatTensor(y)
    def __len__(self):
        return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [39]:
#Use our train - validation split to make DataLoader objects
train_dl = DataLoader(MoviesDataset(x_train.values,y_train.values), batch_size=64, shuffle=True)
validation_dl = DataLoader(MoviesDataset(x_val.values,y_val.values), batch_size=64, shuffle=True)

In [40]:
epochs = 10
#Use Mean Squared Error as a loss function
loss_fn = torch.nn.MSELoss() 
#Use the Adam algorithm to update the weights based on the loss
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)

In [41]:
#Use a for loop to repeat for the desired number of epochs 
for i in range(epochs):
    
    model.train(True)
    
    #Use a for loop for each batch (provided by the Dataloader)
    running_loss = 0.0
    for (index, batch) in enumerate(train_dl):
        
        #Get batch 
        inputs, labels = batch
        model.zero_grad()
        
        #Forward pass
        prediction = model(inputs)
        
        #Get Loss
        loss = loss_fn(prediction, labels)
        
        #Update weights (back prop)
        loss.backward()
        optimizer.step()
        running_loss += loss
    
    avg_loss = running_loss / (index + 1)

    model.train(False)

    #Now try with the validation set (no need to update weights, just get loss)
    running_vloss = 0.0
    for index, vdata in enumerate(validation_dl):
        vinputs, vlabels = vdata
        voutputs = model(vinputs)
        vloss = loss_fn(voutputs, vlabels)
        running_vloss += vloss

    avg_vloss = running_vloss / (index + 1)
    print('Loss {} Validation Loss {}'.format(avg_loss, avg_vloss))

Loss 0.10652416944503784 Validation Loss 0.06801959872245789
Loss 0.053029175847768784 Validation Loss 0.06705085933208466
Loss 0.02965335175395012 Validation Loss 0.06993348151445389
Loss 0.012448442168533802 Validation Loss 0.0724158063530922
Loss 0.006259355694055557 Validation Loss 0.07196589559316635
Loss 0.004682246129959822 Validation Loss 0.07002466917037964
Loss 0.003931661136448383 Validation Loss 0.06866613775491714
Loss 0.003514733863994479 Validation Loss 0.06738979369401932
Loss 0.003288489067927003 Validation Loss 0.0672011524438858
Loss 0.003191101597622037 Validation Loss 0.0659521222114563


In [42]:
torch.save(model.state_dict(), 'model_weights_1.pth')

In [43]:
model = RecommenderNet(num_taster_name, num_title, EMBEDDING_SIZE)
model.load_state_dict(torch.load('model_weights_1.pth'))
model.eval()

RecommenderNet(
  (taster_name_embedding): Embedding(20, 16)
  (taster_name_bias): Embedding(20, 1)
  (title_embedding): Embedding(39344, 16)
  (title_bias): Embedding(39344, 1)
  (sig): Sigmoid()
)

In [44]:
num_taster_name, EMBEDDING_SIZE, model.taster_name_embedding

(20, 16, Embedding(20, 16))

In [45]:
def get_top_n(taster_name = 0, n = 10): 

    top_n_indexes = get_top_n_indexes(taster_name, n)
   
    return top_n_indexes

In [46]:
def get_names_for_indexes(indexes):
    return [df[df["title"]==title_ids[i]]["title"].item() for i in indexes]

In [47]:
def get_top_n_indexes(taster_name = 0, n = 10):
    #For one user, make a pair with every movie index
    x = torch.IntTensor([[taster_name, i] for i in np.arange(num_title)])
    #Predict 
    predicted_ratings = model(x)
    #Get Top-N indexes
    top_n_indexes = predicted_ratings.argsort()[-n:]
    return top_n_indexes

In [48]:
import numpy as np

In [49]:
#random users' top 10
get_names_for_indexes(get_top_n(np.random.randint(num_taster_name)))

['Kay Brothers 2010 Amery Vineyard Block 6 Shiraz (McLaren Vale)',
 'Prà 2011 Staforte  (Soave Classico)',
 'Bodega Patritti 2010 Lassia Select Red (Patagonia)',
 'Château Laffitte Laujac 2010 L de Laffitte Laujac  (Médoc)',
 'Harlow Ridge 2011 Chardonnay (Lodi)',
 'Quilceda Creek 2008 Cabernet Sauvignon (Columbia Valley (WA))',
 'Signorello 2005 Padrone Proprietary Red Wine Red (Napa Valley)',
 "Paradise Ridge 2003 Ladi's Vineyard Cabernet Sauvignon (Sonoma County)",
 'Swanson 2005 Alexis Cabernet Sauvignon (Oakville)',
 'Incognito 2008 Red (Lodi)']

In [50]:
# iterate over all user indices
for taster_name_index in range(num_taster_name):
    # Get top 10 movies for the current user
    top_title = get_top_n(taster_name_index, 10)
    
    # Print the results
    print("taster_name", taster_name_index)
    print("Top 10 title:")
    for title_index in top_title:
        title = get_names_for_indexes([title_index])  # Pass a list with a single index
        print(title)


taster_name 0
Top 10 title:
['Conte Collalto NV Brut  (Valdobbiadene Prosecco Superiore)']
["Wayfarer 2012 Paige's Ridge Pinot Noir (Fort Ross-Seaview)"]
['Paratus 2012 Cabernet Sauvignon (Mount Veeder)']
['Spring Valley Vineyard 2009 Uriah Estate Grown Red Wine Red (Walla Walla Valley (WA))']
['Viña Godeval 2013 Godello (Valdeorras)']
['Castillo De Feliciana 2011 Reserve Tempranillo (Columbia Valley (WA))']
['Xavier Flouret 2005 Château Haut-Meneau La Victoire  (Premieres Côtes de Blaye)']
['Harlow Ridge 2011 Chardonnay (Lodi)']
['Larkspur 2013 Pinot Noir (Oregon)']
["Paradise Ridge 2003 Ladi's Vineyard Cabernet Sauvignon (Sonoma County)"]
taster_name 1
Top 10 title:
['Tenute Silvio Nardi 1999  Brunello di Montalcino']
['Uccelliera 2006 Riserva  (Brunello di Montalcino)']
['Ridge 2012 Bento Dusi Ranch Zinfandel (Paso Robles)']
['Frank Family 2012 Reserve Zinfandel (Chiles Valley)']
['Boeckel 2015 Zotzenberg Grand Cru Sylvaner (Alsace)']
['Harlow Ridge 2011 Chardonnay (Lodi)']
['Cottan

taster_name 14
Top 10 title:
['Pey-Marin 2013 The Shell Mound Riesling (Marin County)']
['Azelia 2011 Bricco Fiasco  (Barolo)']
['Ginestet 2013 Château Pierrouselle  (Bordeaux)']
['Betz Family 2013 Père de Famille Cabernet Sauvignon (Columbia Valley (WA))']
['Melville 2012 Clone 76 Inox Estate Chardonnay (Sta. Rita Hills)']
['Cayuse 2010 Edith Rosé (Walla Walla Valley (OR))']
["Paradise Ridge 2003 Ladi's Vineyard Cabernet Sauvignon (Sonoma County)"]
['Cottanera 2014 Contrada Calderara Bianco  (Etna)']
["Tenuta Rocca 2016  Dolcetto d'Alba"]
['Robert Ramsay 2014 Par La Mer Red (Horse Heaven Hills)']
taster_name 15
Top 10 title:
['Fenestra 2009 Graciano (Lodi)']
['Parras Wines 2015 Quinta do Gradil Branco White (Lisboa)']
['Pacifico Sur 2008 Reserve Pinot Noir (Curicó Valley)']
['Château Julian 2010  Bordeaux Blanc']
['Valle Hermoso 2014 Elegido Gran Reserva Malbec (Colchagua Valley)']
['Craneford 2006 Allyson Parsons Cabernet Sauvignon (Barossa Valley)']
['Markus Molitor 2010 Zeltinger S

ValueError: can only convert an array of size 1 to a Python scalar

In [107]:
def calculate_similarity_matrix(taster_name, n):
    top_title_indexes = get_top_n_indexes(taster_name, n)
    title_embeddings = model.title_embedding(torch.LongTensor(top_title_indexes)).detach().numpy()

    similarity_matrix = cosine_similarity(title_embedding)
    return similarity_matrix


In [108]:
# Instantiate the RecommenderNet class
model = RecommenderNet(num_taster_name, num_title, EMBEDDING_SIZE)

# Call the forward method with input data
inputs = torch.tensor([[taster_name_index, title_index]])
title_embedding = model.forward(inputs).detach().numpy()


In [109]:
title_embedding = title_embedding.reshape(-1, 1)

In [115]:
def calculate_mean_difference(taster_name=0, n=10):
    similarity_matrix = calculate_similarity_matrix(taster_name, n)
    difference_matrix = 1 - similarity_matrix
    return difference_matrix

def calculate_difference_matrix_for_all_taster_name(n=10):
    difference_matrices = []
    for taster_name in taster_name_ids:
        difference_matrix = calculate_difference_matrix(num_taster_name-1, n)
        difference_matrices.append(difference_matrix)
    return difference_matrices

# Example usage for calculating difference matrices for all users' top 10 movies
difference_matrices = calculate_difference_matrix_for_all_taster_name(n=10)
for i, taster_name in enumerate(taster_name_ids):
    print("Difference matrix for top 10 movies of user", taster_name)
    print(difference_matrices[i])
    print()


Difference matrix for top 10 movies of user Kerin O’Keefe
[[0.]]

Difference matrix for top 10 movies of user Roger Voss
[[0.]]

Difference matrix for top 10 movies of user Paul Gregutt
[[0.]]

Difference matrix for top 10 movies of user Alexander Peartree
[[0.]]

Difference matrix for top 10 movies of user Michael Schachner
[[0.]]

Difference matrix for top 10 movies of user Anna Lee C. Iijima
[[0.]]

Difference matrix for top 10 movies of user Virginie Boone
[[0.]]

Difference matrix for top 10 movies of user Matt Kettmann
[[0.]]

Difference matrix for top 10 movies of user nan
[[0.]]

Difference matrix for top 10 movies of user Sean P. Sullivan
[[0.]]

Difference matrix for top 10 movies of user Jim Gordon
[[0.]]

Difference matrix for top 10 movies of user Joe Czerwinski
[[0.]]

Difference matrix for top 10 movies of user Anne Krebiehl MW
[[0.]]

Difference matrix for top 10 movies of user Lauren Buzzeo
[[0.]]

Difference matrix for top 10 movies of user Mike DeSimone
[[0.]]

Diffe

In [116]:
#define calculate_mean_difference_for_dataset
def calculate_mean_difference_for_dataset(n=10):
    mean_differences = []
    for taster_name in taster_name_ids:
        mean_difference = calculate_mean_difference(num_taster_name-1, n)
        mean_differences.append(mean_difference)
    mean_difference_dataset = np.mean(mean_differences)
    return mean_difference_dataset

# example usage for calculating the mean difference for the entire dataset
mean_difference_dataset = calculate_mean_difference_for_dataset(n=10)
print("Mean difference for the entire dataset (all users' top 10 movies):")
print(mean_difference_dataset) #here we get result

Mean difference for the entire dataset (all users' top 10 movies):
0.0
