<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project - Wine Recommender System <br> [Part 2 of 3]

## Contents:
- [Overview](##Overview)

---
## Overview
---

In [6]:
# Importing Libraries

import pandas as pd
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api
from gensim.corpora import Dictionary
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize

In [2]:
df = pd.read_csv('../data/wine_reviews_clean.csv')

In [3]:
df.head()

Unnamed: 0,country,description,points,price,province,region_1,taster_name,title,variety,winery,vintage,tokens
0,France,Medium-gold in color. Complex and inviting nos...,92,80,Alsace,Alsace,Anne Krebiehl MW,Dopff & Irion 2004 Schoenenbourg Grand Cru Ven...,Riesling,Dopff & Irion,2004,"['medium-gold', 'color', 'complex', 'inviting'..."
1,Italy,"Slightly backward, particularly given the vint...",92,70,Piedmont,Barolo,Kerin O’Keefe,Ceretto 2003 Bricco Rocche Prapó (Barolo),Nebbiolo,Ceretto,2003,"['slightly', 'backward', 'particularly', 'give..."
2,US,The vineyard is one of the better Chardonnay s...,92,36,California,Alexander Valley,Virginie Boone,Matrix 2007 Stuhlmuller Vineyard Chardonnay (A...,Chardonnay,Matrix,2007,"['vineyard', 'one', 'better', 'chardonnay', 's..."
3,US,Defines Rockpile Zinfandel in intensity of fru...,92,39,California,Rockpile,Virginie Boone,Mauritson 2007 Rockpile Cemetary Vineyard Zinf...,Zinfandel,Mauritson,2007,"['defines', 'rockpile', 'zinfandel', 'intensit..."
4,US,This sophisticated wine is mostly Cabernet Sau...,92,45,California,Napa Valley,Virginie Boone,Silverado 2006 Cabernet Sauvignon (Napa Valley),Cabernet Sauvignon,Silverado,2006,"['sophisticated', 'mostly', 'cabernet', 'sauvi..."


## Average Word2Vec

In [50]:
#splitting the description into words

corpus = []
for words in df['tokens']:
    corpus.append(words.split())

In [51]:
# Load the pre-trained Word2Vec model
word2vec_model = api.load('word2vec-google-news-300')

In [52]:
# create function to generate average word embeddings for each wine description

def average_word_embeddings(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float32")
    n_words = 0
    
    for word in words:
        if word in vocabulary:
            n_words += 1
            feature_vector = np.add(feature_vector, model[word])
    
    if n_words:
        feature_vector = np.divide(feature_vector, n_words)
    
    return feature_vector

In [53]:
# generate average word embeddings for all wine descriptions

vocabulary = set(word2vec_model.index_to_key)
num_features = 300

wine_embeddings = list(map(lambda x: average_word_embeddings(x, word2vec_model, vocabulary, num_features), corpus))

In [54]:
# calculate cosine similarity between wine descriptions using embeddings

wine_embeddings_matrix = np.vstack(wine_embeddings)
cosine_sim_word2vec = cosine_similarity(wine_embeddings_matrix, wine_embeddings_matrix)

: 

: 

In [47]:
def recommendations(title, cosine_sim_matrix):

    # Taking the title and creating a new dataframe called wines
    wines = df[['title']]
    
    # Reverse mapping of the index
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()
         
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    wine_indices = [i[0] for i in sim_scores]
    recommend = wines.iloc[wine_indices]
    
    print("Top 10 wine recommendations for '{}':".format(title))
    for index, row in recommend.iterrows():
        print("- {}".format(row['title']))

In [49]:
# Test recommendation system
sample_wine = df.iloc[0]['title']
recommendations(sample_wine, cosine_sim_word2vec)

Top 10 wine recommendations for 'Dopff & Irion 2004 Schoenenbourg Grand Cru Vendanges Tardives Riesling (Alsace)':
- Ceretto 2003 Bricco Rocche Prapó  (Barolo)
- Matrix 2007 Stuhlmuller Vineyard Chardonnay (Alexander Valley)
- Mauritson 2007 Rockpile Cemetary Vineyard Zinfandel (Rockpile)
- Silverado 2006 Cabernet Sauvignon (Napa Valley)
- Le Riche 2003 Cabernet Sauvignon Reserve Cabernet Sauvignon (Stellenbosch)
- Pierre Sparr 2007 Vendages Tardives Gewurztraminer (Alsace)
- Pierre Sparr 2008 Alsace One White (Alsace)
- Kuentz-Bas 2008 Pinot Blanc (Alsace)
- Camberley 2004 Philosophers' Stone Red (Stellenbosch)
- Ceretto 2003 Bricco Rocche Brunate  (Barolo)


In [29]:
df['tokens'] = df['tokens'].apply(lambda x: word_tokenize(x.lower()))

KeyboardInterrupt: 

In [None]:
# Train a Word2Vec model
w2v = Word2Vec(df['tokens'], min_count=1)

# Compute Average Word2Vec embeddings
def average_w2v(tokens, model, num_features):
    feature_vec = np.zeros((num_features,), dtype='float32')
    n_words = 0
    for word in tokens:
        if word in model.wv.key_to_index:  # Updated to use key_to_index instead of vocab
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if n_words:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec

avg_w2v_matrix = np.zeros((len(df), w2v.vector_size))
for i, tokens in enumerate(df['tokens']):
    avg_w2v_matrix[i] = average_w2v(tokens, w2v, w2v.vector_size)

# Compute TF-IDF Word2Vec embeddings
dct = Dictionary(df['tokens'])
corpus = [dct.doc2bow(line) for line in df['tokens']]
tfidf_model = TfidfModel(corpus)
tfidf_w2v_matrix = np.zeros((len(df), w2v.vector_size))

for i, tokens in enumerate(df['tokens']):
    weighted_vec = np.zeros((w2v.vector_size,), dtype='float32')
    n_words = 0
    for word in tokens:
        if word in w2v.wv.key_to_index:  # Updated to use key_to_index instead of vocab
            word_vec = w2v.wv[word]
            word_tfidf_list = tfidf_model[dct.doc2bow([word])]
            if word_tfidf_list:
                word_tfidf = word_tfidf_list[0][1]
                weighted_vec += word_vec * word_tfidf
                n_words += 1
    if n_words:
        weighted_vec /= n_words
    tfidf_w2v_matrix[i] = weighted_vec

# Compute cosine similarity for both matrices
cs_avg = cosine_similarity(avg_w2v_matrix, avg_w2v_matrix)
cs_tfidf = cosine_similarity(tfidf_w2v_matrix, tfidf_w2v_matrix)

: 

: 

In [None]:
# Build the recommendation functions
def recommend_wines_avg(user_input, n=10):
    user_tokens = preprocess_text(user_input)
    user_vec = average_word2vec(user_tokens, w2v, w2v.vector_size).reshape(1, -1)
    sim_scores = cs_avg.dot(user_vec.T).flatten()
    top_indices = sim_scores.argsort()[-n:][::-1]
    return df.iloc[top_indices]

def recommend_wines_tfidf(user_input, n=10):
    user_tokens = preprocess_text(user_input)
    user_vec = average_word2vec(user_tokens, w2v, w2v.vector_size).reshape(1, -1)
    sim_scores = cs_tfidf.dot(user_vec.T).flatten()
    top_indices = sim_scores.argsort()[-n:][::-1]
    return df.iloc[top_indices]

# Test the recommendation engines
user_input = "cherry grape fruity"
recommended_wines_avg = recommend_wines_avg(user_input, n=10)
recommended_wines_tfidf = recommend_wines_tfidf(user_input, n=10)

print("Recommendations based on Average Word2Vec:")
print(recommended_wines_avg[['title', 'variety', 'points', 'price']])

print("\nRecommendations based on TF-IDF Word2Vec:")
print(recommended_wines_tfidf[['title', 'variety', 'points', 'price']])