In [2]:
import csv
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd


In [3]:
# Folder that holds both review CSV exports. This is the only line to edit
# when running the notebook on another machine.
DATA_DIR = Path('/Users/jeanzayas/Desktop/Divergence/Portfolio/Wine')

# index_col=0 uses the first CSV column as the row index instead of a feature.
wines150 = pd.read_csv(DATA_DIR / 'winemag-data_first150k.csv', index_col=0)
wines130 = pd.read_csv(DATA_DIR / 'winemag-data-130k-v2.csv', index_col=0)

In [4]:
# Columns that are not used as recommender features.
UNUSED_COLUMNS = ['taster_twitter_handle', 'title', 'designation', 'region_2', 'taster_name']

# A row is only kept when every one of these fields is present.
REQUIRED_COLUMNS = ['price', 'country', 'province', 'region_1', 'variety']

wine_data = (
    # CONCATENATE the two datasets
    pd.concat([wines150, wines130], ignore_index=True)
    # DROP irrelevant columns
    .drop(columns=UNUSED_COLUMNS)
    # HANDLE missing values
    .dropna(subset=REQUIRED_COLUMNS)
    # dropna leaves gaps in the index; renumber 0..n-1 so that index labels
    # coincide with row positions in any matrix built from this frame.
    .reset_index(drop=True)
)

wine_data.head()

Unnamed: 0,country,description,points,price,province,region_1,variety,winery
0,US,This tremendous 100% varietal wine hails from ...,96,235.0,California,Napa Valley,Cabernet Sauvignon,Heitz
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,110.0,Northern Spain,Toro,Tinta de Toro,Bodega Carmen Rodríguez
2,US,Mac Watson honors the memory of a wine once ma...,96,90.0,California,Knights Valley,Sauvignon Blanc,Macauley
3,US,"This spent 20 months in 30% new French oak, an...",96,65.0,Oregon,Willamette Valley,Pinot Noir,Ponzi
4,France,"This is the top wine from La Bégude, named aft...",95,66.0,Provence,Bandol,Provence red blend,Domaine de la Bégude


In [5]:
# Columns carried forward into the recommender.
features = [
    'description',
    'country',
    'points',
    'price',
    'province',
    'winery',
    'variety',
]

# Keep every row, restricted to the selected columns.
wine_features = wine_data.loc[:, features]

## TEXT PROCESSING: 
As the "description" column contains text data, we need to preprocess it before using it as a feature. We can apply techniques such as tokenization, removing stop words, and transforming the text into numerical representations (e.g., TF-IDF or word embeddings). Here, we'll use TF-IDF vectorization.


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer that ignores scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the review text and encode it in a single pass.
description_text = wine_features['description']
description_vectors = vectorizer.fit_transform(description_text)

# (number of wines, vocabulary size)
description_vectors.shape

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/jeanzayas/Library/Python/3.10/lib/python/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/nx/50qt6bg15q70844qzhxhmsmm0000gn/T/ipykernel_62336/628469264.py", line 7, in <module>
    description_vectors = vectorizer.fit_transform(wine_features['description'])
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 2133, in fit_transform
    X = super().fit_transform(raw_documents)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line 1388, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/feature_extraction/text.py", line -1, in _count_vocab
KeyboardInterrup

## COMBINING FEATURES:

Combine the TF-IDF vectors with the numerical features (points and price) using the `hstack` function from `scipy.sparse`.

In [None]:
from scipy.sparse import csr_matrix, hstack

# Min-max scale points and price to [0, 1] before stacking. TfidfVectorizer
# L2-normalises each row by default, so the text part of every row has norm 1;
# raw points (in the 90s) and prices (tens to hundreds) would otherwise
# dominate the cosine similarity and make the descriptions irrelevant.
numeric_features = wine_features[['points', 'price']].astype(float)
column_min = numeric_features.min()
column_range = numeric_features.max() - column_min
# A constant column has zero range; divide by 1 instead so it maps to 0.
column_range = column_range.replace(0, 1)
points_price = ((numeric_features - column_min) / column_range).values

# COMBINE TF-IDF vectors with the scaled points and price.
# format='csr' keeps the result row-sliceable for per-wine lookups.
combined_features = hstack(
    [description_vectors, csr_matrix(points_price)],
    format='csr',
)

# (number of wines, vocabulary size + 2 numeric columns)
combined_features.shape

(215793, 32140)

## BUILDING THE RECOMMENDATION SYSTEM:
Now we can build a recommendation system using a similarity metric such as cosine similarity. We will calculate the cosine similarity between the combined feature vectors of the wines.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity


class _RowwiseCosineSimilarity:
    """Square similarity "matrix" whose rows are computed only when indexed.

    A dense all-pairs result needs n_rows ** 2 floats. For the 215,793 rows
    shown in the combined_features.shape output that is about 4.7e10 values
    (hundreds of GB as float64), so the full matrix is never materialised.
    Indexing with a single integer returns that wine's similarity to every
    wine, i.e. the same values as the corresponding row of the full matrix.
    """

    def __init__(self, features):
        # Sparse input is converted to CSR so single rows can be sliced;
        # dense arrays are used as they are.
        self._features = features.tocsr() if hasattr(features, 'tocsr') else features
        n_rows = self._features.shape[0]
        self.shape = (n_rows, n_rows)

    def __len__(self):
        return self.shape[0]

    def __getitem__(self, wine_index):
        """Return a 1-D array of cosine similarities for one integer row index."""
        # The list index keeps the query 2-D, shape (1, n_features).
        query = self._features[[wine_index]]
        return cosine_similarity(query, self._features).ravel()


# Pairwise cosine similarity, evaluated one row at a time on demand.
similarity_matrix = _RowwiseCosineSimilarity(combined_features)

NameError: name 'combined_features' is not defined

## Generating Recommendations:

To generate recommendations, we need to identify the most similar wines for a given wine or a set of preferences. We can create a function that takes a wine index as input and returns the winery names of the most similar wines.

In [None]:
# Winery name for each wine row; used to label the recommendations.
wine_names = wine_features.loc[:, 'winery']
def get_recommendations(wine_index, num_recommendations=5, similarity=None, names=None):
    """Return the winery names of the wines most similar to a given wine.

    Parameters
    ----------
    wine_index : int
        Row position of the query wine in the similarity data.
    num_recommendations : int, default 5
        Maximum number of names to return.
    similarity : indexable, optional
        Object for which ``similarity[wine_index]`` is a 1-D NumPy array of
        similarity scores against every wine. Defaults to the notebook-level
        ``similarity_matrix``.
    names : pandas.Series, optional
        Name for each wine, in the same row order as the similarity scores.
        Defaults to the notebook-level ``wine_names``.

    Returns
    -------
    list
        Names of the most similar wines, best match first, never including
        the query wine itself.
    """
    if similarity is None:
        similarity = similarity_matrix
    if names is None:
        names = wine_names

    # GET similarity scores for the given wine index
    similarity_scores = similarity[wine_index]

    # Rank positions from most to least similar; a stable sort keeps the
    # ordering of tied scores deterministic.
    ranked_positions = similarity_scores.argsort(kind='stable')[::-1]

    # Remove the query wine by position. It is not guaranteed to be ranked
    # first: another wine with identical features ties with it at 1.0.
    own_position = wine_index % len(similarity_scores)
    ranked_positions = ranked_positions[ranked_positions != own_position]

    top_positions = ranked_positions[:num_recommendations]

    # .iloc because these are row positions, not index labels; the labels
    # have gaps once rows have been dropped from the frame.
    return names.iloc[top_positions].tolist()
    