## Feature Engineering
Prepare dataset for ML

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

### Load Dataset

In [2]:
lcbo = pd.read_csv('../data/interim/lcbo.csv')
vivino_product = 
vivino_review = pd.read_csv('')

### Fill Missing Values

In [3]:
# fill remaining missing values
lcbo['region'].fillna('General', inplace=True)
lcbo['score'].fillna(50, inplace=True)
lcbo['sugar_content'].fillna(
    round(lcbo.groupby('varietal')['sugar_content'].transform('mean'), 1),
    inplace=True
)
lcbo['sugar_content'].fillna(
    round(lcbo.groupby('wine_type')['sugar_content'].transform('mean'), 1),
    inplace=True
)
lcbo['description'].fillna('na', inplace=True)

# check missing values
lcbo.isnull().sum()

name             0
price            0
description      0
sku              0
bottle_size      0
alcohol_vol      0
sugar_content    0
varietal         0
country          0
region           0
score            0
wine_type        0
group            0
dtype: int64

### Dummy Encoding Categorical Features

In [5]:
# create dummy table for wine type, group, country and region
lcbo = pd.get_dummies(lcbo, columns=['wine_type', 'group', 'country', 'region'])

# create dummy table for varietal
## extract all varietals
varietals = []
for val in lcbo.varietal:
    val = val.split('/')
    varietals.extend(val)
varietals = set(varietals)

## create dummy table for varietal and drop original column
def split_varietal(val):
    try:
        if val.find(varietal) > -1:
            return 1
        else:
            return 0
    except AttributeError:
        return 0
    
for varietal in varietals:
    lcbo[varietal] = lcbo['varietal'].apply(split_varietal)

lcbo.drop(columns=['varietal'], inplace=True)

In [92]:
len(varietals)
print(varietals)

{'agiorgitiko',
 'aglianico',
 'albana',
 'albarino',
 'albarossa',
 'alentejo',
 'alicante',
 'alicante bouschet',
 'aligote',
 'altesse',
 'alvarinho',
 'amarone',
 'antao vaz',
 'aragonez',
 'arinto',
 'arneis',
 'assyrtiko',
 'avesso',
 'babeasca neagra',
 'baco noir',
 'baga',
 'barbaresco',
 'barbera',
 'bardolino',
 'barolo',
 'beaujolais',
 'blaufrankisch',
 'blend',
 'bobal',
 'bonarda',
 'bordeaux',
 'braucol',
 'brunello',
 'burgundy',
 'cabernet',
 'cabernet franc',
 'cabernet merlot',
 'cabernet sauvignon',
 'cabernet shiraz',
 'canaiolo',
 'cannonau ',
 'carignan',
 'carmenere',
 'carricante',
 'casavecchia',
 'castelao',
 'catarratto',
 'cava',
 'cesanese',
 'champagne',
 'chardonnay',
 'chardonnay musque',
 'chasselas',
 'chateauneuf-du-pape',
 'chenin blanc',
 'chianti',
 'cinsault',
 'clairette',
 'colombard',
 'colorino',
 'cortese',
 'corvina',
 'cremant',
 'dao',
 'debina',
 'dolcetto',
 'douro',
 'ehrenfelser',
 'falanghina',
 'feteasca alba',
 'feteasca neagra',


### Export Dataset

In [None]:
lcbo.to_pickle('../data/processed/lcbo.pkl')

# Vivino

### Collaborative Filtering

In [11]:
vivino_review = pd.read_csv('../data/interim/vivino_review.csv')
reviews = vivino_review[['wine_id', 'user_id', 'rating']]

In [12]:
# create user x item matrix
user_by_wine = reviews.groupby(['user_id', 'wine_id'])['rating'].max().unstack()

In [None]:
# Create a dictionary with users and corresponding movies seen

def wines_reviewed(user_id):
    '''
    INPUT:
    user_id - the user_id of an individual as int
    OUTPUT:
    movies - an array of movies the user has watched
    '''
    wines = user_by_wine.loc[user_id][user_by_wine.loc[user_id].isnull() == False].index.values

    return movies


def create_user_movie_dict():
    '''
    INPUT: None
    OUTPUT: movies_seen - a dictionary where each key is a user_id and the value is an array of movie_ids
    
    Creates the movies_seen dictionary
    '''
    n_users = user_by_movie.shape[0]
    movies_seen = {}

    for user1 in range(1, n_users+1):
        movies_seen[user1] = movies_watched(user1)
        
    return movies_seen
    
movies_seen = create_user_movie_dict()

## Content Based