## Data Preparation (Vivino)

In [4]:
import numpy as np
import pandas as pd
from fuzzywuzzy import process, fuzz

### Load Dataset

In [55]:
vivino_product = pd.read_csv('../data/raw/vivino_product_20190704.csv')
vivino_review = pd.read_csv('../data/raw/vivino_review_20190704.csv')

print(vivino_product.shape)
print(vivino_review.shape)

(9850, 16)
(1695743, 7)


In [56]:
PROD_COLS = ['wine_id', 'wine_type', 'name', 'seo_name', 'ratings_count', 'ratings_ave',
             'country', 'region', 'winery', 'year', 'acidity', 'body', 'flavor',
             'structure', 'style']
REV_COLS = ['wine_id', 'user_id', 'rating', 'created_at']

vivino_product = vivino_product[PROD_COLS]
vivino_review = vivino_review[REV_COLS]

### Initial Data Cleaning

In [57]:
# flatten flavor and structure columns in products
flavor = (vivino_product.flavor
          .dropna()
          .map(eval)
          .apply(lambda x: [i.get('group') for i in x])
          .apply(lambda x: pd.Series(1, x)).fillna(0))

structure = (vivino_product.structure
             .dropna()
             .map(eval)
             .apply(pd.Series)
             .rename(columns={'acidity': 'acidity_s'})
             .drop(columns=['user_structure_count', 'calculated_structure_count'], inplace=True))

vivino_product = pd.concat([vivino_product.drop(columns=['flavor', 'structure']),
                            flavor, structure],
                           axis=1)

vivino_product['wine_type'] = (vivino_product.wine_type
                               .map({1: 'red', 2: 'white', 3: 'sparkling', 4: 'rose'}))

### Find Matches Between Vivino Dataset and LCBO Dataset

In [219]:
lcbo = pd.read_csv('../data/interim/lcbo.csv', dtype={'year': 'str'})

In [233]:
def create_search_dict(df, ind, name, winery, year):
    search_dict = {}
    for i, n, w, y in zip(df[ind], df[name], df[winery], df[year].fillna('')):
        key = '{} {} {}'.format(n, w, y)
        search_dict[key] = i
    return search_dict

vivino_dict = create_search_dict(vivino_product, 'wine_id', 'name', 'winery', 'year')
lcbo_dict = create_search_dict(lcbo, 'sku', 'name', 'by', 'year')


def find_match(input_list, match_list, input_key, match_key):
    pairs = {}
    for item in set(input_list):
        match, ratio = process.extractOne(item, set(match_list))
        if ratio >= 90:
            pairs[item] = match
    pairs_df = (pd.DataFrame.from_dict(pairs, orient='index')
                .reset_index()
                .rename(columns={'index': input_key, 0: match_key}))
    return pairs_df

pairs_df = find_match(list(lcbo_dict.keys()), list(vivino_dict.keys()), 'lcbo_key', 'vivino_key')

In [255]:
lcbo_df = pd.DataFrame([[lcbo_dict.get(key), key] for key in pairs_df['lcbo_key']],
                       columns=['lcbo_sku', 'lcbo_key'])
vivino_df = pd.DataFrame([[vivino_dict.get(key), key] for key in pairs_df['vivino_key']],
                         columns=['vivino_id', 'vivino_key'])
match = pairs_df.merge(lcbo_df).merge(vivino_df).drop_duplicates()

### Export Interm Dataset

In [None]:
vivino_product.to_csv('../data/interim/vivino_product.csv', index=False)
vivino_review.to_csv('../data/interim/vivino_review.csv', index=False)
match.to_csv('../data/interim/match.csv', index=False)

### Subset Vivino Dataset for Common Products & Reviews

In [258]:
product = vivino_product.merge(match['vivino_id'], left_on='wine_id', right_on='vivino_id')
product

Unnamed: 0,wine_id,wine_type,name,seo_name,ratings_count,ratings_ave,country,region,winery,year,...,non_oak,oak,dried_fruit,tropical_fruit,tree_fruit,citrus_fruit,floral,vegetal,microbio,vivino_id
0,1651,red,Solaia,solaia,1035.0,4.7,Italy,Toscana,Antinori,2001,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1651
1,1651,red,Solaia,solaia,1188.0,4.6,Italy,Toscana,Antinori,2012,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1651
2,1651,red,Solaia,solaia,1108.0,4.6,Italy,Toscana,Antinori,2013,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1651
3,1651,red,Solaia,solaia,834.0,4.6,Italy,Toscana,Antinori,2000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1651
4,1651,red,Solaia,solaia,642.0,4.6,Italy,Toscana,Antinori,2005,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1651
5,79631,sparkling,Brut Champagne,brut-champagne,793.0,4.7,France,Champagne,Krug,2002,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79631
6,79631,sparkling,Brut Champagne,brut-champagne,21707.0,4.6,France,Champagne,Krug,N.V.,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79631
7,79631,sparkling,Brut Champagne,brut-champagne,1341.0,4.6,France,Champagne,Krug,2004,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79631
8,79631,sparkling,Brut Champagne,brut-champagne,1049.0,4.6,France,Champagne,Krug,2000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,79631
9,84065,red,Amarone della Valpolicella Classico,amarone-della-valpolicella-classico,686.0,4.7,Italy,Amarone della Valpolicella Classico,Quintarelli Giuseppe,2007,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,84065
