## Data Preparation
Join `lcbo_listing` and `lcbo_product` into a single DataFrame, clean and tidy its columns, and create a few basic new features.

In [223]:
import numpy as np
import pandas as pd

In [277]:
# Read both raw LCBO extracts; the paths are relative to the notebook's working directory.
listing, product = map(
    pd.read_csv,
    ('../data/raw/lcbo_listing.csv', '../data/raw/lcbo_product.csv'),
)

# No key is given, so pandas inner-joins on every column name the two frames share.
lcbo = listing.merge(product)

# Sanity check: (rows, columns) of the joined frame, then missing values per column.
print(lcbo.shape)
lcbo.isna().sum()

(9499, 7)


name              0
price             0
prod_url          0
category        196
description    1040
details           0
sku               0
dtype: int64

In [278]:
# --- Flatten the 'details' column into one column per key ------------------------------
# NOTE(security): eval() executes whatever code the CSV cell contains. It is kept so that
# parsing behaviour is unchanged, but it is only acceptable for a trusted, locally produced
# file; ast.literal_eval is the safe replacement if every cell is a plain literal.
# Each cell is presumably the repr of a dict -- TODO confirm against the scraper output.
details_expanded = pd.DataFrame(lcbo['details'].map(eval).tolist(), index=lcbo.index)
lcbo = pd.concat([lcbo.drop(columns=['details']), details_expanded], axis=1)

# --- Normalise column names: drop ':', turn ' ' and '/' into '_', lower-case ------------
# regex=False keeps these literal replacements independent of the pandas-version default.
lcbo.columns = (lcbo.columns.str.replace(':', '', regex=False)
                            .str.replace(' ', '_', regex=False)
                            .str.replace('/', '_', regex=False)
                            .str.lower())

# --- Tidy individual columns ------------------------------------------------------------
# Raw-string patterns avoid invalid-escape warnings; expand=False yields a Series rather
# than a one-column DataFrame. The extracted values stay strings, as before.
lcbo['bottle_size'] = lcbo.bottle_size.str.extract(r'(\d+)', expand=False)
lcbo['sugar_content'] = lcbo.sugar_content.str.extract(r'(\d+)', expand=False)
lcbo['alcohol_vol'] = lcbo.alcohol_vol.str.replace('%', '', regex=False)
lcbo['sweetness_descriptor'] = lcbo.sweetness_descriptor.str.strip()
lcbo['style'] = lcbo['style'].str.strip()

# 'made_in' becomes a list of comma-separated parts. Exactly two parts are read as
# [region, country]; any other length uses the first part as the country and leaves the
# region empty.
lcbo['made_in'] = lcbo.made_in.str.strip().str.split(', ')
has_region = lcbo.made_in.str.len() == 2
lcbo['country'] = np.where(has_region, lcbo.made_in.str[1], lcbo.made_in.str[0])
lcbo['region'] = np.where(has_region, lcbo.made_in.str[0], np.nan)

# Critic score: the first two consecutive digits found after the word "score" in the
# description. Values below 50 are treated as false matches and blanked. Because the
# pattern captures exactly two digits, a score of 100 is read as 10 and then discarded.
score = (lcbo.description
             .str.extract(r'(?i)\Wscore\W.*?(\d{2})', expand=False)
             .astype(float))
lcbo['score'] = score.mask(score < 50)

# --- New features -----------------------------------------------------------------------
# Wine type is inferred from the category slug embedded in the product URL; the first
# matching slug wins and anything unmatched is labelled 'sparkling'.
url_slug_to_type = {
    'red-wine-14001': 'red',
    'white-wine-14002': 'white',
    'ros%C3%A9-wine-14003': 'rose',
    'champagne-14004': 'champagne',
}
# na=False makes a missing URL count as "no match" instead of yielding a non-boolean mask.
slug_matches = [lcbo.prod_url.str.contains(slug, regex=False, na=False)
                for slug in url_slug_to_type]
lcbo['wine_type'] = np.select(slug_matches, list(url_slug_to_type.values()),
                              default='sparkling')

# The raw sku is split once on ':'; the part before it (minus '#') is the product group
# and the part after it is the sku itself.
sku_parts = lcbo.sku.str.strip().str.split(':')
lcbo['group'] = sku_parts.str[0].str.replace('#', '', regex=False)
lcbo['sku'] = sku_parts.str[1].str.strip()

# --- Final tidy-up ----------------------------------------------------------------------
# Drop helper/unused columns, then turn empty strings into NaN. Reassigning instead of
# using inplace=True keeps the lineage explicit.
lcbo = (lcbo.drop(columns=['prod_url', 'made_in', 'this_is_a_vqa_wine',
                           'this_is_a_kosher_product.'])
            .replace('', np.nan))

In [336]:
# Preview the first rows of the prepared frame.
lcbo.head()

Unnamed: 0,name,price,category,description,sku,bottle_size,alcohol_vol,by,sugar_content,sweetness_descriptor,style,varietal,release_date,country,region,wine_type,group,score
0,Folonari Valpolicella Classico DOC,14.95,Valpolicella,Valpolicella is the classic red wine of Italy'...,828,750,12.5,Folonari S.P.A.,4,,,Valpolicella,,Italy,Veneto,red,LCBO,
1,Ruffino Chianti,14.95,Chianti,"Ruffino's original wine, and one of the world'...",1743,750,12.5,Ruffino S.R.L.,4,,,Chianti,,Italy,Tuscany,red,LCBO,
2,Fontana Di Papa Red,14.75,Blend,The grapes for this food-friendly wine are gro...,3038,1500,12.0,Fontana Di Papa S.R.L.,10,,,Blend,,Italy,Lazio,red,LCBO,
3,Leeuwin Art Series Shiraz 2014,47.95,Shiraz/Syrah,"Lots of blackberry, blueberry and crushed whit...",4929,750,13.5,Leeuwin Estate,2,,,Shiraz/Syrah,"September 29, 2018",Australia,Western Australia,red,VINTAGES,94.0
4,Folonari Valpolicella Classico DOC,9.95,Valpolicella,Valpolicella is the classic red wine of Italy'...,6254,375,12.5,Folonari S.P.A.,4,,,Valpolicella,,Italy,Veneto,red,LCBO,
