## Data Preparation
Join `lcbo_listing` and `lcbo_product` into a single DataFrame, clean and tidy its columns, and derive a few basic new features.

In [342]:
import ast

import numpy as np
import pandas as pd

In [345]:
# Read the two raw CSV files and combine them into one frame.
# No join keys are given, so pandas joins on every column name the two
# frames have in common (inner join by default).
listing = pd.read_csv('../data/raw/lcbo_listing.csv')
product = pd.read_csv('../data/raw/lcbo_product.csv')
lcbo = listing.merge(product)

# Quick sanity check: frame dimensions, then missing-value count per column.
print(lcbo.shape)
lcbo.isna().sum()

(9499, 7)


name              0
price             0
prod_url          0
category        196
description    1040
details           0
sku               0
dtype: int64

In [346]:
# Flatten the 'details' column into one column per key.
# Each cell is text that is parsed back into a Python object (presumably a dict
# literal -- confirm against the scraper). ast.literal_eval only accepts
# literals, so nothing read from the CSV can be executed as code; it raises
# ValueError/SyntaxError on anything that is not a plain literal.
details = lcbo['details'].map(ast.literal_eval)
lcbo = pd.concat([lcbo.drop(columns=['details']), details.apply(pd.Series)], axis=1)

# Clean column names: drop ':', turn spaces and '/' into '_', lower-case.
# regex=False because every pattern here is literal text.
lcbo.columns = (lcbo.columns.str.replace(':', '', regex=False)
                            .str.replace(' ', '_', regex=False)
                            .str.replace('/', '_', regex=False)
                            .str.lower())

# Clean up columns.
# Keep only the first run of digits; expand=False returns a Series, not a
# one-column DataFrame. Values stay strings.
lcbo['bottle_size'] = lcbo['bottle_size'].str.extract(r'(\d+)', expand=False)
lcbo['sugar_content'] = lcbo['sugar_content'].str.extract(r'(\d+)', expand=False)
lcbo['alcohol_vol'] = lcbo['alcohol_vol'].str.replace('%', '', regex=False)

# 'made_in' is split on ', '. When the split yields exactly two parts they are
# read as "<region>, <country>"; otherwise the first part is taken as the
# country and the region is left missing.
lcbo['made_in'] = lcbo['made_in'].str.strip().str.split(', ')
has_region = lcbo['made_in'].str.len() == 2
lcbo['country'] = np.where(has_region, lcbo['made_in'].str[1], lcbo['made_in'].str[0])
lcbo['region'] = np.where(has_region, lcbo['made_in'].str[0], np.nan)

# Pull the first two consecutive digits that follow the word "score"
# (case-insensitive) in the description, then discard values below 50.
# NOTE(review): capturing exactly two digits turns a score of 100 into 10,
# which the < 50 filter then drops -- confirm whether 100-point scores occur.
lcbo['score'] = (lcbo['description']
                 .str.extract(r'(?i)\Wscore\W.*?(\d{2})', expand=False)
                 .astype(float))
lcbo['score'] = lcbo['score'].mask(lcbo['score'] < 50)

# Create new features.
# Wine type is derived from the category slug embedded in the product URL;
# the first matching slug wins and anything unmatched is labelled 'sparkling'.
url = lcbo['prod_url']
wine_type_rules = [
    (url.str.contains('red-wine-14001', regex=False, na=False), 'red'),
    (url.str.contains('white-wine-14002', regex=False, na=False), 'white'),
    (url.str.contains('ros%C3%A9-wine-14003', regex=False, na=False), 'rose'),
]
lcbo['wine_type'] = np.select([mask for mask, _ in wine_type_rules],
                              [label for _, label in wine_type_rules],
                              default='sparkling')

# The raw sku text is split on ':'; the part before the colon (minus '#')
# becomes 'group' and the part after it becomes the cleaned 'sku'.
sku_parts = lcbo['sku'].str.strip().str.split(':')
lcbo['group'] = sku_parts.str[0].str.replace('#', '', regex=False)
lcbo['sku'] = sku_parts.str[1].str.strip()

# Fill empty strings with NaN.
lcbo = lcbo.replace('', np.nan)

# Drop columns that are no longer needed.
lcbo = lcbo.drop(columns=['prod_url', 'made_in', 'sweetness_descriptor', 'style',
                          'this_is_a_vqa_wine', 'this_is_a_kosher_product.'])

In [347]:
# Write the prepared frame to the processed-data folder, without the row index.
processed_csv = '../data/processed/lcbo.csv'
lcbo.to_csv(processed_csv, index=False)