## Feature Engineering
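Prepare the LCBO dataset for machine learning: fill missing values, dummy-encode the categorical features, and convert the text descriptions into TF-IDF features.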

In [99]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

### Load Dataset

In [100]:
# Read the processed LCBO data from disk into a DataFrame.
lcbo = pd.read_csv('../data/processed/lcbo.csv')

# Per-column count of missing values, shown as the cell output.
lcbo.isna().sum()

name                0
price               0
description         0
sku                 0
bottle_size         0
alcohol_vol         0
sugar_content    3077
varietal            0
country             0
region           1818
score            6336
wine_type           0
group               0
dtype: int64

### Fill Missing Values

In [101]:
# Fill the remaining missing values.
#
# Results are assigned back to the column instead of calling
# `lcbo[col].fillna(..., inplace=True)`: that form is chained assignment on a
# column selection, which emits a FutureWarning in pandas 2.x and, under
# Copy-on-Write, fills a temporary object and leaves `lcbo` unchanged.
lcbo['region'] = lcbo['region'].fillna('General')

# NOTE(review): 50 is used as the placeholder for wines without a score;
# confirm this is the intended neutral value.
lcbo['score'] = lcbo['score'].fillna(50)

# Impute sugar content with the group mean (rounded to 1 decimal), first per
# varietal and then per wine type for rows that are still missing. The order
# matters: the wine-type means are computed after the varietal pass, so they
# include the values imputed by it.
for group_col in ('varietal', 'wine_type'):
    group_mean = lcbo.groupby(group_col)['sugar_content'].transform('mean').round(1)
    lcbo['sugar_content'] = lcbo['sugar_content'].fillna(group_mean)

# check missing values again
lcbo.isnull().sum()

name             0
price            0
description      0
sku              0
bottle_size      0
alcohol_vol      0
sugar_content    0
varietal         0
country          0
region           0
score            0
wine_type        0
group            0
dtype: int64

### Dummy Encoding Categorical Features

In [102]:
# Dummy-encode wine type, group, country and region.
# dtype is pinned so the indicator columns are 0/1 on every pandas version
# (pandas >= 2.0 defaults to bool, which would be written to CSV as True/False).
lcbo = pd.get_dummies(
    lcbo,
    columns=['wine_type', 'group', 'country', 'region'],
    dtype='uint8',
)

# Dummy-encode varietal. A wine can list several varietals separated by '/',
# so each row may set more than one indicator.
#
# `str.get_dummies` splits on the separator and matches whole tokens, which
# avoids three problems of a per-varietal substring search:
#   * a varietal whose name is contained in a longer one is no longer flagged;
#   * columns come out sorted, so the column order is the same on every run
#     (iterating a set of strings is not);
#   * all indicator columns are built at once instead of being inserted one by
#     one, which fragments the DataFrame.
# Missing values yield an all-zero row.
varietal_dummies = lcbo['varietal'].str.get_dummies(sep='/').astype('uint8')

# Distinct varietal names found in the data (kept available for inspection).
varietals = set(varietal_dummies.columns)

# Replace the original text column with its indicator columns.
lcbo = pd.concat([lcbo.drop(columns=['varietal']), varietal_dummies], axis=1)

### Create TF-IDF for Description

In [103]:
# Build the TF-IDF matrix from the wine descriptions; missing descriptions are
# replaced with the literal string 'na' before vectorizing.
vec = TfidfVectorizer()
X = vec.fit_transform(lcbo['description'].fillna('na'))

# `get_feature_names()` was removed in scikit-learn 1.2. Use
# `get_feature_names_out()` where available and fall back for older versions.
try:
    feature_names = vec.get_feature_names_out()
except AttributeError:
    feature_names = vec.get_feature_names()

# Convert the TF-IDF matrix to a DataFrame and append it to lcbo. The frame is
# built on lcbo's own index so the column-wise concat aligns row for row even
# if the index is not the default 0..n-1 range.
# NOTE(review): vocabulary terms are added as unprefixed column names, so a
# term such as 'price' or 'name' would duplicate an existing column - verify
# how downstream readers handle that.
tfidf = pd.DataFrame(X.toarray(), columns=feature_names, index=lcbo.index)
lcbo = pd.concat([lcbo.drop(columns=['description']), tfidf], axis=1)

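### Reduce Dimensions with PCA

**TODO:** PCA is not implemented yet; the dataset is exported below without dimensionality reduction.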

### Export Dataset

In [104]:
# Write the engineered feature table to disk, without the row index.
output_path = '../data/processed/lcbo_sparse.csv'
lcbo.to_csv(output_path, index=False)