In [None]:
import os

import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.gold import docs_to_json

# Data preparation

- Use `dtype` for performance
- `ingredients` is a string of ingredients delimited with `|`; replace the delimiter with `.`. Fill empty cells with `None` as the lack of ingredients is significant
- `cooking_type` is a string of cooking type categories delimited with `|`; replace the delimiter with `.`. Before replacing, fill empty cells with `None` as the lack of a cooking type is significant
- Some products have a duplicated `description`. To remove them, we set `pvid` as the index and sort it in ascending order, then drop rows with a duplicated `description`, keeping the one with the last (highest) `pvid` (i.e. the most recent product)
- Drop rows with any empty cell. `ingredients` and `cooking_type` empty cells are now `None` so will not be dropped
- Concatenate all columns using '. ' into new column `text`

In [None]:
# Load the labelled product sheet, clean it and build one `text` string per
# product from its feature columns.
df = pd.read_excel(
    os.path.join(
        'data',
        '200901_PHE_category_sheet.xlsx',
    ),
    usecols=[
        'lProductVersionID',
        'sDescription',
        'sCategoryLevel1',
        'sCategoryLevel2',
        'regulated_product_name',
        'ingredients',
        'storage_env',
        'pack_type',
        'cooking_type',
        'PHE_category_jan',
    ],
    dtype={
        'lProductVersionID': 'uint64',
        'sDescription': str,
        'sCategoryLevel1': 'category',
        'sCategoryLevel2': 'category',
        'regulated_product_name': str,
        'ingredients': str,
        'storage_env': 'category',
        'pack_type': 'category',
        'cooking_type': str,
        'PHE_category_jan': 'category',
    },
).rename(
    columns={
        'lProductVersionID': 'pvid',
        'sDescription': 'description',
        'sCategoryLevel1': 'category_level_1',
        'sCategoryLevel2': 'category_level_2',
        'PHE_category_jan': 'label',
    }
).assign(
    # regex=False: '|' is the literal delimiter, not regex alternation.
    ingredients=lambda df: df['ingredients'].str.replace(
        '|', '.', regex=False).fillna('None'),
    cooking_type=lambda df: df['cooking_type'].fillna('None').str.replace(
        '|', '.', regex=False),
).set_index(
    'pvid',
).sort_index(
    ascending=True,
).drop_duplicates(
    # Index is sorted ascending, so 'last' keeps the highest pvid.
    subset='description',
    keep='last',
).dropna(
    how='any',
).assign(
    # Join the feature columns only. `label` is the prediction target:
    # including it in `text` would leak the answer into the model input and
    # would make every text unique per label, so the later grouping by
    # `text` could never collect more than one label.
    # NOTE(review): `description` is part of the training text but is not
    # used when building `text` in the Predict section - confirm intended.
    text=lambda df: df.drop(columns='label').apply(
        '. '.join,
        axis=1,
    )
)

df.info()

#### - Get unique labels

In [None]:
# Distinct label values present in the cleaned data; convert_to_spacy uses
# them as the keys of each document's category dictionary.
labels = df['label'].unique()

#### - Concat labels for examples with multi-labels

In [None]:
# Collapse rows that share the same text into a single row whose
# `multilabel` cell holds the set of all labels seen for that text.
label_sets = df.groupby('text')['label'].apply(set)
df = label_sets.rename('multilabel').reset_index()

#### - Convert text and labels into a SpaCy compatible format

In [None]:
# Pretrained English pipeline; convert_to_spacy calls it to turn each text
# into a spaCy Doc.
nlp = spacy.load('en_core_web_sm')

In [None]:
def convert_to_spacy(s, labels):
    """
    Build the spaCy-compatible JSON record for a single row.

    `s` must provide a 'text' entry and a 'multilabel' collection. Every
    entry of `labels` becomes a category key scored 1.0 when it is in
    s['multilabel'] and 0.0 otherwise.
    """
    assigned = s['multilabel']

    # Parse the row text with the module-level pipeline, then attach the
    # per-label scores as the document categories.
    doc = nlp(s['text'])
    doc.cats = {name: float(name in assigned) for name in labels}

    return docs_to_json([doc])

In [None]:
# Convert every row (axis=1) into its spaCy JSON record and keep the result
# in a new `spacy` column.
df['spacy'] = df.apply(
    lambda s: convert_to_spacy(s, labels),
    axis=1,
)

#### - Split data 70/30 for train/val and save results into json files

In [None]:
def split_save_json(df, test_size, output_dir='data', random_state=42):
    """
    Split the `spacy` column into train/validation sets and save each set
    as a JSON file.

    The split is shuffled but not stratified.

    df: DataFrame with a `spacy` column holding one record per row
    test_size: forwarded to train_test_split (size of the validation set)
    output_dir: directory that receives 'dataset_spacy_train.json' and
        'dataset_spacy_val.json'
    random_state: seed forwarded to train_test_split
    """
    train, val = train_test_split(
        df['spacy'],
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    # Both splits are written the same way; only the file name differs.
    for records, filename in (
        (train, 'dataset_spacy_train.json'),
        (val, 'dataset_spacy_val.json'),
    ):
        records.to_json(
            os.path.join(output_dir, filename),
            orient='records',
        )

In [None]:
# Hold out 30% of the records for validation; the rest is used for training.
split_save_json(df, test_size=0.3)

# Training and Validation

- AUC ROC score:
    - Training: $100\%$
    - Testing: $97\%$
    
#### - Run in CLI

`python -m spacy train en training data/dataset_spacy_train.json data/dataset_spacy_val.json --base-model en_core_web_md --pipeline textcat --n-iter 30 --n-early-stopping 3 --version 1.0`

# Predict

### Read example JSON file

- Parse out all the features
    - `category_level_1`: string category
    - `category_level_2`: string category
    - `regulated_product_name`: string
    - `ingredients`: list of strings. Join with '.'
    - `storage_env`: string category
    - `pack_type`: string category
    - `cooking_type`: a list of categories that only exists if there are cooking types. If it does exist, concatenate items with '. ', otherwise, return 'None'
    - `text`: concatenated from all above features

In [None]:
# Read the example products file and order the rows by `pvid`.
products_path = os.path.join('data', 'trial-json-products.json')

df = pd.read_json(
    products_path,
    orient='records',
    encoding='utf-16',
    lines=False,
)
df = df.set_index('pvid').sort_index(ascending=True)

In [None]:
# Category descriptions: the first entry is used as level 1, the second as
# level 2.
categories = df['categories']
df['category_level_1'] = categories.apply(lambda c: c[0]['description'])
df['category_level_2'] = categories.apply(lambda c: c[1]['description'])

# All remaining features are read from the attributes of the first grouping
# set of the first language entry, so look that dictionary up once.
attributes = df['languages'].apply(
    lambda c: c[0]['groupingSets'][0]['attributes']
)

df['regulated_product_name'] = attributes.apply(
    lambda a: a['regulatedProductName']
)
df['ingredients'] = attributes.apply(
    lambda a: '.'.join(a['ingredients'])
)
df['storage_env'] = attributes.apply(
    lambda a: a['storageType'][0]['lookupValue']
)
df['pack_type'] = attributes.apply(
    lambda a: a['packType'][0]['lookupValue']
)


def parse_cooking_guidelines(c):
    """
    Return the distinct cooking guideline names joined with '. ', or the
    string 'None' when the product has no cooking guidelines.

    `c` is indexed as c[0]['groupingSets'][0]['attributes']; the optional
    'cookingGuidelines' entry is iterated and each item's 'nameValue' is
    collected. Names keep the order in which they first appear.
    """
    try:
        guidelines = [
            item['nameValue']
            for item in c[0]['groupingSets'][0]['attributes']
            ['cookingGuidelines']
        ]
    except KeyError:
        return 'None'

    # dict.fromkeys removes duplicates while keeping first-seen order.
    # Joining a set would produce an order that can change between
    # interpreter runs, making the generated text non-reproducible.
    unique_guidelines = list(dict.fromkeys(guidelines))

    # An empty list means "no cooking type", same as a missing key.
    if not unique_guidelines:
        return 'None'

    return '. '.join(unique_guidelines)


df['cooking_type'] = df['languages'].apply(
    parse_cooking_guidelines
)

# Keep only the feature columns. `.copy()` makes the result an independent
# frame, so the `text` assignment below does not write to a slice of the
# original (which raises SettingWithCopyWarning).
df = df[[
    'category_level_1',
    'category_level_2',
    'regulated_product_name',
    'ingredients',
    'storage_env',
    'pack_type',
    'cooking_type',
]].copy()

# Join the non-missing feature values of each row into one string.
df['text'] = df.apply(
    lambda s: '. '.join(s[s.notna()]),
    axis=1,
)

#### - Load best trained model

In [None]:
# Replace the pipeline with the model stored under training/model-best.
best_model_dir = os.path.join('training', 'model-best')
nlp = spacy.load(best_model_dir)

#### - Get the category with the highest score

In [None]:
def predict(text):
    """
    Run the loaded pipeline on `text` and return the category key with the
    highest score.
    """
    scores = nlp(text).cats
    return max(scores, key=scores.get)


# Store the top-scoring category for every product text.
df['predict'] = df['text'].apply(predict)