In [None]:
from pathlib import Path

import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from spacy.training import docs_to_json

# Load the nb_black notebook extension (code-cell formatter)
%load_ext nb_black

# Shared spaCy pipeline, loaded once; `convert_to_spacy` below calls it
# to turn each example's text into a Doc
nlp = spacy.load('en_core_web_sm')

# Data preparation

- Use `dtype` for performance
- `ingredients` is a string of ingredients delimited with `|`; replace the delimiter with `.`. Fill empty cells with `None`, as the lack of ingredients is significant
- `cooking_type` is a string of cooking type categories delimited with `|`; replace the delimiter with `.`. Before replacing, fill empty cells with `None`, as the lack of a cooking type is significant
- Some products have a duplicated `description`. To remove them, we set `pvid` as the index and sort it in ascending order, then drop rows with a duplicated `description`, keeping the one with the last `pvid` (i.e. the most recent product)
- Drop rows with any empty cell. `ingredients` and `cooking_type` empty cells are now `None` so will not be dropped
- Concatenate all columns using '. ' into new column `text`

In [None]:
def process_file(file: str) -> pd.DataFrame:
    """Load a product spreadsheet from ``data/`` and prepare it for training.

    Steps, in order:
      1. Read only the needed columns with explicit dtypes.
      2. Rename the source columns to short snake_case names
         (``PHE_category`` becomes ``label``).
      3. Replace the ``|`` delimiter in ``ingredients`` and ``cooking_type``
         with ``.`` and fill missing values with the string ``"None"``;
         lower-case ``label``.
      4. Index by ``pvid``, sort ascending and, for duplicated
         ``description`` values, keep the row with the last ``pvid``.
      5. Drop rows that still contain a missing value.
      6. Build ``text`` by joining every feature column with ``". "``.
         ``label`` is excluded so the target does not leak into the text
         the classifier is trained on.

    Args:
        file (str): file name of the Excel workbook inside the ``data`` folder

    Returns:
        pd.DataFrame: processed dataFrame indexed by ``pvid``, holding the
            renamed columns plus ``text``
    """
    return (
        pd.read_excel(
            Path(
                "data",
                file,
            ),
            usecols=[
                "lProductVersionID",
                "sDescription",
                "sCategoryLevel1",
                "sCategoryLevel2",
                "regulated_product_name",
                "ingredients",
                "storage_env",
                "pack_type",
                "cooking_type",
                "PHE_category",
            ],
            dtype={
                "lProductVersionID": "uint64",
                "sDescription": str,
                "sCategoryLevel1": "category",
                "sCategoryLevel2": "category",
                "regulated_product_name": str,
                "ingredients": str,
                "storage_env": "category",
                "pack_type": "category",
                "cooking_type": str,
                "PHE_category": "category",
            },
        )
        .rename(
            columns={
                "lProductVersionID": "pvid",
                "sDescription": "description",
                "sCategoryLevel1": "category_level_1",
                "sCategoryLevel2": "category_level_2",
                "PHE_category": "label",
            }
        )
        .assign(
            # missing values become the literal string "None" so that these
            # rows survive the dropna() further down
            ingredients=lambda df: df["ingredients"]
            .str.replace("|", ".", regex=False)
            .fillna("None"),
            cooking_type=lambda df: df["cooking_type"]
            .fillna("None")
            .str.replace("|", ".", regex=False),
            label=lambda df: df["label"].str.lower(),
        )
        .set_index(
            "pvid",
        )
        .sort_index(
            ascending=True,
        )
        # after the ascending sort, keep="last" retains the highest pvid
        .drop_duplicates(
            subset="description",
            keep="last",
        )
        .dropna(
            how="any",
        )
        .assign(
            # join the feature columns only: including `label` here would
            # put the target inside the classifier input (label leakage)
            text=lambda df: df.drop(columns="label").apply(
                ". ".join,
                axis=1,
            )
        )
    )


## Read Data
- Get unique labels
- Concat labels for examples with multi-labels

In [None]:
# Load and clean the full product sheet, then print its column summary
df = process_file("210714_updated_clean_sheet.xlsx")
df.info()

# label frequencies, evaluated for interactive inspection
df["label"].value_counts().to_dict()

# fold the low-frequency labels into a single "other" class
df["label"] = df["label"].replace(
    {
        "other_sauces": "other",
        "cream_alternative": "other",
    }
)

labels = df["label"].unique()

In [None]:
# Load and clean the simplified product sheet, then print its column summary
df = process_file("210714_updated_clean_sheet-simple.xlsx")
df.info()

# label frequencies, evaluated for interactive inspection
df["label"].value_counts().to_dict()

# fold the low-frequency labels into a single "other" class
df["label"] = df["label"].replace(
    {
        "oils": "other",
        "prepared_soups": "other",
    }
)

labels = df["label"].unique()

#### - Convert text and labels into a spaCy-compatible format

In [None]:
def convert_to_spacy(s, labels):
    """
    Build the spaCy JSON training record for a single example

    `s` must provide "text" and "multilabel"; every entry of `labels` gets a
    score of 1.0 when it is contained in s["multilabel"] and 0.0 otherwise.
    Returns the result of `docs_to_json` for the one annotated doc.
    """
    # docs_to_json reads the categories from doc.cats, a label -> score dict
    cats = {}
    for name in labels:
        cats[name] = float(name in s["multilabel"])

    annotated = nlp(s["text"])
    annotated.cats = cats

    return docs_to_json([annotated])

In [None]:
# concat labels for examples with multi-labels
df = (
    df.groupby("text")["label"]
    .apply(set)
    .reset_index()
    .rename(columns={"label": "multilabel"})
)

# for binary classification, resolve/drop ambiguous rows with multi-lables
len(df[df["multilabel"].apply(len) > 1])
df = df[df["multilabel"].apply(len) == 1]

df['spacy'] = df.apply(
    lambda s: convert_to_spacy(s, labels),
    axis=1,
)

#### - Split data 70/30 for train/val and save results into json files

In [None]:
def split_save_json(df, test_size):
    """
    Split the spaCy-formatted examples into train/validation parts and save them as json

    The `spacy` column of `df` is shuffled with a fixed seed (42) and split by
    `train_test_split`; `test_size` controls the share that goes to validation.
    The split is NOT stratified by label.

    The train part is written to spacy/assets/train.json and the validation
    part to spacy/assets/dev.json, both with orient="records". The output
    folder is created when it does not exist yet.

    Args:
        df: DataFrame with a `spacy` column holding the records to save
        test_size: forwarded to `train_test_split` (e.g. 0.3 for a 70/30 split)
    """
    train, val = train_test_split(
        df["spacy"],
        test_size=test_size,
        random_state=42,
        shuffle=True,
    )

    out_dir = Path("spacy", "assets")
    # create the target folder up front so a missing directory does not
    # make the write fail after the split has already been computed
    out_dir.mkdir(parents=True, exist_ok=True)

    for part, file_name in ((train, "train.json"), (val, "dev.json")):
        part.to_json(
            out_dir / file_name,
            orient="records",
        )

In [None]:
# test_size=0.3: 30% of the examples go to the validation file (dev.json)
split_save_json(df, test_size=0.3)

# Training and Validation

- AUC ROC score:
    - Training: $100\%$
    - Testing: $99\%$
    

In [None]:
# shell out to the spaCy CLI and run the project's `all` target
!spacy project run all