In [1]:
from pathlib import Path

import joblib
import pandas as pd

%load_ext nb_black

<IPython.core.display.Javascript object>

# Predict

### Read example JSON file

- Parse out all the features expected by the trained models
    - `category_level_1`: string category
    - `category_level_2`: string category
    - `regulated_product_name`: string
    - `ingredients`: list of strings. Join with '. '
    - `text`: concatenated from `description`, `regulated_product_name`, and `ingredients` with '. '
    - `storage_env`: string category
    - `pack_type`: string category
    - `cooking_type`: list of string categories taken from the product's cooking guidelines. If the product has no cooking guidelines, use the single-element list `['None']` instead

In [2]:
# Read the trial products file: a UTF-16 encoded JSON array of records
# (not JSON Lines).
df = pd.read_json(
    Path("data") / "trial-json-products.json",
    orient="records",
    encoding="utf-16",
    lines=False,
)

# Index the rows by `pvid` and order them by that index.
df = df.set_index("pvid").sort_index(ascending=True)

<IPython.core.display.Javascript object>

In [3]:
def _attributes(languages):
    """Return the "attributes" entry of the first grouping set of the first language."""
    return languages[0]["groupingSets"][0]["attributes"]


# Category levels 1 and 2 are the descriptions of the first two category entries.
df["category_level_1"] = df["categories"].apply(lambda cats: cats[0]["description"])
df["category_level_2"] = df["categories"].apply(lambda cats: cats[1]["description"])

df["regulated_product_name"] = df["languages"].apply(
    lambda langs: _attributes(langs)["regulatedProductName"]
)

# Ingredients arrive as a list of strings; collapse them into one string.
df["ingredients"] = df["languages"].apply(
    lambda langs: ". ".join(_attributes(langs)["ingredients"])
)

# Free-text feature: the non-missing parts of each row joined with ". ".
df["text"] = df[["description", "regulated_product_name", "ingredients"]].apply(
    lambda row: ". ".join(row.dropna()),
    axis=1,
)

# Storage and pack type: lookup value of the first entry of each list.
df["storage_env"] = df["languages"].apply(
    lambda langs: _attributes(langs)["storageType"][0]["lookupValue"]
)
df["pack_type"] = df["languages"].apply(
    lambda langs: _attributes(langs)["packType"][0]["lookupValue"]
)


def parse_cooking_guidelines(c):
    """Extract the cooking type names from one product's `languages` entry.

    Parameters
    ----------
    c : sequence
        The `languages` value of one product. Only the first language and
        its first grouping set are inspected.

    Returns
    -------
    list of str
        The `nameValue` of every cooking guideline, in source order
        (duplicates are kept). When the product has no cooking guidelines
        (the key is missing, its value is null, or the list is empty) the
        single-element list ``["None"]`` is returned, so every product
        carries at least one category.
    """
    try:
        guidelines = c[0]["groupingSets"][0]["attributes"]["cookingGuidelines"]
        # `or []` covers a JSON null, which would otherwise raise TypeError.
        names = [item["nameValue"] for item in guidelines or []]
    except KeyError:
        return ["None"]

    # An empty guideline list means the same as a missing key.
    return names or ["None"]


df["cooking_type"] = df["languages"].apply(parse_cooking_guidelines)

# Keep only the parsed feature columns, in this order, and discard the raw
# JSON columns they were derived from.
feature_columns = [
    "category_level_1",
    "category_level_2",
    "storage_env",
    "pack_type",
    "cooking_type",
    "text",
]
df = df[feature_columns]

df.head()

Unnamed: 0_level_0,category_level_1,category_level_2,storage_env,pack_type,cooking_type,text
pvid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6662781,Ready Made Foods,Meals,Chilled,Heat Sealed,[None],Taste Inc. 100% Chargrilled Chicken Fillet Spi...
8020100,Ready Made Foods,Snacks,Chilled,Tub,[None],Morrisons The Best Moroccan Inspired Couscous ...
8241820,Ready Made Foods,Meals,Chilled,Sleeve,"[Cooking Instructions, Microwave, Microwave, O...",Sainsbury's Classics Braised Beef & Mash 400g....


<IPython.core.display.Javascript object>

#### - Load label encoder to get label names

In [4]:
# Label encoder saved alongside the models; its `inverse_transform` is used
# later to turn predictions back into label names.
# To use the alternative model directory, replace "models-full" with
# "models-simple".
# NOTE: joblib.load unpickles the file, so only load model files you trust.
le = joblib.load(Path("sklearn") / "models-full" / "LabelEncoder.pkl")

<IPython.core.display.Javascript object>

#### - Load VotingClassifier
- Make predictions
- Transform labels back to original encoding

In [6]:
# Voting classifier used to make the predictions.
# To use the alternative model directory, replace "models-full" with
# "models-simple".
# NOTE: joblib.load unpickles the file, so only load model files you trust.
vc = joblib.load(Path("sklearn") / "models-full" / "VotingClassifier.pkl")

<IPython.core.display.Javascript object>

In [None]:
# Predict on the feature columns only. Dropping any existing "predict" column
# first keeps this cell safe to re-run: predictions from an earlier run are
# not passed back to the model as an extra input column. On the first run
# the column does not exist and errors="ignore" makes the drop a no-op.
df["predict"] = le.inverse_transform(
    vc.predict(df.drop(columns="predict", errors="ignore"))
)