In [13]:
from pathlib import Path

import pandas as pd
import spacy

%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

# Install trained models

- In practice, this should be done in the Dockerfile or during dev/prod environment setup.

In [None]:
!pip install spacy\results\packages-full\en_textcat-2.0.0.tar.gz

In [None]:
!pip install spacy\results\packages-simple\en_textcat_simple-2.0.0.tar.gz

# Predict

### Read example JSON file

- Parse out all the features
    - `category_level_1`: string category
    - `category_level_2`: string category
    - `regulated_product_name`: string
    - `ingredients`: list of strings. Join with '. '
    - `storage_env`: string category
    - `pack_type`: string category
    - `cooking_type`: a list of categories that is only present when the product has cooking guidelines. If present, join the unique items with '. '; otherwise use the string 'None'
    - `text`: all of the above features concatenated into a single string, joined with '. '

In [None]:
# Read the example product records (UTF-16 encoded JSON array), then index
# the frame by `pvid` and sort it ascending on that index.
products_path = Path('data') / 'trial-json-products.json'

df = pd.read_json(
    products_path,
    orient='records',
    encoding='utf-16',
    lines=False,
)
df = df.set_index('pvid').sort_index(ascending=True)

In [None]:
# Flatten the nested product JSON into one plain-string column per feature.
# The category columns come from the first two entries of `categories`; all
# other features are read from the first grouping set of the first language.


def _first_attributes(languages):
    """Return the `attributes` mapping of the first grouping set of the first language entry."""
    return languages[0]['groupingSets'][0]['attributes']


df['category_level_1'] = df['categories'].apply(
    lambda cats: cats[0]['description']
)

df['category_level_2'] = df['categories'].apply(
    lambda cats: cats[1]['description']
)

df['regulated_product_name'] = df['languages'].apply(
    lambda langs: _first_attributes(langs)['regulatedProductName']
)

# Ingredients are joined with '. ' (period + space), the same separator used
# for `cooking_type` and for the final `text` column. The previous '.' glued
# adjacent ingredients together with no whitespace between them.
# NOTE(review): confirm the training-time preprocessing used '. ' as well.
df['ingredients'] = df['languages'].apply(
    lambda langs: '. '.join(_first_attributes(langs)['ingredients'])
)

# storageType / packType are lists; only the first entry's lookup value is used.
df['storage_env'] = df['languages'].apply(
    lambda langs: _first_attributes(langs)['storageType'][0]['lookupValue']
)

df['pack_type'] = df['languages'].apply(
    lambda langs: _first_attributes(langs)['packType'][0]['lookupValue']
)


def parse_cooking_guidelines(c):
    """Build the `cooking_type` feature from one product's `languages` value.

    Parameters
    ----------
    c : list
        The `languages` value of a product; the guidelines are read from
        ``c[0]['groupingSets'][0]['attributes']['cookingGuidelines']``.

    Returns
    -------
    str
        The unique `nameValue` entries joined with '. ' in first-seen order,
        or the string 'None' when `cookingGuidelines` (or any other key on
        the lookup path) is missing.
    """
    try:
        guidelines = c[0]['groupingSets'][0]['attributes']['cookingGuidelines']
        names = [item['nameValue'] for item in guidelines]
    except KeyError:
        return 'None'

    # dict.fromkeys de-duplicates while keeping first-seen order. A plain set
    # iterates strings in an order that can change between interpreter runs,
    # which made the generated text (and thus the model input) unreproducible.
    return '. '.join(dict.fromkeys(names))


df['cooking_type'] = df['languages'].apply(parse_cooking_guidelines)

# Columns that make up the model input, in the order they are concatenated.
FEATURE_COLUMNS = [
    'category_level_1',
    'category_level_2',
    'regulated_product_name',
    'ingredients',
    'storage_env',
    'pack_type',
    'cooking_type',
]

# Take an explicit copy: assigning a new column to a bare column subset
# raises SettingWithCopyWarning on pandas versions without copy-on-write.
df = df[FEATURE_COLUMNS].copy()

# Build the single text input by joining each row's non-null features.
df['text'] = df.apply(
    lambda row: '. '.join(row[row.notna()]),
    axis=1,
)

#### - Load best trained model

In [None]:
# Name of the installed model package to load; set this to
# 'en_textcat_simple' to use the simple package instead.
MODEL_NAME = 'en_textcat'
nlp = spacy.load(MODEL_NAME)

#### - Get the category with the highest score

In [None]:
def predict(text):
    """Return the category label with the highest score for `text`.

    The text is run through the loaded `nlp` pipeline and the key of
    `doc.cats` with the largest value is returned.
    """
    scores = nlp(text).cats
    return max(scores, key=scores.get)

In [None]:
# Label every row with its top-scoring category, then preview the result.
df['predict'] = df['text'].map(predict)
df.head()