## Sample Code: Named Entity Recognition with a Pretrained spaCy Pipeline

In [2]:
import spacy

In [1]:
# Download the large English pipeline through spaCy's CLI (`!` runs the command in a shell).
# The log below shows a ~588 MB wheel; this is the package loaded later with spacy.load("en_core_web_lg").
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl (587.7 MB)
                                              0.0/587.7 MB ? eta -:--:--
                                              0.1/587.7 MB 1.6 MB/s eta 0:06:08
                                              0.2/587.7 MB 2.8 MB/s eta 0:03:34
                                              0.4/587.7 MB 3.4 MB/s eta 0:02:51
                                              0.4/587.7 MB 3.4 MB/s eta 0:02:51
                                              0.7/587.7 MB 3.6 MB/s eta 0:02:41
                                              0.8/587.7 MB 3.8 MB/s eta 0:02:34
                                              1.0/587.7 MB 3.8 MB/s eta 0:02:36
                                              1.2/587.7 MB 4.0 MB/s eta 0:02:26
                                              1.4/587.7 MB 4.2 MB/s eta 0:02:22
                              


[notice] A new release of pip is available: 23.1.2 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
# Download the small English pipeline (12.8 MB per the log below).
# NOTE(review): no cell visible in this notebook loads en_core_web_sm — confirm it is still needed.
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Load the downloaded large English pipeline; `nlp` is the callable used to process text below.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Show the pipeline object's repr (a spacy.lang.en.English instance).
nlp

<spacy.lang.en.English at 0x79461075f580>

In [None]:
# Run the pipeline on a sample sentence to obtain a processed document.
doc = nlp("Donald Trump was President of USA")

In [None]:
# The document's repr is its original text.
doc

Donald Trump was President of USA

In [None]:
# Confirm that calling the pipeline returns a spacy.tokens.doc.Doc.
type(doc)

spacy.tokens.doc.Doc

In [None]:
# Named entities the pipeline found in the sentence, returned as a tuple.
doc.ents

(Donald Trump, USA)

In [None]:
# Inspect the second entity alongside its Python type.
doc.ents[1], type(doc.ents[1])

In [None]:
# Render the recognized entities inline in the notebook using displaCy's "ent" style.
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

## Category NER Model for Colors


In [141]:
import json

# Load the annotated search queries used as NER training data.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform's default text encoding.
with open('color_search.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [142]:
# Peek at the annotations of the second search entry; each annotation carries
# character offsets (`start`, `end`), a `tag_name`, and a `value` string.
data['searches'][1]['annotations']

[{'start': 0, 'end': 4, 'tag_name': 'Color.Red', 'value': 'ruby'}]

In [143]:
# Convert every annotated search into a {'text', 'entities'} record, where
# each entity is a (start, end, LABEL) tuple and the label is the upper-cased tag name.
training_data = [
    {
        'text': search['content'],
        'entities': [
            (ann['start'], ann['end'], ann['tag_name'].upper())
            for ann in search['annotations']
        ],
    }
    for search in data['searches']
]

# Spot-check one converted record.
print(training_data[2])

{'text': 'scarlet', 'entities': [(0, 7, 'COLOR.RED')]}


In [144]:
from spacy.tokens import DocBin
from tqdm import tqdm

# Rebinds `nlp` to a blank English pipeline, replacing the en_core_web_lg pipeline
# loaded earlier; below it is used to build Docs from raw training text (nlp.make_doc).
nlp = spacy.blank("en")
# Container that collects the annotated Docs before they are written to disk.
doc_bin = DocBin()

In [145]:
from spacy.util import filter_spans

# Turn each training record into a Doc whose entities come from the
# character-offset annotations, then collect the Docs in `doc_bin`.
for example in tqdm(training_data):
    doc = nlp.make_doc(example['text'])
    spans = []
    for start, end, label in example['entities']:
        # char_span yields None when the offsets cannot be mapped onto tokens.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is not None:
            spans.append(span)
            continue
        print("Skipping entity")

    # filter_spans resolves overlapping spans before they are assigned as entities.
    doc.ents = filter_spans(spans)
    doc_bin.add(doc)

# Persist the annotated Docs in spaCy's binary training format.
doc_bin.to_disk("train.spacy")


100%|██████████| 8/8 [00:00<00:00, 4001.24it/s]


In [146]:
# Expand base_config.cfg into a complete training config written to config.cfg.
# base_config.cfg is read as input, so it must already exist in the working directory.
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [147]:
# Train the pipeline described by config.cfg and write the results to the current directory.
# NOTE(review): --paths.dev points at the same file as --paths.train, so the ENTS_* scores
# in the log are measured on the training data itself, not on held-out examples.
!python -m spacy train config.cfg --output ./ --paths.train ./train.spacy --paths.dev ./train.spacy

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     13.22   43.48   35.71   55.56    0.43
200     200         19.47    612.19   90.00   81.82  100.00    0.90
400     400         15.98    440.68   88.89   88.89   88.89    0.89
600     600         16.31    446.98   88.89   88.89   88.89    0.89
800     800         17.34    434.49   88.89   88.89   88.89    0.89
1000    1000         12.93    437.38   87.50  100.00   77.78    0.88
1200    1200         11.97    418.55   87.50  100.00   77.78    0.88
1400    1400         21.88    427.53   88.89   88.89   88.89    0.89
1600    1600         15.88    426.62   88.89   88.89   88.89    0.89
1800    1800         16.58    431.38   87.50  1

[2023-08-15 15:04:45,749] [INFO] Set up nlp object from config
[2023-08-15 15:04:45,773] [INFO] Pipeline: ['tok2vec', 'ner']
[2023-08-15 15:04:45,779] [INFO] Created vocabulary
[2023-08-15 15:04:45,780] [INFO] Finished initializing nlp object
[2023-08-15 15:04:45,921] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [154]:
# Load the trained pipeline from the "model-best" directory
# (presumably written by the training run above — confirm).
nlp_ner = spacy.load("model-best")

In [None]:
# Export the trained pipeline to disk so it can be reloaded later with spacy.load(output_dir).
# NOTE(review): the leading "/" makes this an absolute path at the filesystem root —
# confirm that location is intended and writable.
output_dir = "/exported_model"
nlp_ner.to_disk(output_dir)

In [161]:
from spacy import displacy

# Run the custom NER pipeline on a sample search query.
doc = nlp_ner("scarlet womens dress and red")

# Highlight colour (hex) to use for each entity label in the rendering.
colors = {"COLOR.RED": "#f67de3", "PRODUCT.DRESS": "#7df6d9"}
options = {"colors": colors}

# Call displaCy through the explicitly imported `displacy` module, matching the
# earlier visualization cell instead of reaching through the `spacy` package attribute.
displacy.render(doc, style="ent", options=options, jupyter=True)

In [150]:
# Entities recognized in `doc`.
# NOTE(review): the saved output lists "ruby", which does not occur in the text
# processed in the cell above, so it looks stale — re-run to refresh.
doc.ents

(womens, dress, ruby, dress, women)

In [17]:
# Women: Jeans, Blouse, Dress 4 to 18
# Men: Shirt, T-shirt, Suit 5 to 18
# Boys: Shirt, T-shirt, Jeans 2T to Y18
# Girls: Shirt, Dress, Jeans

ruby dress