In [1]:
from IPython.display import display, Markdown

# Render the annotated training utterances as Markdown so the reader can see
# the examples the model is trained on.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system.
with open("restaurent_search.md", "r", encoding="utf-8") as f:
    display(Markdown(f.read()))

## restaurant_search
- i'm looking for a place to eat
- I want to grab lunch
- I am searching for a dinner spot
- i'm looking for a place in the [north](location) of town
- show me [chinese](cuisine) restaurants
- show me [chines](cuisine:chinese) restaurants in the [north](location)
- show me a [mexican](cuisine) place in the [centre](location)
- i am looking for an [indian](cuisine) spot called olaolaolaolaolaola
- search for restaurants
- anywhere in the [west](location)
- anywhere near [18328](location)
- I am looking for [asian fusion](cuisine) food
- I am looking a restaurant in [29432](location)
- I am looking for [mexican indian fusion](cuisine)
- [central](location) [indian](cuisine) restaurant

In [2]:
from spacy_crfsuite import read_file

# Parse the Markdown training file into a list of example dicts. As the output
# shows, each example has the plain 'text' plus an 'entities' list with
# character 'start'/'end' offsets, the entity 'value' and its 'entity' label.
train_data = read_file("restaurent_search.md")
train_data

[{'text': "i'm looking for a place to eat", 'entities': []},
 {'text': 'I want to grab lunch', 'entities': []},
 {'text': 'I am searching for a dinner spot', 'entities': []},
 {'text': "i'm looking for a place in the north of town",
  'entities': [{'start': 31,
    'end': 36,
    'value': 'north',
    'entity': 'location'}]},
 {'text': 'show me chinese restaurants',
  'entities': [{'start': 8,
    'end': 15,
    'value': 'chinese',
    'entity': 'cuisine'}]},
 {'text': 'show me chines restaurants in the north',
  'entities': [{'start': 8,
    'end': 14,
    'value': 'chinese',
    'entity': 'cuisine'},
   {'start': 34, 'end': 39, 'value': 'north', 'entity': 'location'}]},
 {'text': 'show me a mexican place in the centre',
  'entities': [{'start': 10,
    'end': 17,
    'value': 'mexican',
    'entity': 'cuisine'},
   {'start': 31, 'end': 37, 'value': 'centre', 'entity': 'location'}]},
 {'text': 'i am looking for an indian spot called olaolaolaolaolaola',
  'entities': [{'start': 20,
  

In [3]:
import spacy

from spacy_crfsuite.tokenizer import SpacyTokenizer
from spacy_crfsuite.train import gold_example_to_crf_tokens

# Load the small English pipeline without its built-in NER component and wrap
# it in the tokenizer used by spacy_crfsuite.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
tokenizer = SpacyTokenizer(nlp)

# Turn every gold example into its sequence of CRF tokens.
train_dataset = [gold_example_to_crf_tokens(sample, tokenizer=tokenizer) for sample in train_data]

# Show the tokens of the first example (text, POS tag, entity label, shape).
train_dataset[0]



[CRFToken(text='i', tag='PRP', entity='O', shape='x', pattern={}, dense_features=[]),
 CRFToken(text="'m", tag='VBP', entity='O', shape="'x", pattern={}, dense_features=[]),
 CRFToken(text='looking', tag='VBG', entity='O', shape='xxxx', pattern={}, dense_features=[]),
 CRFToken(text='for', tag='IN', entity='O', shape='xxx', pattern={}, dense_features=[]),
 CRFToken(text='a', tag='DT', entity='O', shape='x', pattern={}, dense_features=[]),
 CRFToken(text='place', tag='NN', entity='O', shape='xxxx', pattern={}, dense_features=[]),
 CRFToken(text='to', tag='TO', entity='O', shape='xx', pattern={}, dense_features=[]),
 CRFToken(text='eat', tag='VB', entity='O', shape='xxx', pattern={}, dense_features=[])]

In [4]:
import srsly

# Load the extractor configuration from JSON. As the output shows, it holds
# three feature-name lists (presumably for the previous, current and next
# token -- TODO confirm) plus the `c1` and `c2` coefficients (presumably the
# L1/L2 regularisation weights -- TODO confirm).
component_config = srsly.read_json("default-config.json")
component_config

{'features': [['low', 'title', 'upper'],
  ['low',
   'bias',
   'prefix5',
   'prefix2',
   'suffix5',
   'suffix3',
   'suffix2',
   'upper',
   'title',
   'digit'],
  ['low', 'title', 'upper']],
 'c1': 0.003,
 'c2': 0.03}

In [5]:
from spacy_crfsuite import CRFExtractor

# Create the CRF extractor from the loaded configuration; it is tuned and
# trained in the following cell.
crf_extractor = CRFExtractor(component_config=component_config)
crf_extractor

<spacy_crfsuite.crf_extractor.CRFExtractor at 0x12e12a048>

In [6]:
# Hyper-parameter search over the training set: n_iter=50 candidates with
# cv=5 folds (the log reports 250 fits). random_state=42 is passed for
# reproducibility (presumably it seeds the search -- TODO confirm).
rs = crf_extractor.fine_tune(train_dataset, cv=5, n_iter=50, random_state=42)
print("best_params:", rs.best_params_, ", score:", rs.best_score_)
# Fit the extractor on the full training set.
# NOTE(review): confirm that train() uses the c1/c2 values found by fine_tune().
crf_extractor.train(train_dataset)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.4s


best_params: {'c1': 0.4487523606548803, 'c2': 0.0023781924878204276} , score: 0.22222222222222224


[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed:    2.4s finished


<spacy_crfsuite.crf_extractor.CRFExtractor at 0x12e12a048>

In [7]:
# Presumably requires scikit-learn; install it with `%pip install scikit-learn`
# (the `sklearn` name on PyPI is a deprecated alias and should not be used).

# NOTE: the model is scored on the same `train_dataset` it was fitted on, so
# the report below does not measure generalisation to unseen text.
classification_report = crf_extractor.eval(train_dataset)
# Element 1 of the result is the printable per-label report.
print(classification_report[1])

              precision    recall  f1-score   support

   B-cuisine      1.000     1.000     1.000         2
   I-cuisine      1.000     1.000     1.000         1
   L-cuisine      1.000     1.000     1.000         2
   U-cuisine      1.000     1.000     1.000         5
  U-location      1.000     1.000     1.000         7

   micro avg      1.000     1.000     1.000        17
   macro avg      1.000     1.000     1.000        17
weighted avg      1.000     1.000     1.000        17





In [8]:
# Print the extractor's explanation text; as the output shows, it lists the
# most likely label transitions and the highest-weighted positive features.
print(crf_extractor.explain())

Most likely transitions:
O          -> O          1.637338
B-cuisine  -> I-cuisine  1.373766
U-cuisine  -> O          1.306077
I-cuisine  -> L-cuisine  0.915989
O          -> U-location 0.751463
B-cuisine  -> L-cuisine  0.698893
O          -> U-cuisine  0.480360
U-location -> U-cuisine  0.403487
O          -> B-cuisine  0.261450
L-cuisine  -> O          0.182695

Positive features:
1.976502 O          0:bias:bias
1.957180 U-location -1:low:the
1.216547 B-cuisine  -1:low:for
1.153924 U-location 0:prefix5:centr
1.153924 U-location 0:prefix2:ce
1.110536 U-location 0:digit
1.058294 U-cuisine  0:prefix5:chine
1.058294 U-cuisine  0:prefix2:ch
1.051457 U-cuisine  0:suffix2:an
0.999976 U-cuisine  -1:low:me


In [9]:
# Run the trained extractor directly on a raw message dict.
example = {"text": "show mexican restaurents up north"}
# The return value is discarded, so tokenize() presumably attaches the tokens
# to `example` in place for process() to read -- TODO confirm.
tokenizer.tokenize(example, attribute="text")
# Displays the extracted entities with character offsets and a confidence.
crf_extractor.process(example)

[{'start': 5,
  'end': 12,
  'value': 'mexican',
  'entity': 'cuisine',
  'confidence': 0.6679336521223249},
 {'start': 28,
  'end': 33,
  'value': 'north',
  'entity': 'location',
  'confidence': 0.946416065624759}]

In [10]:
import spacy

from spacy_crfsuite import CRFEntityExtractor

# Wrap the trained CRF extractor as a spaCy pipeline component.
pipe = CRFEntityExtractor(nlp, crf_extractor=crf_extractor)

# Register the component under an explicit name, replacing any copy left over
# from an earlier run of this cell. add_pipe() raises ValueError when the name
# is already present in the pipeline, so a bare `nlp.add_pipe(pipe)` fails the
# second time the cell is executed in the same kernel.
CRF_PIPE_NAME = "crf_ner"
if CRF_PIPE_NAME in nlp.pipe_names:
    nlp.remove_pipe(CRF_PIPE_NAME)
nlp.add_pipe(pipe, name=CRF_PIPE_NAME)

# Entities predicted by the CRF are now exposed through doc.ents.
doc = nlp("show mexican restaurents up north")
for ent in doc.ents:
    print(ent.text, "--", ent.label_)

mexican -- cuisine
north -- location


In [11]:
# Save model to disk ...
# crf_extractor.to_disk()