<a href="https://colab.research.google.com/github/swilsonmfc/nlp/blob/master/spaCyConciseConcept.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# spaCy and Concise Concepts

# Install

In [None]:
!pip install concise_concepts
!pip install gradio
!pip install pylighter

Collecting pylighter
  Downloading pylighter-0.0.3-py2.py3-none-any.whl (25 kB)
Collecting ipython>=7.18.1
  Downloading ipython-7.32.0-py3-none-any.whl (793 kB)
[K     |████████████████████████████████| 793 kB 9.3 MB/s 
Collecting prompt-toolkit!=3.0.0,!=3.0.1,<3.1.0,>=2.0.0
  Downloading prompt_toolkit-3.0.29-py3-none-any.whl (381 kB)
[K     |████████████████████████████████| 381 kB 56.0 MB/s 
Installing collected packages: prompt-toolkit, ipython, pylighter
  Attempting uninstall: prompt-toolkit
    Found existing installation: prompt-toolkit 1.0.18
    Uninstalling prompt-toolkit-1.0.18:
      Successfully uninstalled prompt-toolkit-1.0.18
  Attempting uninstall: ipython
    Found existing installation: ipython 5.5.0
    Uninstalling ipython-5.5.0:
      Successfully uninstalled ipython-5.5.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-co

# Setup

In [5]:
import pandas as pd

import spacy
from spacy import displacy
from spacy.scorer import Scorer
from spacy.training.example import Example

import concise_concepts

import gradio as gr
from IPython.core.display import display, HTML

from pylighter import Annotation

# Download Model

In [None]:
# Download the en_core_web_lg pipeline that later cells load with spacy.load().
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.2.0/en_core_web_lg-3.2.0-py3-none-any.whl (777.4 MB)
[K     |████████████████████████████████| 777.4 MB 6.6 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.2.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


# NER Tagging

In [11]:
nlp = spacy.load("en_core_web_lg")
doc = nlp("London Heathrow is a large airport in the United Kingom.")

# Document level: each entity span with its character offsets and label.
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)

# Token level: text, IOB tag and entity type of the first two tokens.
ent_london = [doc[0].text, doc[0].ent_iob_, doc[0].ent_type_]
ent_heathrow = [doc[1].text, doc[1].ent_iob_, doc[1].ent_type_]
print(ent_london)  # ['London', 'B', 'GPE']
print(ent_heathrow)  # ['Heathrow', 'B', 'GPE']

displacy.render(doc, style='ent', jupyter=True)

[('London', 0, 6, 'GPE'), ('Heathrow', 7, 15, 'GPE'), ('the United Kingom', 38, 55, 'GPE')]
['London', 'B', 'GPE']
['Heathrow', 'B', 'GPE']


## Entity Types
...

In [13]:
# List the (name, component) pairs that make up the loaded pipeline.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f3afbb7a8a0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x7f3afbb7a750>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f3afb9bbbd0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f3afc6c60f0>),
 ('lemmatizer',
  <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7f3afc653cd0>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x7f3afb9bb950>)]

# Construct Pipeline

In [None]:
# Example words for each custom concept label.
data = dict(
    fruit=['apple', 'pear', 'orange'],
    vegetable=['broccoli', 'spinach', 'tomato'],
    meat=['beef', 'pork', 'fish', 'lamb'],
)

# Reload a fresh pipeline and register the concise_concepts component,
# configured with the example words above.
nlp = spacy.load('en_core_web_lg')
nlp.add_pipe('concise_concepts', config={'data': data})

<concise_concepts.conceptualizer.ConceptualSpacy at 0x7f218786f6d0>

# Run Concise Concepts

In [None]:
text = """
       Heat the oil in a large pan and add the Onion, celery and carrots.
       Then, cook over a medium–low heat for 10 minutes, or until softened.
       Add the courgette, garlic, red peppers and oregano and cook for 2–3 minutes.
       Later, add some oranges and chickens.
       """

# One highlight colour per concept label; the same labels are passed as 'ents'
# so displaCy is told exactly which entity types to draw.
concept_colors = {'fruit': 'darkorange', 'vegetable': 'limegreen', 'meat': 'yellow'}
options = {'colors': concept_colors, 'ents': list(concept_colors)}

doc = nlp(text)
displacy.render(doc, style="ent", options=options, jupyter=True)

# Gradio

In [None]:
def respond(sentence):
  """
  Render the concept entities found in `sentence` as an HTML snippet.

  Runs the notebook-level `nlp` pipeline on the input text and returns the
  displaCy entity markup wrapped in a scrollable <div>, as a string.
  """
  doc = nlp(sentence)

  options = {'colors': {'fruit': 'darkorange', 
                        'vegetable': 'limegreen', 
                        'meat': 'yellow'},
            'ents': ['fruit', 'vegetable', 'meat']}

  # jupyter=False forces displaCy to hand back the markup as a string. Left to
  # auto-detect, it displays the HTML in the notebook and returns None when
  # running under Jupyter/Colab, which would break the concatenation below.
  markup = displacy.render(doc, style="ent", options=options, page=True,
                           jupyter=False)

  html = ("<div style='max-width:100%; max-height:360px; overflow:auto'>"
          + markup
          + "</div>")

  return html 

# Expose respond() as a text-in / HTML-out Gradio app and start it.
iface = gr.Interface(fn=respond, inputs='text', outputs=['html'])
iface.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
Running on public URL: https://25161.gradio.app

This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)


(<fastapi.applications.FastAPI at 0x7f2186a5afd0>,
 'http://127.0.0.1:7870/',
 'https://25161.gradio.app')

# Flagged Items

In [None]:
# List the files in the `flagged` directory.
!ls flagged

log.csv


In [None]:
# Read the whole flagged log first, then render its contents as HTML once the
# file handle has been released.
with open('flagged/log.csv') as log_file:
  flagged_log = log_file.read()

display(HTML(flagged_log))

# Scoring NER Models

In [9]:
def scores_to_frame(scores):
  """
  Convert NER evaluation scores to a dataframe.

  Parameters
  ----------
  scores : dict
      Must contain 'ents_p', 'ents_r' and 'ents_f' (overall precision, recall
      and F1) and may contain 'ents_per_type', a mapping of entity label to a
      dict with keys 'p', 'r' and 'f'. 'ents_per_type' may be missing or None
      (e.g. when nothing was scored), in which case only the TOTAL row is
      produced.

  Returns
  -------
  pandas.DataFrame
      One row per entity label plus a final 'TOTAL' row, with columns
      'Precision', 'Recall' and 'F1'.
  """
  columns = ['Precision', 'Recall', 'F1']

  # Treat an absent or None per-type breakdown as "no labels" instead of
  # failing on None.items().
  per_type = scores.get('ents_per_type') or {}

  # Collect every row first and build the frame once, rather than enlarging
  # an empty frame one .loc assignment at a time.
  rows = {label: [prf['p'], prf['r'], prf['f']] for label, prf in per_type.items()}
  rows['TOTAL'] = [scores['ents_p'], scores['ents_r'], scores['ents_f']]

  return pd.DataFrame.from_dict(rows, orient='index', columns=columns)

In [10]:
nlp = spacy.load('en_core_web_lg')

# Gold-standard data: (text, {"entities": [[start_char, end_char, label], ...]}).
examples = [
    ("Trump says he's answered Mueller's Russia inquiry questions \u2013 live",{"entities":[[0,5,"PERSON"],[25,32,"PERSON"],[35,41,"GPE"]]}),
    ("Alexander Zverev reaches ATP Finals semis then reminds Lendl who is boss",{"entities":[[0,16,"PERSON"],[55,60,"PERSON"]]}),
    ("Britain's worst landlord to take nine years to pay off string of fines",{"entities":[[0,7,"GPE"]]}),
    ("Tom Watson: people's vote more likely given weakness of May's position",{"entities":[[0,10,"PERSON"],[56,59,"PERSON"]]}),
]

# Pair a tokenized-only Doc with its reference annotations. These are the gold
# examples handed to nlp.evaluate, not predictions, and no separate Scorer
# instance is needed for that call.
gold_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in examples
]
scores = nlp.evaluate(gold_examples)
scores_to_frame(scores)


Unnamed: 0,Precision,Recall,F1
ORG,0.0,0.0,0.0
GPE,1.0,1.0,1.0
PERSON,1.0,0.5,0.666667
DATE,0.0,0.0,0.0
TOTAL,0.5,0.625,0.555556


# Annotating

In [4]:
# Labels offered by the annotation widget.
label_names = ['fruit', 'vegetable', 'meat']

# Sentences to annotate.
corpus = [
  'He ate the fresh fruit and loved the strawberry',
  'At one time the little boy despised beets',
  'Can I get two chicken sandwiches',
]

annotation = Annotation(corpus, labels_names=label_names)

HTML(value='Good job, you annotated <b>3</b>\n            documents ! Keep up the good work !')

In [9]:
# Print the contents of annotation.csv.
!more annotation.csv

document;labels
He ate the fresh fruit and loved the strawberry;['O', 'O', 'O', 'O', 'O', 'O', '
O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '
O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fru
it', 'I-fruit', 'I-fruit', 'I-fruit', 'I-fruit', 'I-fruit', 'I-fruit', 'I-fruit'
, 'I-fruit', 'I-fruit']
At one time the little boy despised beets;['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O
', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O
', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-vegetable', 'I
-vegetable', 'I-vegetable', 'I-vegetable', 'I-vegetable']
Can I get two chicken sandwiches;['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '
O', 'O', 'O', 'O', 'O', 'B-meat', 'I-meat', 'I-meat', 'I-meat', 'I-meat', 'I-mea
t', 'I-meat', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
