In [1]:
import spacy
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_md')

doc = nlp('I have a Golden Retriever at my house')

<img src='img/pipe.png'>

In [5]:
print(nlp.pipe_names)

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [6]:
@nlp.component('length_component')
def length_custom_component(doc):
    print('Doc length:', len(doc))
    return doc

nlp.add_pipe('length_component', first=True)

<function __main__.length_custom_component(doc)>

<img src='img/component.png'>

In [7]:
print('New pipeline:', nlp.pipe_names)

New pipeline: ['length_component', 'tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


In [8]:
doc = nlp('Have you ever seen the rain?')

Doc length: 7


In [9]:
nlp.remove_pipe('length_component')

('length_component', <function __main__.length_custom_component(doc)>)

In [10]:
print('After removal:', nlp.pipe_names)

After removal: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']


# Statistical predictions X Rules

<img src='img/statXrule.png'>

## How training works

<img src='img/training.png'>

- Initialize the model weights randomly with <i>nlp.begin_training</i>
- Predict a few examples with the current weights by calling <i>nlp.update</i>
- Compare prediction with true labels
- Calculate how to change weights to improve predictions
- Update weights slightly
- Go back to 2

## Training the entity recognizer

- Each token can only be part of one entity
- Examples need to come with an example


<img src='img/ex1.png'>

Goal: <b><i>teach the model to generalize</i></b>

<img src='img/ex2.png'>

In [3]:
from spacy.matcher import Matcher

In [4]:
proglang_patterns = [
        [{'LOWER': 'objective'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'c'}],
        [{'LOWER': 'objectivec'}],
        [{'LOWER': 'python'}],
        [{'LOWER': {'IN': ['js', 'javascript']}}],
        [{'LOWER': 'css'}],
        [{'LOWER': 'golang'}],
        [{'LOWER': 'c'}],
        [{'LOWER': 'c++'}],
        [{'LOWER': 'ruby'}],
        [{'LOWER': 'java'}],
        [{'LOWER': 'php'}],
    ]
matcher = Matcher(nlp.vocab)
matcher.add('proglangs', [*proglang_patterns])

In [11]:
def parse_train_data (doc):
    detections = [(doc[start:end].start_char, doc[start:end].end_char, 'PROGLANG') for idx, start, end in matcher(doc)]
    return (doc.text, {'entities': detections})

In [12]:
parse_train_data(nlp('I used to program in Java, nowaday I program in C and Python'))

('I used to program in Java, nowaday I program in C and Python',
 {'entities': [(21, 25, 'PROGLANG'),
   (48, 49, 'PROGLANG'),
   (54, 60, 'PROGLANG')]})

In [13]:
import pandas as pd
df = (pd.read_excel('labels.xlsx', usecols=['Label', 'Title']))

In [14]:
titles = df.loc[:544]['Title']

In [15]:
train_data = [parse_train_data(d) for d in nlp.pipe(titles) if len(matcher(d))==1]
train_data[:4]

[('How do I connect to a database and loop over a recordset in C#?',
  {'entities': [(60, 61, 'PROGLANG')]}),
 ('How do I delete a file which is locked by another process in C#?',
  {'entities': [(61, 62, 'PROGLANG')]}),
 ('Good STL-like library for C', {'entities': [(26, 27, 'PROGLANG')]}),
 ('MySQL/Apache Error in PHP MySQL query',
  {'entities': [(22, 25, 'PROGLANG')]})]

In [16]:
def create_blank_nlp(train_data):
    nlp = spacy.blank('en')
    ner = nlp.add_pipe('ner', last=True)
    ner = nlp.get_pipe("ner")
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    return nlp

In [17]:
import random 
import datetime as dt
from spacy.training import Example

nlp = create_blank_nlp(train_data)
optimizer = nlp.begin_training()  
for i in range(20):
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], losses=losses)
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)

Losses at iteration 0 - 2021-04-30 17:19:13.076828 {'ner': 100.05899049626645}
Losses at iteration 1 - 2021-04-30 17:19:16.389489 {'ner': 1.9396463561209527}
Losses at iteration 2 - 2021-04-30 17:19:19.733170 {'ner': 5.885151444565423e-06}
Losses at iteration 3 - 2021-04-30 17:19:23.079931 {'ner': 3.456799759239202e-08}
Losses at iteration 4 - 2021-04-30 17:19:26.677246 {'ner': 1.1620920524758504e-07}
Losses at iteration 5 - 2021-04-30 17:19:30.674881 {'ner': 7.429804436265265e-08}
Losses at iteration 6 - 2021-04-30 17:19:34.597493 {'ner': 5.2288994415198204e-08}
Losses at iteration 7 - 2021-04-30 17:19:38.609456 {'ner': 2.1583715285186297e-08}
Losses at iteration 8 - 2021-04-30 17:19:42.578979 {'ner': 2.8162778239716434e-07}
Losses at iteration 9 - 2021-04-30 17:19:46.513748 {'ner': 1.1424396285013037e-08}
Losses at iteration 10 - 2021-04-30 17:19:50.315312 {'ner': 3.8140310128773925e-09}
Losses at iteration 11 - 2021-04-30 17:19:54.113603 {'ner': 9.608126140298515e-09}
Losses at iter

In [18]:
nlp.pipeline

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x25308fb06a0>)]

In [19]:
doc = nlp("I should learn Ruby and JavaScript")
displacy.render(doc, style="ent")