In [20]:
import pandas as pd

In [22]:
df = pd.read_csv('manualy_annotated_dataset2.csv')
df

Unnamed: 0,req,pos,labels
0,Each college has a website\nEach college is st...,DET NOUN VERB DET NOUN SPACE DET NOUN AUX VERB...,O entity relationship O attr O O entity O rela...
1,"Among these teachers, one of them is responsib...",ADP DET NOUN PUNCT NUM ADP PRON AUX ADJ ADP DE...,O O entity O O O O O relationship O O entity O O
2,"A teacher is defined by his surname, first nam...",DET NOUN AUX VERB ADP PRON NOUN PUNCT ADJ NOUN...,O entity O relationship O O attr O O attr O at...
3,Each teacher teaches only one subject.\n,DET NOUN VERB ADV NUM NOUN PUNCT SPACE,O entity relationship O O entity O O
4,Students take several subjects and receive a m...,NOUN VERB ADJ NOUN CCONJ VERB DET NOUN ADP PRO...,entity relationship O entity O O O O O O O O O O
...,...,...,...
299,"Packets contain an originator, a destination a...",NOUN VERB DET NOUN PUNCT DET NOUN CCONJ NOUN P...,entity O O attr O O attr O attr O O O O O O O ...
300,A LAN is a circular configuration of nodes.,DET PROPN AUX DET ADJ NOUN ADP NOUN PUNCT,O entity O O O O O entity O
301,Each document is composed of one or more sheets.,DET NOUN AUX VERB ADP NUM CCONJ ADJ NOUN PUNCT,O entity O relationship O O O O entity O
302,A sheet contains text and geometrical objects ...,DET NOUN VERB NOUN CCONJ ADJ NOUN PRON VERB NU...,O entity relationship entity O O O O O O O O O...


In [46]:
import spacy

nlp = spacy.blank("en")

TRAIN_DATA = []
for sentence, labels in zip(df.req, df.labels):
    doc = nlp(sentence)
    entities = []
    for token, label in zip(doc, labels.split()):
        if label == "entity":
            entities.append([token.idx, token.idx + len(token.text), "CLASS"])
        elif label == "attr":
            entities.append([token.idx, token.idx + len(token.text), "ATTRIBUTE"])
        elif label == "relationship":
            entities.append([token.idx, token.idx + len(token.text), "RELATIONSHIP"])

    TRAIN_DATA.append([sentence, {"entities": entities}])

TRAIN_DATA

[['Each college has a website\nEach college is structured into departments, each of which brings together specific teachers.',
  {'entities': [[5, 12, 'CLASS'],
    [13, 16, 'RELATIONSHIP'],
    [19, 26, 'ATTRIBUTE'],
    [32, 39, 'CLASS'],
    [43, 53, 'RELATIONSHIP'],
    [59, 70, 'CLASS'],
    [86, 92, 'RELATIONSHIP'],
    [111, 119, 'CLASS']]}],
 ['Among these teachers, one of them is responsible for the department.\n',
  {'entities': [[12, 20, 'CLASS'],
    [37, 48, 'RELATIONSHIP'],
    [57, 67, 'CLASS']]}],
 ['A teacher is defined by his surname, first name, phone, email, date of taking office and his index.\n',
  {'entities': [[2, 9, 'CLASS'],
    [13, 20, 'RELATIONSHIP'],
    [28, 35, 'ATTRIBUTE'],
    [43, 47, 'ATTRIBUTE'],
    [49, 54, 'ATTRIBUTE'],
    [56, 61, 'ATTRIBUTE'],
    [63, 67, 'ATTRIBUTE'],
    [93, 98, 'ATTRIBUTE']]}],
 ['Each teacher teaches only one subject.\n',
  {'entities': [[5, 12, 'CLASS'],
    [13, 20, 'RELATIONSHIP'],
    [30, 37, 'CLASS']]}],
 ['Student

In [47]:
import srsly
import typer
import warnings
from pathlib import Path

import spacy
from spacy.tokens import DocBin

def convert(lang: str, TRAIN_DATA, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in TRAIN_DATA:
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)

In [48]:
convert("en", TRAIN_DATA, "data/train.spacy")
convert("en", TRAIN_DATA, "data/valid.spacy")

In [49]:
!python -m spacy init fill-config base_config.cfg config.cfg

✔ Auto-filled config with all values
✔ Saved config
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [50]:
!python -m spacy train config.cfg --output ./output --paths.train ./data/train.spacy --paths.dev ./data/valid.spacy

✔ Created output directory: output
ℹ Saving to output directory: output
ℹ Using CPU
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     43.14    0.00    0.00    0.00    0.00
  3     200        120.31   3293.32   94.61   92.69   96.61    0.95
  7     400        157.46    848.95   98.23   97.32   99.15    0.98
 13     600        218.28    563.45   98.79   98.14   99.44    0.99
 20     800        236.84    415.02   99.10   99.34   98.87    0.99
 28    1000        297.39    394.55   99.30   98.97   99.62    0.99
 38    1200        338.10    381.86   99.34   99.43   99.25    0.99
 51    1400        290.93    351.44   99.62   99.44   99.81    1.00
 67    1600        308.53    321.15   99.67   99.72   99.62    1.00
 85    1800        422.11    329.94   99.67   99.72   99.62    1.00
109    2

[2022-12-22 19:37:24,963] [INFO] Set up nlp object from config
[2022-12-22 19:37:24,980] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-12-22 19:37:24,984] [INFO] Created vocabulary
[2022-12-22 19:37:24,985] [INFO] Finished initializing nlp object
[2022-12-22 19:37:25,315] [INFO] Initialized pipeline components: ['tok2vec', 'ner']


In [77]:
import spacy

trained_nlp = spacy.load("./output/model-best")
text = "Each customer has a VAT number, a name, a phone number and an address. There are no two clients with the same VAT number. When a customer wants to send a package to another customer, he just has to login to the company website, select the customer he wants to send the package to, enter the package weight and if the delivery is normal or urgent. He then receives a unique identifier code that he writes on the package. The package is then delivered by the customer at the delivery center of his choosing. A delivery center has a unique name and an address. Each client has an associated delivery center. This delivery center is chosen by the company, and it is normally the one closest to the customer house. The package is them routed through an internal system until it reaches the delivery center of the recipient. The package is then delivered by hand from that delivery center to the recipient by a courier. Couriers have a single VAT number, a name and a phone number. Each courier works in a single delivery center. A courier is assigned to a packet as soon as the packet is introduced in the system."
doc = trained_nlp(text)

spacy.displacy.render(doc, style="ent")

In [87]:
nlp = spacy.load("en_core_web_sm")
doc2 = nlp(text)
classes = set()
attributes = set()
relationships = set()
for token, token2 in zip(doc, doc2):
    if token.ent_type_ == "CLASS":
        classes.add(token2.lemma_)
    elif token.ent_type_ == "ATTRIBUTE":
        attributes.add(token2.lemma_)
    elif token.ent_type_ == "RELATIONSHIP":
        relationships.add(token2.text)

In [88]:
classes

{'client', 'courier', 'customer', 'package', 'packet'}

In [89]:
attributes

{'address', 'identifi', 'name', 'urgent', 'weight'}

In [90]:
relationships

{'associated', 'send'}