# OLAF: creating a simple pipeline demo

In this demo, we create a simple pipeline using components from the OLAF library. The corpus is composed of basic sentences. We want to extract concepts and relations from it.

In [2]:
import spacy

In [3]:
# Import all necessary items from the olaf package
from olaf import Pipeline
from olaf.pipeline.pipeline_component.term_extraction import POSTermExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import CTsToConceptExtraction, CTsToRelationExtraction
from olaf.repository.serialiser import BaseOWLSerialiser

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Load the spaCy language model matching the language of the corpus (English here)
spacy_model = spacy.load("en_core_web_sm")

In [5]:
# Demo corpus, given directly as raw text: one string per document.
corpus = [
    "Alice is 25 years old. Bob, her brother, is 30 years old.",
    "Alex has a dog called Ouper. Claire's dog is Ouper's best friend.",
    (
        "Martine is 22 years old. Leo is 27. "
        "Leo has a cousin that is 22 years old. "
        "Martine has a cousin that is 27 years old."
    ),
    "Nicolas and Sarah are first cousins. Their grandmother, Louise, is 80.",
    "Paul and Marie are married. Their son, Thomas, is 10.",
]

Now that the corpus is initialised, we can set up the components needed for our pipeline. We choose to extract all the nouns (`NOUN` POS tag) in the corpus as concepts, and all the verbs (`VERB` POS tag) as relations.

Term extraction based on NOUN POS tags:

In [6]:
# Candidate terms for concepts: select the tokens tagged as nouns by the POS tagger
concept_pos_selection = ["NOUN"]  # POS tags used to select concept terms in the corpus

# Term extraction component configured with the POS selection above
my_term_extract_concept = POSTermExtraction(pos_selection=concept_pos_selection)

               By default the system will use the entire content of the document.]


Concept extraction, based on validating the candidate terms found:

In [7]:
# Component that turns the extracted candidate terms into concepts
my_concept_extraction = CTsToConceptExtraction()

Term extraction based on VERB POS tags:

In [8]:
# Candidate terms for relations: select the tokens tagged as verbs by the POS tagger

relation_pos_selection = ["VERB"]  # POS tags used to select relation terms in the corpus

# Term extraction component configured with the POS selection above
my_term_extract_relation = POSTermExtraction(pos_selection=relation_pos_selection)

               By default the system will use the entire content of the document.]


Relation extraction, based on validating the candidate terms found:

In [9]:
# Component that turns the extracted candidate terms into relations.
# NOTE(review): `concept_max_distance=2` presumably bounds how far from the relation
# term a concept may be in order to be linked to it — confirm the exact meaning and
# unit in the OLAF documentation.
my_relation_extraction = CTsToRelationExtraction(concept_max_distance=2)

Now that all the required components have been created, we can build the pipeline itself.

In [10]:
# Build the pipeline from the spaCy model, a first set of components (verb term
# extraction and relation extraction) and the corpus processed by the spaCy model.
my_olaf_pipeline = Pipeline(
    spacy_model=spacy_model,
    pipeline_components=[my_term_extract_relation, my_relation_extraction],
    corpus=list(spacy_model.pipe(corpus)),
)

In [11]:
# Components can also be registered after the pipeline has been created:
# here the noun term extraction and the concept extraction.
# NOTE(review): the relation components were registered first, in the constructor.
# If relation extraction relies on concepts already being present in the knowledge
# representation, the concept components may need to come first — confirm.
my_olaf_pipeline.add_pipeline_component(my_term_extract_concept)
my_olaf_pipeline.add_pipeline_component(my_concept_extraction)

In [12]:
# Sanity check: display the tokens of each document held by the pipeline
for doc in my_olaf_pipeline.corpus:
    print(list(doc))

[Alice, is, 25, years, old, ., Bob, ,, her, brother, ,, is, 30, years, old, .]
[Alex, has, a, dog, called, Ouper, ., Claire, 's, dog, is, Ouper, 's, best, friend, .]
[Martine, is, 22, years, old, ., Leo, is, 27, ., Leo, has, a, cousin, that, is, 22, years, old, ., Martine, has, a, cousin, that, is, 27, years, old, .]
[Nicolas, and, Sarah, are, first, cousins, ., Their, grandmother, ,, Louise, ,, is, 80, .]
[Paul, and, Marie, are, married, ., Their, son, ,, Thomas, ,, is, 10, .]


In [13]:
# The knowledge representation should be empty before running the pipeline
my_olaf_pipeline.kr

KnowledgeRepresentation(concepts=set(), relations=set(), metarelations=set(), rdf_graph=<Graph identifier=Nfc1f7e25c0bf469ea9dc687c1750f9e6 (<class 'rdflib.graph.Graph'>)>)

In [14]:
# Run the pipeline: this is the step that fills the knowledge representation
my_olaf_pipeline.run()

In [15]:
# The knowledge representation should now contain the extracted concepts and relations
my_olaf_pipeline.kr

KnowledgeRepresentation(concepts={<olaf.data_container.concept_schema.Concept object at 0x7c83618b6e30>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b7c70>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b6e90>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b7f10>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b6920>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b6d10>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b7970>, <olaf.data_container.concept_schema.Concept object at 0x7c83618b6bc0>}, relations={<olaf.data_container.relation_schema.Relation object at 0x7c83618b6da0>, <olaf.data_container.relation_schema.Relation object at 0x7c83618b7130>, <olaf.data_container.relation_schema.Relation object at 0x7c83618b76d0>}, metarelations=set(), rdf_graph=<Graph identifier=Nfc1f7e25c0bf469ea9dc687c1750f9e6 (<class 'rdflib.graph.Graph'>)>)

In [16]:
# Inspect the final knowledge representation: print the label of each concept.
# The concepts are stored in a set, so the order of the listing is not guaranteed.
print("Concepts in KR:")
for kr_concept in my_olaf_pipeline.kr.concepts:
    print(kr_concept.label)

Concepts in KR:
brother
friend
cousin
cousins
dog
years
son
grandmother


In [17]:
# Same inspection for the relations: print the label of each one.
# The relations are stored in a set, so the order of the listing is not guaranteed.
print("Relations in KR:")
for kr_relation in my_olaf_pipeline.kr.relations:
    print(kr_relation.label)

Relations in KR:
has
married
called


Now let's export the results using a serialiser.

In [18]:
# Instantiate the OWL serialiser.
# NOTE(review): the URL is presumably the base URI under which the exported
# resources are named — confirm against the BaseOWLSerialiser documentation.
my_olaf_demo_serialiser = BaseOWLSerialiser("http://olaf_demo_results.org/")

In [21]:
# Build the RDF graph from the knowledge representation produced by the pipeline
my_olaf_demo_serialiser.build_graph(my_olaf_pipeline.kr)

In [20]:
# Export the RDF graph to the given file path, in the default format (Turtle)
my_olaf_demo_serialiser.export_graph("demo_test_results.ttl")