# OLAF: creating a simple pipeline demo

In this demo, we create a simple pipeline using components from the OLAF library. The corpus is composed of basic sentences. We want to extract concepts and relations from it.

In [None]:
import spacy

In [None]:
# Import all necessary items from the olaf package
from olaf import Pipeline
from olaf.pipeline.pipeline_component.term_extraction import POSTermExtraction
from olaf.pipeline.pipeline_component.concept_relation_extraction import CTsToConceptExtraction, CTsToRelationExtraction
from olaf.repository.serialiser import BaseOWLSerialiser

In [None]:
# Load the spaCy language model matching the corpus language (English here).
# The same model object is later used to parse the corpus and is handed to the pipeline.
spacy_model = spacy.load("en_core_web_sm") 

In [None]:
# Initialise the corpus: for this example it is a plain in-memory list of raw
# text strings, one string per document.
corpus = [
    "Alice is 25 years old. Bob, her brother, is 30 years old.",
    "Alex has a dog called Ouper. Claire's dog is Ouper's best friend.",
    "Martine is 22 years old. Leo is 27. Leo has a cousin that is 22 years old. Martine has a cousin that is 27 years old.",
    "Nicolas and Sarah are first cousins. Their grandmother, Louise, is 80.",
    "Paul and Marie are married. Their son, Thomas, is 10."
]

Now that the corpus is initialised, we can set up the components needed for our pipeline. We choose to extract all the nouns in the corpus as concepts, and all the verbs as relations.

Term extraction based on NOUN POS tags:

In [None]:
# Candidate terms for concepts: select every noun in the corpus via
# part-of-speech tagging.
concept_pos_selection = ["NOUN"]  # POS tags used to pick concept candidates

# Term extraction component restricted to the POS tags chosen above
my_term_extract_concept = POSTermExtraction(
    pos_selection=concept_pos_selection
)

Concept extraction based on validation of candidate terms found.

In [None]:
# Component that turns the validated candidate terms into concepts.
# Created with no arguments, i.e. with the library's default settings.
my_concept_extraction = CTsToConceptExtraction()

Term extraction based on VERB POS tags:

In [None]:
# Candidate terms for relations: select every verb in the corpus via
# part-of-speech tagging.
relation_pos_selection = ["VERB"]  # POS tags used to pick relation candidates

# Term extraction component restricted to the POS tags chosen above
my_term_extract_relation = POSTermExtraction(
    pos_selection=relation_pos_selection
)

Relation extraction based on validation of candidate terms found

In [None]:
# Component that turns the validated candidate terms into relations.
# NOTE(review): concept_max_distance=2 presumably limits how far from the
# candidate term a concept may be to get attached to the relation — confirm
# the exact meaning and unit in the OLAF documentation.
my_relation_extraction = CTsToRelationExtraction(concept_max_distance=2)

Now that all the components needed are created, the pipeline can be created too.

In [None]:
# Setting up my pipeline
my_olaf_pipeline = Pipeline(
    spacy_model=spacy_model,
    pipeline_components=[my_term_extract_relation, my_relation_extraction],
    corpus=[doc for doc in spacy_model.pipe(corpus)]
)

In [None]:
my_olaf_pipeline.add_pipeline_component(my_term_extract_concept)
my_olaf_pipeline.add_pipeline_component(my_concept_extraction)

In [None]:
# Sanity check: print the list of tokens of each document in the corpus
for doc in my_olaf_pipeline.corpus:
    print(list(doc))

In [None]:
# The knowledge representation should be empty before running the pipeline.
# Left as a bare expression so the notebook displays it.
my_olaf_pipeline.kr

In [None]:
# Run the pipeline; the results end up in `my_olaf_pipeline.kr`.
my_olaf_pipeline.run()

In [None]:
# After the run, the knowledge representation should hold the extraction results.
# Left as a bare expression so the notebook displays it.
my_olaf_pipeline.kr

In [None]:
# Check the final state of the knowledge representation:
# a header line followed by one concept label per line.
print(
    "Concepts in KR:",
    *[concept.label for concept in my_olaf_pipeline.kr.concepts],
    sep="\n",
)

In [None]:
# Same check for the relations:
# a header line followed by one relation label per line.
print(
    "Relations in KR:",
    *[relation.label for relation in my_olaf_pipeline.kr.relations],
    sep="\n",
)

Now let's export the results using a serializer.

In [None]:
# Instantiate the OWL serialiser used to export the results.
# NOTE(review): the URL argument is presumably the base IRI/namespace for the
# generated resources — confirm against the BaseOWLSerialiser documentation.
my_olaf_demo_serialiser = BaseOWLSerialiser("http://olaf_demo_results.org/")

In [None]:
# Build the RDF graph from the pipeline's knowledge representation
# (must be done before exporting).
my_olaf_demo_serialiser.build_graph(my_olaf_pipeline.kr)

In [None]:
# Export the RDF graph to the given file path, using the default format (Turtle).
# The path is relative, so the file is written to the current working directory.
my_olaf_demo_serialiser.export_graph("demo_test_results.ttl")