In [2]:
import spacy
import textacy
import pandas as pd

In [39]:
# Load the English NLP model
nlp = spacy.load("en_core_web_lg") #("en_core_web_sm")

In [24]:
# Input sentence
sentence = "Bill Gates founded Microsoft in the same year as Steve Jobs founded Apple."

In [25]:
# Process the sentence with spaCy
doc = nlp(sentence)

In [26]:
# Extract entities and relationships
entities = [ent.text for ent in doc.ents]
relationships = []

In [27]:
entities

['Bill Gates', 'Microsoft', 'the same year', 'Steve Jobs', 'Apple']

In [28]:
for token in doc:
    print("-"*100)
    print(token)
    if token.pos_ == "VERB":
        print("ROOT VERB")
        # Found the root verb (e.g., "founded")
        subject = [child.text for child in token.children if child.dep_ == "nsubj"]
        print(f"Sub: {subject}")
        object_ = [child.text for child in token.children if child.dep_ == "dobj"]
        print(f"Obj: {object_}")
        if subject and object_:
            relationships.append((subject[0], token.lemma_, object_[0]))
            print(f"Rel: {relationships}")

----------------------------------------------------------------------------------------------------
Bill
----------------------------------------------------------------------------------------------------
Gates
----------------------------------------------------------------------------------------------------
founded
ROOT VERB
Sub: ['Gates']
Obj: ['Microsoft']
Rel: [('Gates', 'found', 'Microsoft')]
----------------------------------------------------------------------------------------------------
Microsoft
----------------------------------------------------------------------------------------------------
in
----------------------------------------------------------------------------------------------------
the
----------------------------------------------------------------------------------------------------
same
----------------------------------------------------------------------------------------------------
year
---------------------------------------------------------------

In [33]:
token.lemma_

'.'

In [45]:
relationships

[('Gates', 'found', 'Microsoft'), ('Jobs', 'found', 'Apple')]

In [46]:
er = []
# Print the entities and relationships
for i, rel in enumerate(relationships):
    en1 = ""
    en2 = ""
    relp = ""
    
    # print(f"ER{i + 1}: {entity} --> {relationships[i][1]} --> {relationships[i][2]}")
    print(rel)
    for r in rel:
        entity = [e for e in entities if r in e]
        if len(entity) >= 1:
            entity = entity[0]
            if en1 == "":
                en1 = entity
            else:
                en2 = entity
        else:
            relp = r
    er.append({
        "Entity1": en1,
        "Relationship": relp,
        "Entity2": en2
    })
            
    

('Gates', 'found', 'Microsoft')
('Jobs', 'found', 'Apple')


In [47]:
pd.DataFrame(er)

Unnamed: 0,Entity1,Relationship,Entity2
0,Bill Gates,found,Microsoft
1,Steve Jobs,found,Apple


In [41]:
def get_all_er(sentence):
    # Process the sentence with spaCy
    doc = nlp(sentence)

    # Extract entities and relationships
    entities = [ent.text for ent in doc.ents]
    print(entities)
    relationships = []

    for token in doc:
        # print("-"*100)
        print(token, ": ", token.pos_, ": ", token.dep_, ": ")
        if token.dep_ == "ROOT" or token.pos_ == "VERB" or token.pos_ == "AUX":
            # print("ROOT VERB")
            # Found the root verb (e.g., "founded")
            subject = [child.text for child in token.children if child.dep_ == "nsubj"]
            # print(f"Sub: {subject}")
            object_ = [child.text for child in token.children if child.dep_ == "dobj"]
            # print(f"Obj: {object_}")
            if subject and object_:
                relationships.append((subject[0], token.lemma_, object_[0]))
                # print(f"Rel: {relationships}")
    
        # Look for relationships involving prepositional phrases
        elif "prep" in token.dep_:
            preposition = token.text
            subject = token.head.text
            object_ = [child.text for child in token.children if child.dep_ == "pobj"]
            if object_:
                relationships.append((subject, preposition, object_[0]))

    print(relationships)

    # Finalizing ER
    er = []
    # Print the entities and relationships
    for i, rel in enumerate(relationships):
        en1 = ""
        en2 = ""
        relp = ""
        
        # print(f"ER{i + 1}: {entity} --> {relationships[i][1]} --> {relationships[i][2]}")
        # print(rel)
        for r in rel:
            entity = [e for e in entities if r in e]
            if len(entity) >= 1:
                entity = entity[0]
                if en1 == "":
                    en1 = entity
                else:
                    en2 = entity
            else:
                relp = r
        er.append({
            "Entity1": en1,
            "Relationship": relp,
            "Entity2": en2
        })

    return er

In [45]:
sentences = [
    "Bill Gates founded Microsoft in same year as Steve Jobs founded Apple.",
    "Sachin Tendulkar was the greatest batsman of the World.",
    "It's the job of CRMS to validate policy documents and process loan agreements",
    'Khuldabad: This ancient necropolis, also known as "City of the Dead," boasts mystical tombs, mosques, and dargahs, offering a unique glimpse into Mughal and Deccan architecture.'
]

In [46]:
entity_rels = []

for sentence in sentences:
    er = get_all_er(sentence)
    print(er)
    print("-"*100)
    entity_rels.extend(er)

['Bill Gates', 'Microsoft', 'same year', 'Steve Jobs', 'Apple']
Bill :  PROPN :  compound : 
Gates :  PROPN :  nsubj : 
founded :  VERB :  ROOT : 
Microsoft :  PROPN :  dobj : 
in :  ADP :  prep : 
same :  ADJ :  amod : 
year :  NOUN :  pobj : 
as :  SCONJ :  mark : 
Steve :  PROPN :  compound : 
Jobs :  PROPN :  nsubj : 
founded :  VERB :  advcl : 
Apple :  PROPN :  dobj : 
. :  PUNCT :  punct : 
[('Gates', 'found', 'Microsoft'), ('founded', 'in', 'year'), ('Jobs', 'found', 'Apple')]
[{'Entity1': 'Bill Gates', 'Relationship': 'found', 'Entity2': 'Microsoft'}, {'Entity1': 'same year', 'Relationship': 'in', 'Entity2': ''}, {'Entity1': 'Steve Jobs', 'Relationship': 'found', 'Entity2': 'Apple'}]
----------------------------------------------------------------------------------------------------
['Sachin Tendulkar']
Sachin :  PROPN :  compound : 
Tendulkar :  PROPN :  nsubj : 
was :  AUX :  ROOT : 
the :  DET :  det : 
greatest :  ADJ :  amod : 
batsman :  NOUN :  attr : 
of :  ADP :  prep

In [44]:
pd.DataFrame(entity_rels)

Unnamed: 0,Entity1,Relationship,Entity2
0,Bill Gates,found,Microsoft
1,same year,in,
2,Steve Jobs,found,Apple
3,,world,
4,CRMS,of,
5,,City,
6,,Dead,
7,,tombs,
8,Mughal,into,


### Version 2

In [62]:
nlp = spacy.load('en_core_web_sm')

In [63]:
sentences = ["All living things are made of cells.", 
             "Cells have organelles."]

In [64]:
verb_patterns = [[{"POS":"AUX"}, {"POS":"VERB"}, 
                  {"POS":"ADP"}], 
                 [{"POS":"AUX"}]]

In [71]:
def find_root_of_sentence(doc):
    root_token = None
    for token in doc:
        if (token.dep_ == "ROOT"):
            root_token = token
    return root_token

In [65]:
def contains_root(verb_phrase, root):
    vp_start = verb_phrase.start
    vp_end = verb_phrase.end
    if (root.i >= vp_start and root.i <= vp_end):
        return True
    else:
        return False

In [81]:
def get_verb_phrases(doc):
    root = find_root_of_sentence(doc)
    verb_phrases = textacy.extract.matches.token_matches(doc, 
                                                            verb_patterns)
    new_vps = []
    for verb_phrase in verb_phrases:
        if (contains_root(verb_phrase, root)):
            new_vps.append(verb_phrase)
    return new_vps

In [82]:
def longer_verb_phrase(verb_phrases):
    longest_length = 0
    longest_verb_phrase = None
    for verb_phrase in verb_phrases:
        if len(verb_phrase) > longest_length:
            longest_verb_phrase = verb_phrase
    return longest_verb_phrase

In [83]:
def find_noun_phrase(verb_phrase, noun_phrases, side):
    for noun_phrase in noun_phrases:
        if (side == "left" and \
            noun_phrase.start < verb_phrase.start):
            return noun_phrase
        elif (side == "right" and \
              noun_phrase.start > verb_phrase.start):
            return noun_phrase

In [84]:
def find_triplet(sentence):
    doc = nlp(sentence)
    verb_phrases = get_verb_phrases(doc)
    noun_phrases = doc.noun_chunks
    verb_phrase = None
    if (len(verb_phrases) > 1):
        verb_phrase = \
        longer_verb_phrase(list(verb_phrases))
    else:
        verb_phrase = verb_phrases[0]
    left_noun_phrase = find_noun_phrase(verb_phrase, 
                                        noun_phrases, 
                                        "left")
    right_noun_phrase = find_noun_phrase(verb_phrase, 
                                         noun_phrases, 
                                         "right")
    return (left_noun_phrase, verb_phrase, 
            right_noun_phrase)

In [88]:
for sentence in sentences:
    (left_np, vp, right_np) = find_triplet(sentence)
    print(left_np, "\t", vp, "\t", right_np)

IndexError: list index out of range

In [89]:
sentence

'Bill Gates founded Microsoft in same year as Steve Jobs founded Apple.'

### Version 3

In [10]:
# Load spaCy English NLP model
nlp = spacy.load("en_core_web_sm")


def extract_entities_and_relationships(sentence):

    # Process the input sentence
    doc = nlp(sentence)

    # Extract named entities
    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Extract relationships between entities
    relationships = []
    for token in doc:
        if token.dep_ == "ROOT":
            # Extract the root of the sentence
            root = token.text
            for child in token.children:
                # Extract relationships between the root and its children
                relationships.append((root, child.text, child.dep_))

    return entities, relationships



In [11]:
# Example sentence
input_sentence = "Apple Inc. was founded by Steve Jobs in Cupertino."

# Extract entities and relationships
entities, relationships = extract_entities_and_relationships(input_sentence)

# Display the results
print("Entities:", entities)
print("Relationships:", relationships)

Entities: [('Apple Inc.', 'ORG'), ('Steve Jobs', 'PERSON'), ('Cupertino', 'GPE')]
Relationships: [('founded', 'Inc.', 'nsubjpass'), ('founded', 'was', 'auxpass'), ('founded', 'by', 'agent'), ('founded', '.', 'punct')]
