# NLP Tools

In [7]:
import spacy 
nlp = spacy.load('en_core_web_lg')

In [9]:
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy import displacy

In [29]:
def get_word_vectors(words):
    # converts a list of words into their word vectors
    return [nlp(word).vector for word in words]

def most_similar(word):
    queries = [w for w in word.vocab if w.is_lower == word.is_lower and w.prob >= -15 and w.vector_norm and word.vector_norm]
    by_similarity = sorted(queries, key=lambda w: word.similarity(w), reverse=True)
    return by_similarity[:20]

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": string_id,
    }]
    
    i = 0
    doublon = False 
    for element in matched_sents:
        i=i+1
        if element["text"] == sent.text:
            matched_sents[0]["ents"].append({match_ents})
            doublon = True 
    if doublon == False :    
        matched_sents.append({"text": sent.text, "ents": match_ents})

# Graph Tools

In [23]:
from py2neo import Node, Graph, Relationship

graph = Graph(uri="bolt://localhost:7687", auth=("julien", "julien"))
graph.delete_all()

# Find Entities

In [24]:
research = nlp("Militias supported by Iran, are next to American position in Syria.")
displacy.render(research, style='ent', jupyter=True)

## Entities & Nodes

In [25]:
nodes = []

for ent in research.ents:
    nodes.append(Node(ent.label_, name=ent.text))

# Relationship

In [26]:
print(research)
displacy.render(research, style='ent', jupyter=True)


print('\nEntities detected : ')
for i, ent in enumerate(research.ents):
    print("{:10}{:5}{:3}{:20}".format(ent.text, ent.label_, " : ", spacy.explain(ent.label_)))
    
    # not the last one 
    if(i < len(research.ents)-1): 
        start = research.ents[i].end
        end = research.ents[i+1].start
        span = research[start : end]
        print("Between : ", span)
        
        graph.create(Relationship(nodes[i],str(span),nodes[i+1]))

Militias supported by Iran, are next to American position in Syria.



Entities detected : 
Iran      GPE   : Countries, cities, states
Between :  , are next to
American  NORP  : Nationalities or religious or political groups
Between :  position in
Syria     GPE   : Countries, cities, states


In [None]:
# Time

# Location 

# Possession

## Create location relationship

In [30]:
target = 'location'

similars = [w.lower_ for w in most_similar(nlp.vocab[target])]
tabSim = []

print("Target : ", target, "with lg model\n")

nlpTarget = nlp(target)
for similar in tab:
    nlpSimilar = nlp(similar)
    tabSim.append(nlpSimilar.similarity(nlpTarget))
    print("{:15}{:20}".format(similar, nlpSimilar.similarity(nlpTarget)))



Target :  location with lg model

location                        1.0
locations        0.7504167636593956
located          0.6757641947798738
proximity        0.6507855012696153
area             0.6248048225936038
destination      0.5925449592467911
place            0.5917388686463082
places           0.5914134270441289
vicinity         0.5882238803105991
map              0.5645077859007557


In [43]:
phraseMatcher = PhraseMatcher(nlp.vocab, attr="LOWER")

terms = ["position", "located", "close", "around", "near", "location", "next"]
patterns = [nlp.make_doc(text) for text in terms]

phraseMatcher.add("Location", None, *patterns)

In [44]:
phraseMatches = phraseMatcher(research)


In [45]:
phraseMatches

[(6406047268281013305, 6, 7), (6406047268281013305, 9, 10)]

In [46]:
matched_sents = []

for match_id, start, end in phraseMatches:
    rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
    span = research[start : end]  # get the matched slice of the doc
    sent = span.sent
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": rule_id,
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})


In [47]:
matched_sents_simple = matched_sents

if matched_sents_simple != []:
    i = 0
    for element in matched_sents_simple :         
        if(i < len(matched_sents_simple) - 1):
            if(element["text"] == matched_sents_simple[i+1]["text"]):        
                print("double : ", i)
                matched_sents_simple[i]["ents"].append(matched_sents_simple[i+1]["ents"][0])
                del matched_sents_simple[i+1]
        i = i+1
    
displacy.render(matched_sents_simple, style="ent", manual=True)

double :  0


In [48]:
matched_sents_simple

[{'text': 'Militias supported by Iran, are next to American position in Syria.',
  'ents': [{'start': 32, 'end': 36, 'label': 'Location'},
   {'start': 49, 'end': 57, 'label': 'Location'}]}]

In [None]:
relationships = []

for 