In [2]:
import re
import requests
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy
import bs4
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
nlp = spacy.load('es_core_news_sm') # para procesamiento de textos en español

doc = nlp("El tipo lanzó el ladrillo")

for tok in doc:
  print(tok.text, "...", tok.dep_)

El ... det
tipo ... nsubj
lanzó ... ROOT
el ... det
ladrillo ... obj


In [4]:
displacy.render(doc, style="dep")

In [5]:
nlp = spacy.load('en_core_web_sm') # para procesamiento de textos en inglés

doc = nlp("If your hate could be turned into electricity, it would light up the whole world.")

for tok in doc:
  print(tok.text, "-->", tok.dep_)

If --> mark
your --> poss
hate --> nsubjpass
could --> aux
be --> auxpass
turned --> advcl
into --> prep
electricity --> pobj
, --> punct
it --> nsubj
would --> aux
light --> ROOT
up --> prt
the --> det
whole --> amod
world --> dobj
. --> punct


In [6]:
displacy.render(doc, style="dep")

In [7]:
nlp = spacy.load('es_core_news_sm')

doc = nlp("La imagen del agujero negro fue renderizada por la joven ingeniera.")

for tok in doc:
  print(tok.text, "-->", tok.dep_)

La --> det
imagen --> nsubj
del --> case
agujero --> nmod
negro --> amod
fue --> aux
renderizada --> ROOT
por --> case
la --> det
joven --> amod
ingeniera --> obj
. --> punct


In [8]:
displacy.render(doc, style="dep")

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
# importar archivo con oraciones de wikipedia utilizando pandas (pd)
sentencias_candidatas = pd.read_csv("wiki_sentences.csv")
sentencias_candidatas.shape

(4318, 1)

In [11]:
# Realizar una muestra de 5 oraciones
sentencias_candidatas['sentence'].sample(5)

683     however, none of these earliest pornographic f...
1982                    to live won the grand jury prize.
4170    several versions of blade runner have been shown.
218     the science-fiction/horror film was the first ...
586     james mangold was later hired to direct the film.
Name: sentence, dtype: object

In [12]:
doc = nlp("the new york film critics circle awarded carol")

for tok in doc:
  print(tok.text, "-->", tok.dep_)

the --> det
new --> compound
york --> compound
film --> compound
critics --> compound
circle --> nsubj
awarded --> ROOT
carol --> dobj


In [13]:
def get_entities(sent):
  ## Parte 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # etiqueta de dependencia del token anterior en la oración
  prv_tok_text = ""   # token anterior en la oración

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## Parte 2
    # Si el token es un signo de puntuación, pase al siguiente token
    if tok.dep_ != "punct":
      # comprobar: token es una palabra compuesta o no
      if tok.dep_ == "compound":
        prefix = tok.text
        # si la palabra anterior también era un 'compound', entonces agregar la palabra actual
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # comprobar: el token es un modificador o no
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # si la palabra anterior también era un 'compound', entonces agregar la palabra actual
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## Parte 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## Parte 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## Parte 5 
      # actualizar variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [14]:
get_entities("An electric motor uses electrical energy")

['electric  motor', 'electrical  energy']

In [15]:
pares_entidades = []

for i in tqdm(sentencias_candidatas["sentence"]): # con tqdm podemos mostrar una barra de progreso
  pares_entidades.append(get_entities(i))

100%|██████████| 4318/4318 [01:31<00:00, 47.10it/s]


In [16]:
pares_entidades[10:20]

[['we', 'tests'],
 ['m global', 'international sales rights'],
 ['robbie robertson', 'soundtrack'],
 ['it', 'original music tracks'],
 ['it', 'reviewed  franchise'],
 ['she', 'accidentally  mystique'],
 ['', 'military  arrest'],
 ['train', 'vuk'],
 ['telepath', 'gallio'],
 ['singer', 'men']]

In [17]:
def get_relation(sent):

  doc = nlp(sent)

  # objeto de la clase Matcher
  matcher = Matcher(nlp.vocab)

  # definir el patrón 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", None, pattern) 

  matches = matcher(doc)
  k = len(matches) - 1

  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [18]:
get_relation("An ECG detects the heartbeats.")

'detects'

In [19]:
relaciones = [get_relation(i) for i in tqdm(sentencias_candidatas['sentence'])]

100%|██████████| 4318/4318 [01:30<00:00, 47.53it/s]


In [20]:
pd.Series(relaciones).value_counts()[:20]

is             370
was            297
released on     87
include         73
were            71
are             71
released        40
's              38
composed by     35
became          31
have            31
has             31
become          29
released in     27
included        26
called          22
produced        22
made            20
had             20
considered      20
dtype: int64

In [None]:
# extraer sujeto
fuente = [i[0] for i in pares_entidades]

# extraer objeto
objetivo = [i[1] for i in pares_entidades]

gc_df = pd.DataFrame({'fuente':fuente, 'objetivo':objetivo, 'arista':relaciones})

In [None]:
gc_df

In [None]:
# crear un grafo dirigido desde un dataframe
G=nx.from_pandas_edgelist(gc_df, "fuente", "objetivo", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

In [None]:
plt.figure(figsize=(12,12))

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(gc_df[gc_df['arista']=="composed by"], "fuente", "objetivo", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5) # k regula la distancia entre nodos
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(gc_df[gc_df['arista']=="written by"], "fuente", "objetivo", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(gc_df[gc_df['arista']=="released in"], "fuente", "objetivo", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()