### 1 - Conexão ao Banco 

In [1]:
# Notebook setup: enable module autoreload and inline plots, then open a Snorkel session.
%load_ext autoreload
%autoreload 2
%matplotlib inline
import numpy as np
import os

# Symlink the PostgreSQL socket from /var/run/postgresql into /tmp.
# NOTE(review): the exit status of the shell command is ignored, so a failure
# (for example the link already existing on a re-run) goes unnoticed here.
os.system("ln -s /var/run/postgresql/.s.PGSQL.5432 /tmp/.s.PGSQL.5432")
# Set before importing snorkel below; presumably Snorkel reads SNORKELDB when it
# is imported -- TODO confirm.
# NOTE(review): the database name is hard-coded to one user's database.
os.environ['SNORKELDB'] = 'postgres:///lzirondi'

from snorkel import SnorkelSession
session = SnorkelSession()

In [2]:
# Print the session object as a quick check that it was created.
print(session)

<sqlalchemy.orm.session.Session object at 0x7f9a737455f8>


### 2 - Dividindo o Corpus em Sentenças

In [None]:
# Total number of articles in articles.tsv
n_docs = 2591

from snorkel.parser import TSVDocPreprocessor

# Preprocessor over the TSV file, capped at n_docs documents.
doc_preprocessor = TSVDocPreprocessor('data/articles.tsv', max_docs=n_docs)

#

from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser

# Corpus parser backed by the spaCy parser wrapper.
corpus_parser = CorpusParser(parser=Spacy())

# NOTE(review): looks like leftover exploration -- the MRO tuple is computed but
# neither stored nor printed.
import inspect
inspect.getmro(type(corpus_parser))

# Parse the documents with 16 parallel workers and time the call.
%time corpus_parser.apply(doc_preprocessor, count=n_docs, parallelism=16)

from snorkel.models import Document, Sentence

# Report how many Document and Sentence rows the session can see.
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

In [8]:
# Show the first five Sentence rows, issuing one indexed query per row.
for position in range(5):
    print(session.query(Sentence)[position])

Sentence(Document 88eb2437-93ce-452d-ada0-905a90d0ccac,0,b'Chinese President Xi Jinping has arrived in Washington for talks with US President Barack Obama.')
Sentence(Document 88eb2437-93ce-452d-ada0-905a90d0ccac,1,b'Key themes are expected to be Chinese cyber spying, economic policies and the territorial disputes in the South China\xe2\x80\xa6   ')
Sentence(Document 88eb2437-93ce-452d-ada0-905a90d0ccac,2,b'Chinese President Xi Jinping has arrived in Washington for talks with US President Barack Obama.')
Sentence(Document 88eb2437-93ce-452d-ada0-905a90d0ccac,3,b'Key themes are expected to be Chinese cyber spying, economic policies and the territorial disputes in the South China Sea.   ')
Sentence(Document 88eb2437-93ce-452d-ada0-905a90d0ccac,4,b'Washington rolled out the red carpet for President Xi on Thursday evening, hastily replacing Vatican banners in front of the White House with Chinese flags.   ')


### 3 - Gerando os Candidatos

In [None]:
from snorkel.models import candidate_subclass

# Draft candidate type for (drug, adverse event) pairs.
# NOTE(review): the class name 'Remedio,' carries a trailing comma and the field
# name 'caso adverso' contains a space. Both look like typos, but they are left
# untouched here because a later cell registers 'Remedio' with a different
# field list -- decide on one definition before running this cell.
remedio = candidate_subclass('Remedio,', ['remedio', 'caso adverso'])

from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import DictionaryMatch

# Candidate spans are n-grams of up to 7 tokens. The original spelled both the
# class and the variable as "Ngrans"/"ngrans", which raised NameError because
# only `Ngrams` is imported and the extractor below refers to `ngrams`.
ngrams = Ngrams(n_max=7)
# NOTE(review): `d` is not defined anywhere in the visible notebook; a
# dictionary of terms must be supplied for each matcher before this cell can
# run. Snorkel's DictionaryMatch also appears to expect the dictionary as the
# keyword argument `d=` -- TODO confirm against the library.
remedio_matcher = DictionaryMatch(d)
caso_matcher = DictionaryMatch(d)
cand_extractor = CandidateExtractor(remedio, [ngrams, ngrams], [remedio_matcher, caso_matcher])

In [9]:
from snorkel.models import candidate_subclass

# Candidate types: Spouse pairs two person mentions; Remedio pairs a drug
# ('remedio') with a symptom ('sintoma').
# NOTE(review): Remedio is defined here but not used anywhere else in the
# visible notebook.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])
Remedio = candidate_subclass('Remedio', ['remedio', 'sintoma'])

#

from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import PersonMatcher

# Both arguments of a Spouse candidate are n-grams of up to 7 tokens that the
# person matcher accepts.
ngrams         = Ngrams(n_max=7)
person_matcher = PersonMatcher(longest_match_only=True)
# Rebinds `cand_extractor`, replacing any extractor defined by an earlier cell.
cand_extractor = CandidateExtractor(Spouse, [ngrams, ngrams], [person_matcher, person_matcher])

#

from snorkel.models import Document
from util import number_of_people

# Documents are ordered by name so the split below is deterministic.
docs = session.query(Document).order_by(Document.name).all()

train_sents, dev_sents, test_sents = set(), set(), set()

# Assign whole documents to a split by position: index 8 (mod 10) goes to dev,
# index 9 (mod 10) goes to test, everything else to train. Only sentences with
# at most five people (as counted by number_of_people) are kept.
for doc_index, doc in enumerate(docs):
    remainder = doc_index % 10
    if remainder == 8:
        bucket = dev_sents
    elif remainder == 9:
        bucket = test_sents
    else:
        bucket = train_sents
    for sentence in doc.sentences:
        if number_of_people(sentence) <= 5:
            bucket.add(sentence)
                
#

%time
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i, parallelism = 16)
    print("Number of candidates:", session.query(Spouse).filter(Spouse.split == i).count())
    
    

  0%|          | 0/54022 [00:00<?, ?it/s]

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.82 µs
Clearing existing...
Running UDF...


100%|██████████| 54022/54022 [14:00<00:00, 63.16it/s]
  0%|          | 0/6791 [00:00<?, ?it/s]

Number of candidates: 22254
Clearing existing...
Running UDF...


100%|██████████| 6791/6791 [01:53<00:00, 59.73it/s]
  0%|          | 0/6202 [00:00<?, ?it/s]

Number of candidates: 2811
Clearing existing...
Running UDF...


100%|██████████| 6202/6202 [01:32<00:00, 59.31it/s]

Number of candidates: 2701





In [24]:
# Fetch every Spouse candidate in split 0 and look at the first one.
cands = session.query(Spouse).filter(Spouse.split == 0).all()

# s1 is the candidate itself; s2 and s3 are the texts of its two spans.
s1 = cands[0]
s2, s3 = (s1[position].get_span() for position in (0, 1))

# One item per line, same as three consecutive print calls.
print(s1, s2, s3, sep="\n")

Spouse(Span("b'Pope'", sentence=251949, chars=[17,20], words=[4,4]), Span("b'Lady Queen'", sentence=251949, chars=[42,51], words=[9,10]))
Pope
Lady Queen


### 4 - Adicionando as Gold Labels

In [25]:
from util import load_external_labels

# Load the external labels for Spouse candidates under annotator name 'gold',
# timing the call. The return value is kept in `missed`.
%time missed = load_external_labels(session, Spouse, annotator_name='gold')

#

from snorkel.annotations import load_gold_labels

# Gold labels for split 1, which the extraction loop above uses for the dev
# sentences.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)

AnnotatorLabels created: 2695
AnnotatorLabels created: 2615
CPU times: user 3min 34s, sys: 5.85 s, total: 3min 40s
Wall time: 4min 51s


### 6 - Criando as Labeling Functions

In [27]:
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)

# LF por pattern

# Keyword vocabularies consulted by the pattern-based labeling functions.
# `spouses` signals a marriage; `family` and `other` signal a different kind of
# relationship between the two people.
spouses = {'spouse', 'wife', 'husband', 'ex-wife', 'ex-husband'}
family = {'father', 'mother', 'sister', 'brother', 'son', 'daughter',
              'grandfather', 'grandmother', 'uncle', 'aunt', 'cousin'}
# Extend the family terms with their "-in-law" variants.
family = family | {f + '-in-law' for f in family}
# A comma was missing between 'girlfriend' and 'boss'; Python concatenates
# adjacent string literals, so the set silently contained 'girlfriendboss' and
# neither of the two intended words.
other = {'boyfriend', 'girlfriend', 'boss', 'employee', 'secretary', 'co-worker'}

def last_name(s):
    """Return the final space-separated part of `s`, or None if `s` has only one part."""
    pieces = s.split(' ')
    if len(pieces) <= 1:
        return None
    return pieces[-1]

def LF_husband_wife(c):
    """Vote 1 when a spouse keyword appears among the tokens between the two spans, else 0."""
    if spouses.intersection(get_between_tokens(c)):
        return 1
    return 0

def LF_husband_wife_left_window(c):
    """Vote 1 when a spouse keyword is within two tokens to the left of either span, else 0."""
    # The first span is checked before the second; the second is only examined
    # when the first yields no match.
    for position in (0, 1):
        if spouses.intersection(get_left_tokens(c[position], window=2)):
            return 1
    return 0
    
def LF_same_last_name(c):
    """Vote 1 when both people share a last name but their full span texts differ, else 0."""
    full1 = c.person1.get_span()
    full2 = c.person2.get_span()
    surname1, surname2 = last_name(full1), last_name(full2)
    # last_name returns None for single-part names, so those never match.
    shared_surname = surname1 and surname2 and surname1 == surname2
    return 1 if shared_surname and full1 != full2 else 0

def LF_no_spouse_in_sentence(c):
    """Vote -1, with probability 0.75, when the parent sentence has no spouse keyword; else 0."""
    # The random draw comes first and therefore happens on every call, exactly
    # one draw per candidate.
    if np.random.rand() < 0.75 and not spouses.intersection(c.get_parent().words):
        return -1
    return 0

def LF_and_married(c):
    """Vote 1 when 'and' lies between the spans and 'married' is among the right tokens, else 0."""
    if 'and' not in get_between_tokens(c):
        return 0
    # Right-hand tokens are only fetched once 'and' has been found.
    if 'married' not in get_right_tokens(c):
        return 0
    return 1
    
def LF_familial_relationship(c):
    """Vote -1 when a family keyword appears among the tokens between the two spans, else 0."""
    if family.intersection(get_between_tokens(c)):
        return -1
    return 0

def LF_family_left_window(c):
    """Vote -1 when a family keyword is within two tokens to the left of either span, else 0."""
    # The first span is checked before the second; the second is only examined
    # when the first yields no match.
    for position in (0, 1):
        if family.intersection(get_left_tokens(c[position], window=2)):
            return -1
    return 0

def LF_other_relationship(c):
    """Vote -1 when a keyword from `other` appears among the tokens between the two spans, else 0."""
    if other.intersection(get_between_tokens(c)):
        return -1
    return 0

#LF para Distant Supervision

import bz2

def strip_special(s):
    """Return `s` with every character whose code point is 128 or above removed."""
    return ''.join(filter(lambda ch: ord(ch) < 128, s))

# Build the set of known spouse pairs: each line of the compressed CSV is
# decoded as UTF-8, reduced to ASCII, stripped, and split on commas into a tuple.
with bz2.BZ2File('data/spouses_dbpedia.csv.bz2', 'rb') as f:
    known_spouses = {
        tuple(strip_special(raw_line.decode('utf-8')).strip().split(','))
        for raw_line in f
    }
# Last-name pairs of the known spouses, keeping only pairs where both names
# have a last name (last_name returns None for single-part names).
last_names = {
    (last_name(x), last_name(y))
    for x, y in known_spouses
    if last_name(x) and last_name(y)
}
    
def LF_distant_supervision(c):
    """Vote 1 when the two span texts form a known spouse pair in either order, else 0."""
    pair = (c.person1.get_span(), c.person2.get_span())
    if pair in known_spouses or pair[::-1] in known_spouses:
        return 1
    return 0

def LF_distant_supervision_last_names(c):
    """Vote 1 when the span texts differ and their last names form a known pair in either order, else 0."""
    full1, full2 = c.person1.get_span(), c.person2.get_span()
    surnames = (last_name(full1), last_name(full2))
    if full1 == full2:
        return 0
    if surnames in last_names or surnames[::-1] in last_names:
        return 1
    return 0

#

# All labeling functions handed to the label annotator. The list has ten
# entries, one per labeling function defined above.
LFs = [
    LF_distant_supervision, LF_distant_supervision_last_names, 
    LF_husband_wife, LF_husband_wife_left_window, LF_same_last_name,
    LF_no_spouse_in_sentence, LF_and_married, LF_familial_relationship, 
    LF_family_left_window, LF_other_relationship
]

### 7 - Aplicando as LFs (paralelismo resolvido)

In [28]:
from snorkel.annotations import LabelAnnotator
# Annotator that applies every function in LFs.
labeler = LabelAnnotator(lfs=LFs)

#


# Seed NumPy's global RNG because LF_no_spouse_in_sentence draws from np.random.
# NOTE(review): with parallelism=16 it is not visible from here how the seeded
# state reaches the worker processes -- confirm the labels are reproducible.
np.random.seed(1701)
# Apply the labeling functions to split 0 and time the call.
%time L_train = labeler.apply(split=0, parallelism=16)
# Last expression of the cell: display the resulting label matrix.
L_train


  0%|          | 0/22254 [00:00<?, ?it/s]

Clearing existing...
Running UDF...


100%|██████████| 22254/22254 [05:22<00:00, 68.94it/s]


CPU times: user 56.9 s, sys: 5.83 s, total: 1min 2s
Wall time: 5min 23s


<22254x10 sparse matrix of type '<class 'numpy.int64'>'
	with 22342 stored elements in Compressed Sparse Row format>

### 8 - Fitting the Generative Model (?)

In [10]:
from snorkel.learning import GenerativeModel

# Train the generative model on the training label matrix. The step size is
# scaled by the number of rows in L_train.
gen_model = GenerativeModel()
gen_model.train(L_train, epochs=100, decay=0.95, step_size=0.1 / L_train.shape[0], reg_param=1e-6)

#

# NOTE(review): as exported, this expression is not the last one in the cell, so
# its value is computed but not displayed.
gen_model.weights.lf_accuracy

#

# Marginals produced by the trained model for the same label matrix.
train_marginals = gen_model.marginals(L_train)


Inferred cardinality: 2


### 9 - Using the Model to Iterate on Labeling Functions

In [None]:
# Apply the existing labeler to split 1 (dev).
L_dev = labeler.apply_existing(split=1)

#

# Compare the generative model on the dev label matrix against the dev gold
# labels; the call returns four values unpacked as tp, fp, tn, fn.
tp, fp, tn, fn = gen_model.error_analysis(session, L_dev, L_gold_dev)

### 10 - Salvando as Training Labels

In [None]:
from snorkel.annotations import save_marginals
# Save the training marginals for the rows of L_train through the session, timing the call.
%time save_marginals(session, L_train, train_marginals)

### 11 - Reloading de certas coisas (para estarem com o mesmo nome do tutorial 3)

In [None]:
from snorkel.annotations import load_marginals

# Training marginals saved earlier for split 0.
train_marginals = load_marginals(session, split=0)

# Candidates of each split (0 = train, 1 = dev, 2 = test), ordered by id. The
# three queries run in that order as the generator is unpacked.
train_cands, dev_cands, test_cands = (
    session.query(Spouse).filter(Spouse.split == split_id).order_by(Spouse.id).all()
    for split_id in range(3)
)

# Gold labels for the dev and test splits.
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

### 12 - Setup of the discriminative model (paralelismo)

In [None]:
from snorkel.learning.pytorch import LSTM

# Hyperparameters forwarded to lstm.train as keyword arguments.
train_kwargs = {
    'lr':            0.01,
    'embedding_dim': 50,
    'hidden_dim':    50,
    'n_epochs':      10,
    'dropout':       0.25,
    'seed':          1701
}

# Train on the training candidates and their marginals, passing the dev
# candidates and dev gold labels as X_dev / Y_dev.
# NOTE(review): n_threads=None leaves the thread count to the library's default
# behaviour -- confirm what that is.
lstm = LSTM(n_threads=None)
lstm.train(train_cands, train_marginals, X_dev=dev_cands, Y_dev=L_gold_dev, **train_kwargs)

#

# Score on the held-out test split; the three returned values are printed as
# precision, recall and F1.
p, r, f1 = lstm.score(test_cands, L_gold_test)
print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))

#

# Error analysis on the test split; rebinds tp, fp, tn, fn.
tp, fp, tn, fn = lstm.error_analysis(session, test_cands, L_gold_test)

### 13 - Salvando e finalizando o tutorial

In [None]:
# Save the LSTM's marginals for the test candidates through the session.
lstm.save_marginals(session, test_cands)

In [None]:
# Final sanity check: print a few stored rows of each type.
from snorkel.models import Document, Sentence

from snorkel.models import candidate_subclass

# Same name and field list as the Spouse definition used for extraction.
Spouse = candidate_subclass('Spouse', ['person1', 'person2'])


# First document.
print(session.query(Document)[0])
print()

# First two sentences.
print(session.query(Sentence)[0])
print(session.query(Sentence)[1])
print()


# Candidates at positions 0 and 1000 of the unfiltered Spouse query.
print(session.query(Spouse)[0])
print(session.query(Spouse)[1000])