# **THIS SHOULD BE RUN ON SNORKEL-EXTRACTION**

# Pre-reqs:

1. https://github.com/snorkel-team/snorkel-extraction should be installed and *working*
2. https://github.com/gpgs1978/AlexandreMartins should be git-cloned into the home directory ('~')
3. Preprocessing.py (link to it in the future) should be in the same folder as this notebook

## Paths, magic functions and needed vars and dicts

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os

inPath = os.path.expanduser('~') + '/AlexandreMartins'
outPath = os.path.expanduser('~') 

import Preprocessing as prep

util = prep.Util(inPath, outPath, True)

substancias = util.subsToList()
eventos = util.eventosToList()

#Calculando o número de linhas no corpus
corpusPath = os.path.expanduser('~') + '/Processed/Tweets Anotados.tsv'

with open(corpusPath) as tsv:
    s = tsv.readlines()
    n_docs = len(s)
    s = None

Arquivo Tweets Anotados.txt copiado com sucesso.
Arquivo Eventos.txt copiado com sucesso.
Arquivo Substâncias2-br-gazette.txt copiado com sucesso.
Arquivo Remédios2-br-gazette.txt copiado com sucesso.
Arquivo Substâncias-br-gazette.txt copiado com sucesso.
Arquivo Remédios-br-gazette.txt copiado com sucesso.
Arquivo EventosAdversos-gazette.txt copiado com sucesso.
Tsv do Corpus criado com sucesso.
Dicionário de Substâncias gerado com sucesso.
Dicionário de Eventos gerado com sucesso.


## Conexão ao banco

In [2]:

# Symlink the PostgreSQL Unix socket into /tmp so the database can be reached
# there, and select PostgreSQL as the backing database.
pgSocket = '/var/run/postgresql/.s.PGSQL.5432'
pgSocketLink = '/tmp/.s.PGSQL.5432'

# Only create the link when nothing is at the destination yet, so re-running
# this cell is a no-op instead of failing with "File exists".
# lexists() is used (not exists()) so that a dangling link also counts as present.
if not os.path.lexists(pgSocketLink):
    os.symlink(pgSocket, pgSocketLink)

# THIS SHOULD CHANGE IN THE FUTURE
os.environ['SNORKELDB'] = 'postgres:///lzirondi'

# The SnorkelSession import must ALWAYS come after the database is set in the
# OS environment variables; otherwise it defaults to SQLite.
from snorkel import SnorkelSession
session = SnorkelSession()

## Gerando Sentenças

In [3]:
from snorkel.parser import TSVDocPreprocessor

doc_preprocessor = TSVDocPreprocessor(corpusPath, max_docs=n_docs)


from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser

corpus_parser = CorpusParser(parser=Spacy(lang='pt'))


%time corpus_parser.apply(doc_preprocessor, count=n_docs, parallelism=16)

from snorkel.models import Document, Sentence

print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

  0%|          | 0/5984 [00:00<?, ?it/s]Clearing existing...
Running UDF...
100%|██████████| 5984/5984 [06:19<00:00, 15.79it/s]CPU times: user 7.94 s, sys: 1.58 s, total: 9.51 s
Wall time: 6min 19s
Documents: 5984
Sentences: 8280



## Gerando a Classe de Relação

In [4]:
from snorkel.models import candidate_subclass

# Candidate class named 'Relation' with two arguments, declared in this order:
# 'Substância' (substance) first, 'Evento_Adverso' (adverse event) second.
# The candidate extractor below passes its matchers in the same order.
relation = candidate_subclass('Relation', ['Substância', 'Evento_Adverso'])

## Extraindo Candidatos do Corpus

In [5]:
from snorkel.candidates import CandidateExtractor, Ngrams
from snorkel.matchers import DictionaryMatch
from snorkel.models import Document

# One n-gram candidate space (n_max=7) shared by both arguments; each argument
# is then filtered by its own dictionary matcher (substances, adverse events).
ngrams = Ngrams(n_max=7)
substancia_matcher = DictionaryMatch(d=substancias, longest_match_only=True, ignore_case=True)
eventos_matcher = DictionaryMatch(d=eventos, longest_match_only=True, ignore_case=True)
cand_extractor = CandidateExtractor(
    relation,
    [ngrams, ngrams],
    [substancia_matcher, eventos_matcher],
    symmetric_relations=True,
)

# Order by name so the split below is assigned over a fixed document ordering.
docs = session.query(Document).order_by(Document.name).all()

train_sents = set()
dev_sents   = set()
test_sents  = set()

# Split at document level: out of every 10 documents, position 8 goes to dev,
# position 9 to test, and the remaining eight to train. All sentences of a
# document land in the same split.
for position, doc in enumerate(docs):
    remainder = position % 10
    if remainder == 8:
        bucket = dev_sents
    elif remainder == 9:
        bucket = test_sents
    else:
        bucket = train_sents
    bucket.update(doc.sentences)

## Aplicando o extrator de relação

In [6]:
%time
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    print(i)
    cand_extractor.apply(sents, split=i, parallelism = 16)
    print("Number of candidates:", session.query(relation).filter(relation.split == i).count())

  0%|          | 0/6659 [00:00<?, ?it/s]CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs
0
Clearing existing...
Running UDF...
100%|██████████| 6659/6659 [01:59<00:00, 55.72it/s]
  0%|          | 0/810 [00:00<?, ?it/s]Number of candidates: 5156
1
Clearing existing...
Running UDF...
100%|██████████| 810/810 [00:12<00:00, 66.36it/s]
  0%|          | 0/811 [00:00<?, ?it/s]Number of candidates: 652
2
Clearing existing...
Running UDF...
100%|██████████| 811/811 [00:12<00:00, 63.17it/s]Number of candidates: 626



## Definindo Helper Functions

In [7]:
from snorkel.lf_helpers import is_inverted

def get_text(cand):
    """Return the 'text' field of the candidate's parent context.

    The parent is obtained with ``cand.get_parent()`` and converted to a dict
    with ``_asdict()``; ``None`` is returned when that dict has no 'text' key.
    """
    parent_fields = cand.get_parent()._asdict()
    return parent_fields.get('text')

def get_spans_all(cand):
    """Describe the two argument spans of a candidate as lists of strings.

    Returns a two-element list, one entry per context in the order given by
    ``cand.get_contexts()``. Each entry is
    ``[span text, char_start, char_end, word index]``, with the last three
    converted to ``str``.

    The word index is the span's last word index (``get_word_end()``). It is
    read from the span accessor rather than scraped out of ``str(span)``:
    splitting the string form on spaces breaks as soon as the span text itself
    contains a space, which is the norm for multi-word n-grams. For spans
    without spaces the value is the same one the string scraping yielded.
    """
    def describe(span):
        # One row fragment for a single span, everything already stringified.
        return [
            span.get_span(),
            str(span.char_start),
            str(span.char_end),
            str(span.get_word_end()),
        ]

    # Fetch the contexts once instead of once per field.
    contexts = cand.get_contexts()
    return [describe(contexts[0]), describe(contexts[1])]

## Gerando os tsvs para a execução do snorkel9

In [8]:
# TODO: adjust output folder
from tqdm import tqdm
names = ['train_sets.tsv', 'dev_sets.tsv', 'test_sets.tsv']

# The header order must mirror the row written below: the first span is the
# remedy/substance (first argument of the relation), the second is the event.
# The header previously labelled these two groups the other way round.
columns = "Candidate\tRemedy\tR_Char_Start\tR_Char_end\tR_Word_Index\tEvent\tE_Char_Start\tE_Char_end\tE_Word_Index\tIs_inverted\tLabel\n"

# Every exported row carries this same label value.
label = '-1'

# One file per split; the position in `names` is the split id (0, 1, 2).
for i, name in enumerate(names):
    with open(name, 'w') as f:
        print('\n', name)
        f.write(columns)
        for c in tqdm(session.query(relation).filter(relation.split == i).all()):
            # Strip newlines so each candidate stays on a single TSV line.
            candidate = get_text(c).replace('\n', '')
            remedy, event = get_spans_all(c)
            inverted = '1' if is_inverted(c) else '0'

            # Fields in header order: text, 4 remedy fields, 4 event fields, flags.
            row = [candidate] + remedy + event + [inverted, label]
            f.write('\t'.join(row) + '\n')
        print('Done ', name)
            

  0%|          | 18/5156 [00:00<00:29, 174.34it/s]
 train_sets.tsv
100%|██████████| 5156/5156 [00:25<00:00, 200.90it/s]
  3%|▎         | 20/652 [00:00<00:03, 199.27it/s]Done  train_sets.tsv

 dev_sets.tsv
100%|██████████| 652/652 [00:03<00:00, 202.23it/s]
  4%|▎         | 22/626 [00:00<00:02, 211.71it/s]Done  dev_sets.tsv

 test_sets.tsv
100%|██████████| 626/626 [00:03<00:00, 203.74it/s]Done  test_sets.tsv

