# **THIS MUST BE RUN ON SNORKEL-EXTRACTION**
## Pre-reqs:

1. https://github.com/snorkel-team/snorkel-extraction should be installed and *working*
2. https://github.com/gpgs1978/AlexandreMartins should be git-cloned into your home directory (`~`), so that it is available as `~/AlexandreMartins`
3. https://github.com/zirondi/Snorkel_Farmacovigilancia should be git-cloned on '~'
4. [Preprocessing.py](https://github.com/zirondi/Snorkel_Farmacovigilancia/blob/master/Scripts/Text-Preprocessing/Preprocessing.py) (link to it in the future) should be in the same folder as this notebook

## Paths, magic functions and needed vars and dicts

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
separator = os.path.sep
inPath = os.path.expanduser('~') + separator + 'AlexandreMartins'
outPath = os.path.abspath('').split('Scripts' + separator + 'Text-Preprocessing')[0] + 'Datasets'

import Preprocessing as prep

util = prep.Util(inPath, outPath + separator + 'Preprocessed' + separator + 'Source')

drugs = util.drugs_to_list()
events = util.events_to_list()

#Calculating the number of lines in the corpus
corpusPath = os.path.expanduser('~') + f'{separator}Processed{separator}Tweets Anotados.tsv'

with open(corpusPath) as tsv:
    s = tsv.readlines()
    n_docs = len(s)
    s = None

## PSQL Connection

In [None]:

# Symbolic link for the Linux psql connection (unclear whether Windows needs an equivalent).
# Workaround for the old Snorkel release.
if separator == '/':
    socket_source = '/var/run/postgresql/.s.PGSQL.5432'
    socket_link = '/tmp/.s.PGSQL.5432'
    # lexists() is also true for a dangling link, so re-running this cell is a no-op
    # instead of failing with "File exists".
    if not os.path.lexists(socket_link):
        try:
            os.symlink(socket_source, socket_link)
        except OSError as err:
            # Best effort: report the failure and carry on so the session can still be attempted.
            print(f'Could not create {socket_link}: {err}')

# Point Snorkel at the database through the SNORKELDB environment variable
os.environ['SNORKELDB'] = 'postgres:///lzirondi'

# SnorkelSession MUST ALWAYS BE imported after setting os.environ; it will default to SQLite if the var is not set.
from snorkel import SnorkelSession
session = SnorkelSession()

## Sentence Parser

In [None]:
from snorkel.parser import TSVDocPreprocessor

doc_preprocessor = TSVDocPreprocessor(corpusPath, max_docs=n_docs)


from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser

corpus_parser = CorpusParser(parser=Spacy(lang='pt'))


%time corpus_parser.apply(doc_preprocessor, count=n_docs, parallelism=16)

from snorkel.models import Document, Sentence

print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())
print('Done')

## Relation Class

In [None]:
from snorkel.models import candidate_subclass

# Declare the 'Relation' candidate class with two arguments: 'Drug' and 'Event'.
RELATION_ARGS = ['Drug', 'Event']
relation = candidate_subclass('Relation', RELATION_ARGS)

## Candidate Extractor and sets

In [None]:
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import DictionaryMatch
from snorkel.models import Document

# One n-gram space (n_max=7) is shared by both arguments of the relation.
ngrams = Ngrams(n_max=7)
# Dictionary matchers built from the drug and event lists loaded earlier.
drug_matcher = DictionaryMatch(d=drugs, longest_match_only=True, ignore_case=True)
event_matcher = DictionaryMatch(d=events, longest_match_only=True, ignore_case=True)
# The second matcher must be `event_matcher` (the name defined just above);
# any other spelling raises NameError when this cell runs.
cand_extractor = CandidateExtractor(
    relation,
    [ngrams, ngrams],
    [drug_matcher, event_matcher],
    symmetric_relations=True,
)

# Ordering by name keeps the positional split below stable across runs.
docs = session.query(Document).order_by(Document.name).all()

train_sents = set()
dev_sents = set()
test_sents = set()

# Split by document position: index % 10 == 8 -> dev, == 9 -> test,
# everything else -> train. All sentences of a document share its split.
for i, doc in enumerate(docs):
    bucket = i % 10
    if bucket == 8:
        target = dev_sents
    elif bucket == 9:
        target = test_sents
    else:
        target = train_sents
    target.update(doc.sentences)

## Applying the candidate extractor

In [None]:
%time
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    print(i)
    cand_extractor.apply(sents, split=i, parallelism = 16)
    print("Number of candidates:", session.query(relation).filter(relation.split == i).count())

## Helper Functions

In [None]:
from snorkel.lf_helpers import is_inverted

def get_text(cand):
    """Return the 'text' entry of the candidate's parent context.

    The parent is obtained with ``cand.get_parent()`` and converted with
    ``_asdict()``; ``None`` is returned when that dict has no 'text' key.
    """
    parent_fields = cand.get_parent()._asdict()
    return parent_fields.get('text')

def _describe_span(span):
    """Return [text, char_start, char_end, word_end_index] for one span, all as str."""
    return [
        span.get_span(),
        str(span.char_start),
        str(span.char_end),
        # Read the word index through the span accessor rather than slicing the
        # span's string form, which picks the wrong field whenever the span
        # text itself contains a space.
        str(span.get_word_end()),
    ]


def get_spans_all(cand):
    """Describe the first two contexts (spans) of a candidate.

    Returns a list of two 4-item lists, one per span, in the order given by
    ``cand.get_contexts()``: span text, character start, character end and
    the index of the span's last word. Every item is a string so the rows can
    be written straight to a TSV file.

    Raises IndexError if the candidate has fewer than two contexts.
    """
    contexts = cand.get_contexts()  # fetched once instead of once per field
    return [_describe_span(contexts[0]), _describe_span(contexts[1])]

## TSVs for Snorkel 9

In [None]:
from tqdm import tqdm

# One output file per split; the position in this list is the split id
# (0 = train, 1 = dev, 2 = test).
names = [
    f'{outPath}{separator}Preprocessed{separator}train_sets.tsv',
    f'{outPath}{separator}Preprocessed{separator}dev_sets.tsv',
    f'{outPath}{separator}Preprocessed{separator}test_sets.tsv',
]

COLUMNS = "Candidate\tDrug\tDrug_Char_Start\tDrug_Char_end\tDrug_Word_Index\tEvent\tEvent_Char_Start\tEvent_Char_end\tEvent_Word_Index\tIs_inverted\tLABEL\n"
# Every row is written with this same label value.
LABEL = '-1'

for i, name in enumerate(names):
    # Explicit UTF-8 so the output does not depend on the platform's default encoding.
    with open(name, 'w', encoding='utf-8') as f:
        print('\n', name)
        f.write(COLUMNS)
        for c in tqdm(session.query(relation).filter(relation.split == i).all()):
            # Newlines inside the text would break the one-row-per-candidate layout.
            candidate = get_text(c).replace('\n', '')
            drug, event = get_spans_all(c)
            inverted = '1' if is_inverted(c) else '0'

            # Field order must match COLUMNS.
            f.write('\t'.join([candidate, *drug, *event, inverted, LABEL]) + '\n')
    print('Done ', name)
            