In [None]:
import flair.datasets
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.embeddings import TokenEmbeddings, BertEmbeddings,  ELMoEmbeddings, FlairEmbeddings, WordEmbeddings, PooledFlairEmbeddings, StackedEmbeddings, CharacterEmbeddings, CharLMEmbeddings
from flair.models import LanguageModel
from gensim.models.keyedvectors import KeyedVectors
from typing import List
import os

In [None]:
# Folder holding the EBI standard corpus (the folder name indicates IOB format).
EBI_data_folder = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/EBI_standard-IOB'

# Column layout handed to the corpus reader: column 0 is the token text,
# column 1 is the NER tag.
columns = dict(enumerate(('text', 'ner')))

In [None]:
# Build the corpus from the train/test/dev files found in EBI_data_folder,
# using the column layout defined above.
EBI = ColumnCorpus(
    EBI_data_folder,
    columns,
    train_file='train.csv',
    test_file='test.csv',
    dev_file='dev.csv',
    in_memory=False,
)



In [None]:
# Name of the tag layer the tagger is trained to predict.
tag_type = 'ner'

# Build the tag dictionary from the corpus and show its index -> item table.
tag_dictionary = EBI.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary.idx2item)

In [None]:
# Only the two PubMed Flair embeddings are active. Alternatives that were
# commented out in this cell: WordEmbeddings('glove'),
# FlairEmbeddings('news-forward') / FlairEmbeddings('news-backward'),
# CharacterEmbeddings(), and
# PooledFlairEmbeddings('pubmed-backward' / 'pubmed-forward', pooling='min').
embedding_types: List[TokenEmbeddings] = [
    FlairEmbeddings(model_name)
    for model_name in ('pubmed-forward', 'pubmed-backward')
]

In [None]:
# Combine the selected embeddings into one stacked embedding for the tagger.
embeddings: StackedEmbeddings = StackedEmbeddings(
    embeddings=embedding_types,
)

In [None]:
# 5. initialize sequence tagger
from flair.models import SequenceTagger

# Sequence tagger over the stacked embeddings, with the CRF layer enabled.
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=True,
)

In [None]:
# 6. initialize trainer
from flair.trainers import ModelTrainer

# Trainer that fits `tagger` on the EBI corpus.
trainer: ModelTrainer = ModelTrainer(
    tagger,
    EBI,
)

In [None]:
# Run training. The first argument is the output directory; best-model.pt is
# loaded from this same directory further down the notebook.
trainer.train(
    '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/manual_annotated_dataset/only_flair_embeddings/',
    learning_rate=0.1,
    mini_batch_size=32,
    patience=3,
    max_epochs=100,
)

In [None]:
from flair.data import Sentence, Token
from flair.models import SequenceTagger

# Load the saved tagger from the training output directory; every prediction
# cell below uses this `flair_model` object.
flair_model = SequenceTagger.load(
    '/nfs/gns/literature/Santosh_Tirunagari/GitHub/flair_models/ner/manual_annotated_dataset/only_flair_embeddings/best-model.pt'
)


In [None]:
from flask import Flask, jsonify, request

def term_highlighter(text: str = None, terms: list = None) -> str:
    """Wrap entity mentions found in ``text`` in HTML ``<span>`` tags.

    Every entry of ``terms`` is read positionally: index 2 is the entity type
    and index 3 is the entity string (the caller in this notebook builds the
    entries as ``[start_pos, end_pos, type, text]``).

    For each distinct entity string, the current text is tokenised with
    ``wordpunct_tokenize`` and the first token (in reading order) that contains
    the entity string is selected; when no token contains it, the entity
    string itself is used. Every occurrence of the selected string is then
    replaced by ``<span class='TYPE'>...</span>`` when the type is ``GP``,
    ``DS`` or ``OG``. Entries with any other type leave the text unchanged.

    :param text: text to highlight.
    :param terms: list of entity records as described above.
    :return: the text with the highlight markup inserted.
    :raises ValueError: if ``text`` or ``terms`` is empty or ``None``.
    """
    if not text or not terms:
        raise ValueError('Either the supplied text or list of terms and scores is empty or of type None')

    # Entity types that have a highlight style; anything else is skipped.
    highlight_types = ('GP', 'DS', 'OG')
    used_term_strs = set()

    for term in terms:
        term_type, term_str = term[2], term[3]

        if not isinstance(term_str, str):
            continue

        # prevent double highlighting
        if term_str in used_term_strs:
            continue
        used_term_strs.add(term_str)

        # Choose the first matching token in reading order. The previous
        # implementation took an arbitrary element of a set, so the highlighted
        # token could differ from run to run when several tokens matched.
        new_term_str = next(
            (tok for tok in wordpunct_tokenize(text) if term_str in tok),
            term_str,
        )

        if term_type in highlight_types:
            # NOTE(review): str.replace also rewrites occurrences that sit
            # inside markup inserted by earlier iterations; text is not
            # HTML-escaped either. Confirm this is acceptable for the consumer.
            text = text.replace(
                new_term_str,
                "<span class='" + term_type + "'>" + new_term_str + "</span>",
            )

    return text


def pcse_ner_predictor(text_sentence):
    """Run the NER tagger on ``text_sentence`` and return the annotated result.

    The input is tokenised with ``wordpunct_tokenize``, re-joined with single
    spaces, wrapped in a ``Sentence`` and passed to ``flair_model.predict``.

    :param text_sentence: raw text to annotate.
    :return: for empty input, the ``jsonify`` response describing the error.
        Otherwise a dict with ``status`` 200, ``tagged`` (the sentence's
        ``to_dict(tag_type='ner')`` output) and ``highlighted_text``; if
        post-processing fails, ``status`` is 400 and ``error`` holds the
        exception message.
    """
    if not text_sentence:
        # NOTE(review): jsonify is a Flask helper; confirm it is usable where
        # this function is called (the other return path is a plain dict).
        return jsonify({
            'error': 'No parameters supplied',
            "status": 400,
            "service": 'pcse_ner_predictor'
        })

    data_dict = {}

    sentence = Sentence(' '.join(wordpunct_tokenize(text_sentence)))
    flair_model.predict(sentence)

    try:
        tagged = sentence.to_dict(tag_type='ner')
        data_dict['tagged'] = tagged

        text_input = tagged['text']
        terms_entities = [
            [entity['start_pos'], entity['end_pos'], entity['type'], entity['text']]
            for entity in tagged['entities']
        ]

        # term_highlighter raises ValueError on an empty term list, which used
        # to turn every entity-free text into a status-400 result. Text
        # without entities is a valid outcome: return it unhighlighted.
        if terms_entities:
            data_dict['highlighted_text'] = term_highlighter(text_input, terms_entities)
        else:
            data_dict['highlighted_text'] = text_input

        data_dict['status'] = 200
    except Exception as exc:
        # Keep the best-effort contract (status 400 instead of raising) but
        # record what went wrong instead of discarding it.
        data_dict['status'] = 400
        data_dict['error'] = str(exc)

    return data_dict

In [None]:
from nltk.tokenize import wordpunct_tokenize

# Sample abstract used to smoke-test the predictor end to end.
text_sentence = 'AS1411 Aptamer-functionalized liposome was successfully formulated and found to be nanosized. Flow cytometer and CLSM results demonstrated that Aptamer enhanced the targeting of carrier in the cancer cells via nucleolin-mediated transmembrane endocytosis pathway. The lipofectaminebased miR-29b showed a typical concentration-dependent cytotoxic effect in the cancer cells. LP-miR induced a significant reduction in the cell viability of A2780 cells compared to that of nontreated control, while LP-Mut (mutant loaded) did not have any effect on the cell viability indicating the importance of the specific gene sequencing. LP-miR induced a significant decrease in the green fluorescence which is indicative of the decrease in the cell viability. Simultaneously, higher PI positive cells were observed for LP-miR treated cancer cells in Live/Dead assay. Cells treated with LP-miR exhibited the brightest fluorescence indicating the presence of apoptotic cells. Significant increase in the Annexin-V+ cells and PI+ cells were observed for cell treated with LP-miR compared to that of non-treated control indicating the potential of miR-29b. This novel miR-29b-loaded Aptamer-directed liposome could potential serve as a new platform to improve the therapeutic outcome in ovarian cancers.'

# Prints whatever pcse_ner_predictor returns for the sample text.
print(pcse_ner_predictor(text_sentence))

    
    

In [None]:
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from typing import List
def custom_tokenizer(text: str) -> List[Token]:
    """
    Tokenize ``text`` with NLTK's ``WordPunctTokenizer`` and wrap every piece
    in a ``Token``.

    Each token is created with ``start_position`` set to the character offset
    at which it begins in ``text`` and ``whitespace_after`` set to whether the
    character right after it is whitespace. The earlier version passed the
    token *index* as ``start_position`` and always ``whitespace_after=False``,
    so any offsets derived from the tokens did not point into the input text.

    NOTE(review): assumes ``Token`` treats ``start_position`` as a character
    offset and ``whitespace_after`` as a bool -- confirm for the installed
    Flair version.

    :param text: raw text to tokenize.
    :return: list of tokens in reading order.
    """
    tokens: List[Token] = []

    # span_tokenize yields (start, end) character spans for the same matches
    # that tokenize() returns as strings.
    for start, end in WordPunctTokenizer().span_tokenize(text):
        tokens.append(
            Token(
                text=text[start:end],
                start_position=start,
                # Slicing avoids an IndexError on the last token of the text.
                whitespace_after=text[end:end + 1].isspace(),
            )
        )

    return tokens

# custom_tokenizer(text_temp_)


# Sample abstract used to try the custom tokenizer together with the tagger.
text_temp = 'The disparate diversity in immunoglobulin (Ig) repertoire has been a subject of fascination since the emergence of prototypic adaptive immune system in vertebrates. The carboxy terminus region of activation-induced cytidine deaminase (AID) has been well established in tetrapod lineage and is crucial for its function in class switch recombination (CSR) event of Ig diversification. The absence of CSR in the paraphyletic group of fish is probably due to changes in catalytic domain of AID and lack of cis-elements in IgH locus. Therefore, understanding the arrangement of Ig genes in IgH locus and functional facets of fish AID opens up new realms of unravelling the alternative mechanisms of isotype switching and antibody diversity. Further, the teleost AID has been recently reported to have potential of catalyzing CSR in mammalian B cells by complementing AID deficiency in them. In that context, the present review focuses on the recent advances regarding the generation of diversity in Ig repertoire in the absence of AID-regulated class switching in teleosts and the possible role of T cell-independent pathway involving B cell activating factor and a proliferation-inducing ligand in activation of CSR machinery.'

# Build the sentence with custom_tokenizer instead of the default tokenization.
sentence_1 = Sentence(text_temp, use_tokenizer=custom_tokenizer)

# Annotate the sentence with the loaded tagger.
flair_model.predict(sentence_1)

# Dictionary form of the tagged sentence; its 'entities' list is read in the next cell.
manual_json = sentence_1.to_dict(tag_type='ner')




In [None]:
# Flatten every predicted entity into [start, end, surface text, type].
all_entities = [
    [entity['start_pos'], entity['end_pos'], entity['text'], entity['type']]
    for entity in manual_json['entities']
]

# Cell output: the input text followed by its entity records.
[text_temp] + all_entities

In [None]:
from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from typing import List
# NOTE(review): custom_tokenizer is also defined in an earlier cell of this
# notebook; a single definition would be enough.
def custom_tokenizer(text: str) -> List[Token]:
    """
    Tokenize ``text`` with NLTK's ``WordPunctTokenizer`` and wrap every piece
    in a ``Token``.

    Each token is created with ``start_position`` set to the character offset
    at which it begins in ``text`` and ``whitespace_after`` set to whether the
    character right after it is whitespace. The earlier version passed the
    token *index* as ``start_position`` and always ``whitespace_after=False``,
    so any offsets derived from the tokens did not point into the input text.

    NOTE(review): assumes ``Token`` treats ``start_position`` as a character
    offset and ``whitespace_after`` as a bool -- confirm for the installed
    Flair version.

    :param text: raw text to tokenize.
    :return: list of tokens in reading order.
    """
    tokens: List[Token] = []

    # span_tokenize yields (start, end) character spans for the same matches
    # that tokenize() returns as strings.
    for start, end in WordPunctTokenizer().span_tokenize(text):
        tokens.append(
            Token(
                text=text[start:end],
                start_position=start,
                # Slicing avoids an IndexError on the last token of the text.
                whitespace_after=text[end:end + 1].isspace(),
            )
        )

    return tokens

# custom_tokenizer(text_temp_)

In [None]:
result_path = '/nfs/gns/literature/machine-learning/evaluation/2000articles/ML-NER/flair/'
result_file_name = 'flair_2000.csv'

from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tqdm import tqdm
import csv
import pandas as pd

# Tab-separated input without a header row; the column names are supplied here.
test_df = pd.read_csv('/nfs/gns/literature/machine-learning/evaluation/2000articles/europePMC-NER/annotations_API/full_sentences/tagged_sentences/Europe_PMC_annotation.csv', sep='\t', names = ['pmc_id', 'section','sentence', 'gt'])


# Write mode (not append): re-running the cell regenerates the file instead of
# stacking a second copy of every row under the previous run's output.
# newline='' is the setting the csv module documents for files it writes to.
with open(result_path + result_file_name, 'w', newline='') as f1:
    public_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

    for index_, each_annotation in tqdm(test_df.iterrows(), total=test_df.shape[0]):
        # The former .encode('utf-8').decode('utf-8') round-trip returned the
        # same string, so the value is used directly.
        text_temp = each_annotation['sentence']
        sentence = Sentence(text_temp, use_tokenizer=custom_tokenizer)
        flair_model.predict(sentence)
        PCSE_json = sentence.to_dict(tag_type='ner')

        # One [start, end, surface text, type] record per predicted entity.
        all_entities = [
            [ea_an['start_pos'], ea_an['end_pos'], ea_an['text'], ea_an['type']]
            for ea_an in PCSE_json['entities']
        ]

        # Two columns per row: the sentence and the list of entity records.
        public_writer.writerow([text_temp, all_entities])
#         all_entities = []
        
#         if (index_ == 10):
#             break

    

In [None]:
# One line per token of `sentence`: token text, a tab, then its predicted
# 'ner' tag value.
for token in sentence:
    print(token.text, token.get_tag('ner').value, sep='\t')

In [None]:
import csv
# Test split of the BC2GM dataset; passed to load_IOBdataset below.
data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/BC2GM-IOB/test.tsv'
# Entity types to keep when loading; tags of any other type are mapped to 'O'.
targets = ['GENE']
                
def load_IOBdataset(data_path,targets):
    """
    Load an IOB dataset stored as a tab-separated file with one token per line.

    The first column of a line is the token and the last column is its tag.
    An empty line ends the current sentence. The final sentence is returned
    even when the file does not end with an empty line.

    :param data_path: path to the tab-separated IOB file
    :type data_path: str
    :param targets: entity types of interest; a tag whose part after the last
        '-' is not listed is replaced by 'O'. If empty or None, all tags are
        kept unchanged.
    :type targets: List[str]
    :return: ``(X, y)`` where ``X`` holds one string per sentence (its tokens
        joined by single spaces) and ``y`` holds the list of tags of each
        sentence
    :rtype: Tuple[List[str], List[List[str]]]
    """
    # Built once instead of once per token.
    target_types = set(targets) if targets else None

    X = []
    y = []

    X_sent = []
    y_sent = []

    # QUOTE_NONE: tokens are raw text, so a '"' token must be read literally
    # and not as the start of a quoted field that swallows the following lines.
    # newline='' is the setting the csv module documents for files it reads.
    with open(data_path, 'r', newline='') as f:
        csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for line in csv_reader:
            if line:
                token, tag = line[0], line[-1]
                X_sent.append(token)
                if target_types is None or tag.split('-')[-1] in target_types:
                    y_sent.append(tag)
                else:
                    y_sent.append('O')
            else:
                # we reach the end of a sentence
                if X_sent:
                    X.append(' '.join(X_sent))
                    y.append(y_sent)
                X_sent = []
                y_sent = []

    # Flush the last sentence when the file has no trailing empty line.
    if X_sent:
        X.append(' '.join(X_sent))
        y.append(y_sent)

    return X, y

# Sentences (space-joined tokens) and gold tag sequences of the BC2GM test split.
BC2GM_test, y = load_IOBdataset(data_path=data_path, targets=targets)

In [None]:
result_path = '/nfs/gns/literature/machine-learning/Santosh/PCSE_on_public_test_data/'
result_file_name = 'PCSE_on_BC2GM_IOB.csv'

from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tqdm import tqdm
import csv
import pandas as pd

# Tagger label -> BC2GM label; every other predicted tag is written unchanged.
bc2gm_label_map = {'B-GP': 'B-GENE', 'I-GP': 'I-GENE'}

# Write mode (not append): re-running the cell regenerates the predictions
# instead of adding a second copy of every sentence to the file.
with open(result_path + result_file_name, 'w', newline='') as f1:
    public_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

    for each_sentence in tqdm(BC2GM_test):
        text_temp = each_sentence

        sentence = Sentence(text_temp)
        flair_model.predict(sentence)

        # One "token<TAB>tag" row per token.
        for token in sentence:
            ner_value = token.get_tag('ner').value
            ner_value = bc2gm_label_map.get(ner_value, ner_value)

            public_writer.writerow([token.text, ner_value])

        # Empty row = sentence boundary.
        public_writer.writerow([])

In [None]:
data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/NCBI-disease-IOB/test.tsv'
targets = ['Disease']
NCBI_test,y = load_IOBdataset(data_path,targets)

result_path = '/nfs/gns/literature/machine-learning/Santosh/PCSE_on_public_test_data/'
result_file_name = 'PCSE_on_NCBI_IOB.csv'

from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tqdm import tqdm
import csv
import pandas as pd

# Tagger label -> NCBI label. The previous mapping went the other way
# ('B-Disease' -> 'B-DS'); the sibling BC2GM and linnaeus cells translate the
# tagger's short labels (GP, OG) into the dataset's labels, and the gold data
# loaded above uses 'Disease', so the old mapping never matched a prediction.
# NOTE(review): assumes the tagger emits B-DS / I-DS for diseases -- confirm
# against tag_dictionary.
ncbi_label_map = {'B-DS': 'B-Disease', 'I-DS': 'I-Disease'}

# Write mode (not append): re-running the cell regenerates the predictions
# instead of adding a second copy of every sentence to the file.
with open(result_path + result_file_name, 'w', newline='') as f1:
    public_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

    for each_sentence in tqdm(NCBI_test):
        text_temp = each_sentence

        sentence = Sentence(text_temp)
        flair_model.predict(sentence)

        # One "token<TAB>tag" row per token.
        for token in sentence:
            ner_value = token.get_tag('ner').value
            ner_value = ncbi_label_map.get(ner_value, ner_value)

            public_writer.writerow([token.text, ner_value])

        # Empty row = sentence boundary.
        public_writer.writerow([])

In [None]:
data_path = '/nfs/gns/literature/machine-learning/Datasets/NER_Datasets/linnaeus-IOB/test.tsv'
targets = ['Species']
linnaeus_test, y = load_IOBdataset(data_path, targets)
# This cell used to store the linnaeus sentences under the name NCBI_test.
# The old name is kept as an alias so anything that still reads it sees the
# same data as before.
NCBI_test = linnaeus_test

result_path = '/nfs/gns/literature/machine-learning/Santosh/PCSE_on_public_test_data/'
result_file_name = 'PCSE_on_linnaeus_IOB.csv'

from nltk.tokenize import wordpunct_tokenize, WordPunctTokenizer
from tqdm import tqdm
import csv
import pandas as pd

# Tagger label -> linnaeus label; every other predicted tag is written unchanged.
linnaeus_label_map = {'B-OG': 'B-Species', 'I-OG': 'I-Species'}

# Write mode (not append): re-running the cell regenerates the predictions
# instead of adding a second copy of every sentence to the file.
with open(result_path + result_file_name, 'w', newline='') as f1:
    public_writer = csv.writer(f1, delimiter='\t', lineterminator='\n')

    for each_sentence in tqdm(linnaeus_test):
        text_temp = each_sentence

        sentence = Sentence(text_temp)
        flair_model.predict(sentence)

        # One "token<TAB>tag" row per token.
        for token in sentence:
            ner_value = token.get_tag('ner').value
            ner_value = linnaeus_label_map.get(ner_value, ner_value)

            public_writer.writerow([token.text, ner_value])

        # Empty row = sentence boundary.
        public_writer.writerow([])