# Preprocessing script to prepare the biomedical training corpus for RNN training in TensorFlow


Takes the original format text files (sentences file and annotations file) and converts them to a format that can be passed to the LSTM model.

Input (in raw directory):

    1. protein-test.txt - the text

Output (in preprocessed directory):

   1. protein-test-ground-truth-annotations.txt - an IOB format file showing all proteins, where the tokens are aligned with the tokens from protein-test.txt.

In [None]:
import os
# Root directory for the data, read from the RNN_HOME environment variable.
# Raises KeyError if the variable is not set.
DATA_DIR = os.environ['RNN_HOME']

In [None]:
# Input and output directories under DATA_DIR. os.path.join supplies the path
# separator when DATA_DIR does not end with one; the empty final component
# keeps a trailing separator so that plain string concatenation onto these
# values still produces a valid path.
raw_dir = os.path.join(DATA_DIR, "raw", "")
preprocessed_dir = os.path.join(DATA_DIR, "preprocessed", "")

In [None]:
# No annotations file is configured.
raw_annotations_file = None
# Input text file inside the raw directory.
raw_text_file = os.path.join(raw_dir, 'protein-test.txt')
# Output IOB annotations file. os.path.join avoids the doubled separator that
# came from prefixing the file name with '/' when the directory already ends
# with one.
output_annotations_file_test = os.path.join(preprocessed_dir, 'protein-test-ground-truth-annotations.txt')

In [None]:
import tw_utils

In [None]:
import re
from collections import namedtuple

# A protein mention: its text, its character span in the whole document, the
# number of the sentence holding it, and its character and token spans within
# that sentence.
Protein = namedtuple(
    'Protein',
    [
        'text',
        'within_doc_start_index',
        'within_doc_end_index',
        'sentence_no',
        'within_sentence_start_index',
        'within_sentence_end_index',
        'within_sentence_token_start_index',
        'within_sentence_token_end_index',
    ],
)

# A token: its text and its character span within the sentence.
Token = namedtuple(
    'Token',
    ['text', 'within_sentence_start_index', 'within_sentence_end_index'],
)

# A sentence: its character span in the document, its text and its tokens.
Sentence = namedtuple(
    'Sentence',
    ['start_index', 'end_index', 'text', 'tokens'],
)

# Define tokeniser

Define the tokeniser that we will use for protein data.

We can't use a standard NLP / NLTK tokeniser, as ours must be robust to the strange punctuation that appears in biomedical texts.

Splitting on Unicode general category.

In [None]:
# Tokenizer function from tw_utils; tokenise() below calls it through this name.
tokeniser = tw_utils.tw_protein_tokenizer

In [None]:
# Try the tokenizer on identifiers that contain '/' and '?' punctuation.
tokeniser("Smad1/5/8 Smad1?5?8")

# Load the test sentences without annotations, and tokenise them

This is quite ugly as the sentences will be tokenised a second time using the same method in the train procedure - in a production environment this would be done in a more unified way.

In [None]:
def tokenise(text):
    """Yield a Token, with character offsets, for each token found in *text*.

    The tokeniser only returns token strings, so each token's position is
    recovered by searching for it in *text*, starting where the previous
    token ended.

    Yields:
        Token(token_text, start, end) where text[start:end] == token_text.

    Raises:
        ValueError: if a token string cannot be found in the remaining text.
    """
    search_from = 0
    for token_text in tokeniser(text):
        start = text.index(token_text, search_from)
        end = start + len(token_text)
        # Resume after the whole token rather than one character into it;
        # otherwise the next token could be matched inside the previous one
        # (e.g. "aa a" tokenised as ["aa", "a"]).
        search_from = end
        yield Token(token_text, start, end)

In [None]:
def split_sentences(text):
    """Yield consecutive pieces of *text*, cutting just after each ". ".

    The period stays with the piece before the cut and the space that followed
    it starts the next piece. When a ". " is the final two characters of the
    text, the remainder is yielded whole instead of being cut. Any non-empty
    text left after the last cut is yielded as the final piece.
    """
    boundary = ". "
    total = len(text)
    start = 0
    while True:
        dot = text.find(boundary, start)
        if dot == -1:
            break
        if dot > total - 3:
            # The boundary sits at the very end: take everything that is left.
            dot = total - 1
        yield text[start:dot + 1]
        start = dot + 1
    if start < total:
        yield text[start:]

In [None]:
# Read the raw text file and split it into tokenised sentences.
#
# Produces:
#   sentences            - one Sentence per split sentence, carrying its
#                          character span within the concatenated text
#   sentence_to_proteins - one (initially empty) list per sentence
#   total_string         - all sentence texts concatenated in order
char_index = 0
sentences = []
sentence_to_proteins = []

# Sentence texts are collected and joined once at the end rather than grown
# with repeated "+=", which may copy the whole string on every append.
sentence_texts = []
with open(raw_text_file, 'r') as f:
    for line in f:
        for sentence_text in split_sentences(line):
            sentence_to_proteins.append([])
            # Kept as a module-level name: a later cell displays `tokens`.
            tokens = list(tokenise(sentence_text))

            sentence = Sentence(char_index, char_index + len(sentence_text), sentence_text, tokens)
            sentences.append(sentence)
            sentence_texts.append(sentence_text)
            char_index += len(sentence_text)

total_string = "".join(sentence_texts)

In [None]:
# Show the start offset of the second sentence (index 1); needs at least two sentences.
sentences[1].start_index

In [None]:
# Show the token list left over from the last sentence processed by the loading loop.
tokens

# Assign proteins to sentences

Preprocess the list of annotations and raw text to produce a map from sentence to proteins and within-sentence char indices

Preprocessing the input files is a little messy, as the character indices in the annotations file don't align exactly
with the indices in the text file.
Possibly this is caused by my system (line breaks, encoding, or using Python 2).

In [None]:
# Flat list of every Protein found. It stays empty while the alignment code
# below is disabled.
all_proteins = []

# NOTE: the annotation-alignment code below is wrapped in a string literal, so
# it never runs. If enabled it would read raw_annotations_file (currently None),
# re-align each annotation's character offsets against total_string, and append
# the resulting Protein records to all_proteins and sentence_to_proteins.
# It uses Python 2 print-statement syntax.
'''
correction_to_add = 0
sentence_id = 0
tmpcorr = 0
with open(raw_annotations_file, 'r') as f:
    for idx, l in enumerate(f):
            l = re.sub('\s+$', '', l)
            token_id, type_and_char_indices, correct_entity = l.split('\t')
            type_and_char_indices_split = type_and_char_indices.split(" ")
            start = int(type_and_char_indices_split[1])
            end = int(type_and_char_indices_split[2])
            extracted_entity_from_substring = total_string[(start + correction_to_add):(end + correction_to_add)]
            # Sometimes the protein indices get out of alignment. In this case we search the nearby points
            # in the document until we find where the protein has got to, normally it's only
            # a few letters to the left or right.
            if correct_entity != extracted_entity_from_substring:
                for tmpcorr in [-1,1,-2,2,-3,3,-4,4,-5,5,-6,6,-7,7]:
                    extracted_entity_from_substring = total_string[(start + correction_to_add + tmpcorr):(end + correction_to_add + tmpcorr)]
                    if extracted_entity_from_substring == correct_entity:
                        correction_to_add = correction_to_add + tmpcorr
                        break
            corrected_start = start + correction_to_add
            corrected_end = end + correction_to_add
            # Work out which sentence we're in
            for tmp_sentence_id in range(sentence_id, len(sentences)):
                if corrected_start >= sentences[tmp_sentence_id].start_index and corrected_start < sentences[tmp_sentence_id].end_index:
                    sentence_id = tmp_sentence_id
                    break
            # Where does this protein start & end in the sentence in character indices?
            protein_start_index_in_sentence = corrected_start - sentences[sentence_id].start_index
            protein_end_index_in_sentence = corrected_end - sentences[sentence_id].start_index
            
            # Where are its begin and end tokens?
            tokens = sentences[sentence_id].tokens
            token_start_index, token_end_index = None, None
            for token_idx, token in enumerate(tokens):
                if token.within_sentence_start_index >= protein_start_index_in_sentence and token_start_index == None:
                    token_start_index = token_idx
                if token.within_sentence_end_index >= protein_end_index_in_sentence and token_end_index == None:
                    token_end_index = token_idx
            
            assert token_start_index is not None and token_end_index is not None, "TOKENS NOT ALIGNED"
                
            if idx % 4000 == 0:
                print correct_entity,"tokens:", tokens[token_start_index].text, tokens[token_end_index].text
            
            protein = Protein(correct_entity, corrected_start, corrected_end, sentence_id, protein_start_index_in_sentence, protein_end_index_in_sentence, token_start_index, token_end_index)
            
            
            all_proteins.append(protein)
            sentence_to_proteins[sentence_id].append(protein)
            assert extracted_entity_from_substring == correct_entity, "WARNING: ALIGNMENT FAILED"
'''

# Convert the tokens to IOB notation

This is really rough and ready, as the default tokenise method I'm using doesn't return character indices so I have to infer them to produce the correct annotation. It should work but it is an ugly way of coding so in a production environment this would be done properly.

In [None]:
def get_bio_tokens(tokens, proteins):
    """Return one tag per token marking which tokens belong to a protein.

    Every token starts as "O". For each protein, the token at its
    within_sentence_token_start_index becomes "BPROTEIN" and every following
    token up to and including within_sentence_token_end_index becomes
    "IPROTEIN".
    """
    labels = ["O" for _ in range(len(tokens))]
    for protein in proteins:
        first = protein.within_sentence_token_start_index
        labels[first] = "BPROTEIN"
        position = first + 1
        last = protein.within_sentence_token_end_index
        while position <= last:
            labels[position] = "IPROTEIN"
            position += 1
    return labels

In [None]:
import random, csv

In [None]:
# Write each sentence as a group of tab-separated rows, one row per token,
# followed by an empty row.
with open(output_annotations_file_test, "w") as f_test:
    writer_test = csv.writer(f_test, delimiter='\t')

    for sentence_idx, sentence in enumerate(sentences):
        proteins = sentence_to_proteins[sentence_idx]
        bio_tokens = get_bio_tokens(sentence.tokens, proteins)

        for position, (token, tag) in enumerate(zip(sentence.tokens, bio_tokens), 1):
            # Newline tokens are left out, but they still count in the numbering.
            if token.text in ("\n", "\r\n"):
                continue
            # Columns: 1-based token number, token text, "_", the tag twice,
            # then five "_" placeholder columns.
            writer_test.writerow([str(position), token.text, "_", tag, tag] + ["_"] * 5)

        # An empty row closes the sentence.
        writer_test.writerow([])
            