# Evaluate the trained model against the validation subset of the training data

Calculates:

* precision
* recall
* F1


Converts the validation ground-truth CoNLL file and the validation predictions CoNLL file to a common entity format (similar to the task output format, but using token indices instead of character indices).

This allows you to diff the files to work out a score.

In [1]:
import os
# Root directory for all data files, read from the RNN_HOME environment
# variable (a KeyError is raised here if the variable is not set).
DATA_DIR = os.environ['RNN_HOME']

In [2]:
# Working sub-directories of the data root.
# os.path.join inserts the path separator when RNN_HOME was given without a
# trailing slash; the empty final component keeps a trailing separator on each
# directory so that file names can still be appended to it directly.
raw_dir = os.path.join(DATA_DIR, "raw", "")
preprocessed_dir = os.path.join(DATA_DIR, "preprocessed", "")
train_dir = os.path.join(DATA_DIR, "train", "")
validation_dir = os.path.join(DATA_DIR, "validation", "")

In [3]:
raw_text_file = raw_dir + 'protein-test.txt'

parser_output_file_ground_truth = preprocessed_dir + 'protein-validation-ground-truth-annotations.txt'
parser_output_file_predicted = train_dir + 'brain_pos/greedy/128-0.08-3600-0.9-0/tagged-tuning-corpus'

output_file_ground_truth = validation_dir + "protein-validation-ground-truth.txt"
output_file_predicted = validation_dir + "protein-validation-predicted.txt"

print "ground truth file", parser_output_file_ground_truth
print "predicted file", parser_output_file_predicted

ground truth file /media/thomas/5849-28A2/rnndir_3/preprocessed/protein-validation-ground-truth-annotations.txt
predicted file /media/thomas/5849-28A2/rnndir_3/train/brain_pos/greedy/128-0.08-3600-0.9-0/tagged-tuning-corpus


In [4]:
def convert_conll_to_entity_format(parser_output_file, output_file):
    """Extract protein entities from a tagged CONLL-style file and write them as TSV.

    The input is read as tab-separated rows. Rows with fewer than four columns
    (for example blank separator lines) are skipped and do not advance the
    token index. Column 1 of each remaining row is the token and column 3 is
    its tag. Token indices are 1-based and run continuously over the file.

    A ``BPROTEIN`` tag opens a new entity (closing any entity still open), an
    ``O`` tag closes the open entity, and any other tag extends the open
    entity; such a tag is ignored when no entity is open.

    Args:
        parser_output_file: path of the tagged CONLL-style input file.
        output_file: path of the TSV file to write. It gets the header
            ``ExampleID, Class, Start, End, Entity`` followed by one row per
            entity, with IDs ``T1``, ``T2``, ...

    Returns:
        A list with one string per entity, in file order, of the form
        ``"Protein <start> <end> <tokens joined by single spaces>"``.
    """
    import csv

    proteins = []
    current_protein = None
    token_index = 0

    with open(parser_output_file, "r") as f:
        # QUOTE_NONE: tokens are raw text. With the default quoting a token
        # that is a literal '"' would open a quoted field and swallow the
        # following rows up to the next '"', corrupting the token indices.
        tsvin = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
        for cols in tsvin:
            if len(cols) < 4:
                continue
            token_index += 1

            token = cols[1]
            tag = cols[3]

            # An outside tag or the start of a new entity ends the open entity.
            if tag in ("O", "BPROTEIN") and current_protein is not None:
                proteins.append((current_protein["start"], current_protein["end"], current_protein["tokens"]))
                current_protein = None
            if tag == "BPROTEIN":
                current_protein = {"start": token_index, "tokens": []}
            if current_protein is not None:
                current_protein["end"] = token_index
                current_protein["tokens"].append(token)

    # The file may end while an entity is still open.
    if current_protein is not None:
        proteins.append((current_protein["start"], current_protein["end"], current_protein["tokens"]))

    result = []
    with open(output_file, "w") as f_test:
        output_writer = csv.writer(f_test, delimiter='\t')
        output_writer.writerow(["ExampleID", "Class", "Start", "End", "Entity"])
        for index, protein in enumerate(proteins):
            cols = ["T" + str(index + 1), "Protein", str(protein[0]), str(protein[1]), " ".join(protein[2])]
            output_writer.writerow(cols)
            # The ID is dropped so entries from different files can be compared.
            result.append(" ".join(cols[1:]))
    return result

# Compare the ground truth and predicted files

In [5]:
results = dict()
for key, parser_output_file, output_file in [("groundtruth", parser_output_file_ground_truth, output_file_ground_truth),
                                       ("predicted", parser_output_file_predicted, output_file_predicted)]:
    results[key] = convert_conll_to_entity_format(parser_output_file, output_file)
    print len(results[key]), key, "entities present"

1722 groundtruth entities present
1732 predicted entities present


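Now work out how many entities the ground truth and the predictions have in common. An entity only counts as shared when its class, start index, end index and tokens are all identical in both files.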

In [6]:
num_intersection = 0
all_predicted = set(results["predicted"])
for g in results["groundtruth"]:
    if g in all_predicted:
        num_intersection += 1
print num_intersection, "of the entities are present in groundtruth and predicted"

1503 of the entities are present in groundtruth and predicted


A quick overview of the first ten entities found in the ground truth and in the predictions (each shown as its class, start and end token indices, and then its tokens)

In [7]:
# First ten ground-truth entities.
results["groundtruth"][:10]

['Protein 13 14 CD 40',
 'Protein 39 40 CD 40',
 'Protein 46 47 CD 40',
 'Protein 71 73 beta - globin',
 'Protein 109 112 B 7 . 1',
 'Protein 116 119 B 7 . 1',
 'Protein 147 150 B 7 . 1',
 'Protein 208 210 GATA - 3',
 'Protein 217 219 IL - 5',
 'Protein 232 234 GATA - 3']

In [8]:
# First ten predicted entities.
results["predicted"][:10]

['Protein 13 14 CD 40',
 'Protein 39 40 CD 40',
 'Protein 46 47 CD 40',
 'Protein 71 73 beta - globin',
 'Protein 208 210 GATA - 3',
 'Protein 217 219 IL - 5',
 'Protein 232 234 GATA - 3',
 'Protein 243 245 IL - 5',
 'Protein 256 258 GATA - 3',
 'Protein 272 274 IL - 5']

# Calculate precision, recall and F-score

In [9]:
precision = 1.0 * num_intersection / (len(results["predicted"]))
recall = 1.0 * num_intersection / (len(results["groundtruth"]))

f1 = 2.0 * precision * recall / (precision + recall)

print "precision", precision
print "recall", recall
print "f1", f1

precision 0.867782909931
recall 0.872822299652
f1 0.870295309786
