In [3]:
# THIS IS OUR BASELINE MODEL.  STRIPS OUT THE BIO TAGS, which is an enhancement.

import operator
import re
from sklearn.metrics import f1_score, accuracy_score


class ScienceNerBaseline():
    
    # load the training set, and insert 2 'unknown' tokens between each paragraph
    def load(self,inFile) :
        txtfile = open(inFile, "r")
        self.tokens = []
        for i in txtfile :
            if i.strip() == "" :
                self.tokens.append(("<UNK>", "<UNK>"))
                self.tokens.append(("<UNK>", "<UNK>"))
                continue
            wd, tag = i.strip().split()
            if (tag == "O") :
                tag_pred = ""
            else :
                m = re.match("(.*)-(.*)", tag)
                if m != None :
                    tag_pred = m.group(2)
            self.tokens.append((wd, tag_pred))
        txtfile.close()
        
    
    # this function slides a 3-word window over the entire training set word-by-word to
    # get every trigram, and classifies the 3-gram to be whatever tag the current word is
    def countTrigrams(self):
        self.counts = defaultdict(lambda: defaultdict(lambda: 0.0))
        self.context_totals = dict()
        w_1, w_2 = None, None
        for word in self.tokens:
            
            # word and tag
            wd, tk = word
            if w_1 is not None and w_2 is not None:
                self.counts[(w_2,w_1,wd)][tk] += 1
            w_2 = w_1
            w_1 = wd

    
    # this function run the test set and calculates accuracy
    def evaluate(self, testfile) :   
        y_pred = []
        y_true = []
        txtfile = open(testfile, "r")
        testtokens = []
        for i in txtfile :
            
            # if there is a blank space in the test file, add 2 unknown tokens
            if i.strip() == "" :
                testtokens.append(("<UNK>", "<UNK>"))
                testtokens.append(("<UNK>", "<UNK>"))
                continue
            wd, tag = i.strip().split()
            testtokens.append((wd, tag))
        txtfile.close()
        
                
        w_1, w_2 = None, None
        num_correct = num_test = 0
        for word in testtokens:
            tag_true = ""
            wd, tk = word

            m = re.match("(.*)-(.*)", tk)
            if m != None :
                tag_true = m.group(2)
            
            # skip unknown words and create 2 dummy trigram tokens for beginning
            if w_1 is not None and w_2 is not None and wd is not "<UNK>":
                # context is w_2, w_1, wd
                if (w_2,w_1,wd) not in self.counts :
                    prediction = [""]
                else :
                    prediction = max(self.counts[(w_2,w_1,wd)].iteritems())
                
                # correct prediction
                y_pred += [prediction[0]]
                y_true += [tag_true]
                    
            w_2 = w_1
            w_1 = wd
            
            
        # find the number of labeled predictions - for use in precision calculation
        num_pos_predictions = len([pr for pr in y_pred if pr!=""])
        
        # find number of true labels - for use in recall calculation
        num_true_labels = len([pr for pr in y_true if pr!=""])

        num_correct = numcorrect_all = 0
        for (a,b) in zip(y_pred,y_true) :
            if (a == b) :
                numcorrect_all+=1
                if (a != "") :
                    
                    # get the number of cases correctly labeled
                    num_correct+=1

        precision = num_correct * 1.0 / num_pos_predictions
        recall = num_correct * 1.0 / num_true_labels
        accuracy = numcorrect_all * 1.0 / len(y_pred)
        f1 = 2 * precision * recall / (precision + recall)
        

        return (f1, precision, accuracy)
        

In [5]:
import os
from collections import defaultdict

modelInst = ScienceNerBaseline()

os.chdir('..')
os.chdir('data')
modelInst.load("train.txt")
modelInst.countTrigrams()
f1, precision, acc = modelInst.evaluate("test.txt")
print "Accuracy is :", acc
print "F1 is :", f1
print "Precision is :", precision



Accuracy is : 0.847214107399
F1 is : 0.0515313563442
Precision is : 0.12441314554
