# Welcome!

This notebook is a tutorial of how to do named entity recognition (NER) using the CoNLL-2003 baseline model. 
In the baseline model, the training involves creating a "dictionary" that records all the named entities encountered in the train dataset.

During inference, a phrase is tagged if it appears in the "dictionary"


## Import

In [11]:

import pandas as pd




## Util functions for data preparation


In [12]:
def preprocess(filename,delimiter=','):
    
    with open(filename) as myfile:
            data = myfile.readlines()
            data = [i.rstrip('\n') for i in data]

    data = [i.split(delimiter) for i in data]



    for i in data:
                if i != [''] and i!=[]:
                    del i[1]
                    del i[1]  # delete the middle 2 columns from the data
    for i in range(0, len(data)):
                if data[i] == [''] or data[i]==[]:
                    data[i] = ["", "O"]


    return data


In [13]:
def word2sents(data_string_list):
    data = list()
    X = list()
    Y = list()
    for data_string in data_string_list:

        if data_string == ['', 'O'] or data_string == ['']:
            if X == ['-DOCSTART-']:
                X = list()
                Y = list()
                continue

            data.append((X, Y))
            X = list()
            Y = list()
        else:

            X.append(data_string[0])
            Y.append(data_string[-1])

    if len(X) > 0:
        data.append((X, Y))

    data = [x for x in data if len(x) != 0]

    return data


## Data preparation

In [None]:

tags_space=['B-PER', 'B-LOC', 'B-ORG','B-MISC', 'I-PER', 'I-LOC', 'I-ORG', 'I-MISC', 'O']

"""
the data should be in CoNLL-2003 dataset's format.
each line looks like this:
(this tutorial does not provide the CoNLL-2003 annotated dataset)

-DOCSTART- -X- O O

EU NNP I-NP I-ORG
rejects VBZ I-VP O
German JJ I-NP I-MISC
call NN I-NP O
to TO I-VP O
boycott VB I-VP O
British JJ I-NP I-MISC
lamb NN I-NP O
. . O O

"""

train_path= 'your_path' #you need to put train path here
test_path='your_path' #you need to put test path here

train=preprocess(train_path, delimiter=' ')
test=preprocess(test_path, delimiter=' ')




train_set=word2sents(train)


test_set=word2sents(test)

In [15]:

y_truth=[x[1] for x in test_set]
test_tokens=[x[0] for x in test_set]

# Training

## Util functions for training

In [16]:
def segmentize(lis):
    #for a segment, the end is inclusive, ie, (3,4) means 3rd and 4th token forms a segment

    segs=[]

    currenttag=lis[0]
    buffer=0

    for i in range(1,len(lis)):
        if lis[i]!=currenttag:
            if currenttag!='O' and len(currenttag)>1:
                currenttag=currenttag[2:]
            segs.append((buffer,i-1,currenttag))
            currenttag=lis[i]
            if lis[i].startswith('B-'):
                currenttag=lis[i].replace('B-','I-')
            buffer=i

    if currenttag != 'O' and len(currenttag)>1:
        currenttag = currenttag[2:]
    segs.append((buffer,len(lis)-1,currenttag))

    return segs



In [17]:


def get_class_priority(entity_class):
    # Define the priority for the classes (higher values mean higher priority)
    class_priority = {
        "LOC": 4,
        "ORG": 3,
        "PER": 2,
        "MISC": 1
    }
    # Return the priority of the class, default to 0 if class is not found
    return class_priority.get(entity_class, 0)



## Training step

In [18]:


entity_dictionary={}


for i in range(0,len(train_set)):
    sentence=train_set[i]
    tokens=sentence[0]
    tags=sentence[1]
    tags=segmentize(tags)
    tags=[x for x in tags if x[2]!='O']

    for ent in tags:
        phrase=tokens[ent[0]:ent[1]+1]
        phrase=' '.join(phrase)
        ent_class=ent[2]


        if phrase not in entity_dictionary:
            # If the phrase doesn't exist, initialize an empty dictionary for class counters
            entity_dictionary[phrase] = {}

        # If the class already exists for this phrase, increment the counter
        if ent_class in entity_dictionary[phrase]:
            entity_dictionary[phrase][ent_class] += 1
        else:
            # Otherwise, initialize the counter for this class
            entity_dictionary[phrase][ent_class] = 1


entity_dictionary = {key: entity_dictionary[key] for key in sorted(entity_dictionary)}




for phrase, classes in entity_dictionary.items():
    # If the phrase has more than one class, find the one with the highest counter
    if len(classes) > 1:
        # Sort classes by counter and select the class with the highest count
        max_count = max(classes.values())

        # Step 2: Filter classes that have the highest count
        candidate_classes = [cls for cls, count in classes.items() if count == max_count]

        # Step 3: Apply tiebreaking rule to choose the class with the highest priority. See the "get_class_priority" function. You can customize it according to your preference
        best_class = max(candidate_classes, key=get_class_priority)

        # Keep only the class with the highest counter
        entity_dictionary[phrase] = {best_class: classes[best_class]}


# Inference

## Utils for inference

In [19]:
def baseline_tag(test_sentence, entity_dictionary, minoccur=0 ): 


    sentence_length = len(test_sentence)

    prefixes = set()
    for key in entity_dictionary:
        key_tokens = key.split()
        for i in range(1, len(key_tokens) + 1):
            prefixes.add(' '.join(key_tokens[:i]))

    prefixes=sorted(list(prefixes))


    found_entities=[]

    i = 0
    while i <  sentence_length:
        buffer = test_sentence[i]  # Start the buffer with the current token
        buffer_str = buffer

        j = i + 1  # Pointer to track subsequent tokens
        last_match = None  # To store the longest valid match

        # Keep expanding the buffer while it matches any dictionary key
        while j <= sentence_length:
            # Join the tokens in the buffer and check against the entity dictionary
            if buffer_str in prefixes:
                # Check if buffer_str is a valid entity in the dictionary
                if buffer_str in entity_dictionary:
                    ent_class, counter = list(entity_dictionary[buffer_str].items())[0]

                    last_match = (buffer_str, (i, j-1), ent_class, counter)

                # If more tokens are available, add them to buffer_str
                if j < sentence_length:
                    buffer_str += ' ' + test_sentence[j]
            else:
                break


            j += 1  # Expand the buffer by adding one more token

        # If we found a match, append the longest match found to the result list
        if last_match is not None:
            found_entities.append(last_match)

        if last_match is not None:
            i = j-1  # Move the index to the position after the matched entity
        else:
            i = i + 1  # Move to the next token if no match was found

    found_entities= [x for x in found_entities if x[3]>=minoccur]
    found_entities = sorted(found_entities, key=lambda x: x[1][0])




    predlist = ["O"] * sentence_length

    for entity, (start_idx, end_idx), entity_class, _ in found_entities:
        if start_idx > 0 and predlist[start_idx - 1].endswith(entity_class):
            # Mark the first token with "B-" if the previous token is part of the same class
            predlist[start_idx] = f"B-{entity_class}"
        else:
            # Otherwise, mark the first token with "I-"
            predlist[start_idx] = f"I-{entity_class}"

        # Mark the rest of the tokens in the entity with "I-"
        for i in range(start_idx + 1, end_idx + 1):
            predlist[i] = f"I-{entity_class}"

    return predlist






# Inference step

In [20]:
predictions=[]
for i in range(0,len(test_tokens)):

    test_sentence= test_tokens[i]

    predlist=baseline_tag(test_sentence,entity_dictionary)

    predictions.append(predlist)


0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400


# Evaluation

## Utils for evaluation

In [21]:

def endofphrase(prev, current):#if the previous word is the last word of a NE phrase, then returns true
    answer=False
    if prev.startswith("B") and current.startswith("B"):
        answer=True
    if prev.startswith("B") and current.startswith("O"):
        answer=True
    if prev.startswith("I") and current.startswith("B"):
        answer=True
    if prev.startswith("I") and current.startswith("O"):
        answer=True
    if prev!="O" and current!="O" and prev[2:]!=current[2:]:
        answer=True
    return answer




def startofphrase(prev, current):  #if the current word is the first word of a NE phrase, then returns true
    answer=False
    if current.startswith("B"):
        answer=True
    if prev.startswith("O") and current.startswith("I"):
        answer=True
    if prev!="O" and current!="O" and prev[2:]!=current[2:]:
        answer=True
    return answer


In [22]:
def performance_basic(predlist,truelist):
    if len(predlist)!=len(truelist):
        #sanity check
        print("not same!!!")
        return None
    total = len(predlist)
    tp = 0
    retrieved = 0
    relevant = 0
    correct=0

    check=False #checks if the prediction matches the true tag. this turns true when both detects a start a phrase. It remains true until an error occurs, or if the phrase ends


    if total>1:

        #loop through the sentence. True positive, retrieved and relevant are all counted on a phrase-basis (one phrase is counted as 1)

        for i in range(0,total):
            if i==0:
                trueprev='O'
                predprev='O'
            else:
                trueprev = truelist[i - 1]
                predprev = predlist[i - 1]


            truecurrent=truelist[i]
            predcurrent = predlist[i]


            if startofphrase(trueprev,truecurrent)==True:
                relevant+=1

            if startofphrase(predprev,predcurrent)==True:
                retrieved+=1

            if check==True:
                if endofphrase(trueprev,truecurrent) ==True and endofphrase(predprev,predcurrent)==True and trueprev[2:]==predprev[2:]:
                    tp+=1
                    check=False
                if truecurrent[2:]!=predcurrent[2:] or endofphrase(trueprev,truecurrent)!=endofphrase(predprev,predcurrent):
                    check=False

            if startofphrase(trueprev, truecurrent) == True and startofphrase(predprev,predcurrent) == True and truecurrent[2:] == predcurrent[2:]:
                        check = True
                
        if check==True: #this is to fill in the gap of the for-loop above. if the last word is in a NE and so far the check is ok , then this is also a true positive
            tp+=1

    elif total==1: #one-token sentence

        if truelist[0] == predlist[0]:
            correct += 1


        if truelist[0] != "O":
            relevant += 1

        if predlist[0] != "O":
            retrieved += 1
        if truelist[0] == predlist[0] and truelist[0]!='O':
            tp+=1


    """
    #you can print it if you want, to inspect it
    print("Total:",total)
    print("Releant:" , relevant)
    print("Retrieved:" , retrieved)
    print("TP:" , tp)
    """

    return relevant,retrieved,tp


## Evaulation step

In [23]:



tp = 0
retrieved = 0
relevant= 0


results=[]


for i in range(0,len(predictions)):

    truelist=y_truth[i]
    predlist=predictions[i]
    relevant,retrieved,tp=performance_basic(predlist,truelist)

    result=(relevant,retrieved,tp)



    results.append(result)



relevant = [x[0] for x in results]
relevant = sum(relevant)

retrieved = [x[1] for x in results]
retrieved = sum(retrieved)

tp = [x[2] for x in results]
tp = sum(tp)



try:
    precision = tp / retrieved
except:
    precision=0


try:
    recall = tp / relevant
except:
    recall=0

try:
    fscore = 2 * precision * recall / (precision + recall)
except:
    fscore=0


precision=round(precision*100,1)
recall=round(recall*100,1)
fscore=round(fscore*100,1)



#print(relevant)
print(f"Baseline &  {precision} & {recall} & {fscore} & {tp} & {retrieved} & {relevant} ")




Baseline &  72.0 & 50.9 & 59.7 & 2875 & 3991 & 5648 


# Conclusions

We have shown how to do NER using HMM. If trained and tested CoNLL-2003 dataset,
it should have 72% precision, 50.9% recall and 59.7% F-score.