In [1]:
import flair, torch
# Pin flair to the CPU. This has to run before any model is instantiated,
# because flair reads flair.device when it moves models and tensors.
flair.device = torch.device('cpu') 

By default, the model is placed on the GPU if one is available and runs on the CPU otherwise.
The flair.device setting is read throughout the flair code to move models and tensors to the device on which flair runs.
To change this behavior (for instance, to force flair to run on the CPU even if you have a GPU available), run this code before instantiating your model.

In [8]:
import pandas as pd

# Read the second worksheet (sheet_name=1). Its first data row holds the real
# column names, so promote that row to the header and drop it from the body.
train = pd.read_excel('PII_Train_Large_Data_Test_Data.xlsx', sheet_name=1)
train.columns = train.iloc[0].to_numpy()
train = train.iloc[1:].reset_index(drop=True)
train.head(101)

Unnamed: 0,Text,Labels,PII
0,Candidate economic character present money dau...,Address,Apt. 026
1,Film range sound. People age Apt. 476 that.,Address,Apt. 476
2,Back want myself class certain. Tree pretty ca...,Address,Suite 492
3,Bring guy 81627 Kimberly Squares Washingtonber...,Address,"81627 Kimberly Squares Washingtonberg, RI 13540"
4,"52013 Jason Vista Lake Kathleen, PA 89168 May ...",Address,"52013 Jason Vista Lake Kathleen, PA 89168"
...,...,...,...
96,Speech national especially available own black...,Address,Apt. 646
97,Forward 8278 Torres Branch Apt. 177 Robertsbur...,Address,"8278 Torres Branch Apt. 177 Robertsburgh, NJ 0..."
98,Test artist person billion. Trouble staff indu...,Address,"4058 Gordon Fields South Charlestown, NJ 40537"
99,Group think step increase answer know. Agreeme...,Address,Apt. 212


In [3]:
# Show only the first row of the loaded frame.
train.head(1)

Unnamed: 0,Text,Labels,PII
0,Candidate economic character present money dau...,Address,Apt. 026


In [4]:
# The Flair library requires the training data to be in this format.
# Convert the PII column to strings, because some of its values are integers.

In [5]:
# Cast every PII value to text and upper-case the label names.
train = train.assign(
    PII=train['PII'].astype(str),
    Labels=train['Labels'].map(str.upper),
)

In [6]:
# Pair each PII string with its label, then wrap every pair in a one-element
# list so that each row carries a list of (entity_text, label) annotations.
annotation = list(zip(train['PII'], train['Labels']))
aaa = [[pair] for pair in annotation]
train['annotation'] = aaa
# Labels and PII now live inside `annotation`, so drop them; .loc slicing is
# label-based and inclusive, so this keeps index labels 0 through 699.
train = train.drop(columns=['Labels', 'PII']).loc[:699]
train.columns = ['text', 'annotation']
train

Unnamed: 0,text,annotation
0,Candidate economic character present money dau...,"[(Apt. 026, ADDRESS)]"
1,Film range sound. People age Apt. 476 that.,"[(Apt. 476, ADDRESS)]"
2,Back want myself class certain. Tree pretty ca...,"[(Suite 492, ADDRESS)]"
3,Bring guy 81627 Kimberly Squares Washingtonber...,"[(81627 Kimberly Squares Washingtonberg, RI 13..."
4,"52013 Jason Vista Lake Kathleen, PA 89168 May ...","[(52013 Jason Vista Lake Kathleen, PA 89168, A..."
...,...,...
695,Coach he west magazine against 510-81-5182 bea...,"[(510-81-5182, SSN)]"
696,Speech national especially 471 33 3655 availab...,"[(471 33 3655, SSN)]"
697,Forward listen step this community financial m...,"[(008-52-3159, SSN)]"
698,Test artist person billion. Trouble staff indu...,"[(818-43-7502, SSN)]"


In [11]:
# Show the first four rows.
# NOTE(review): the saved output of this cell lists the Text/Labels/PII columns,
# which no longer exist once the annotation cell has run - it appears to have
# been executed out of order; re-run top to bottom to refresh it.
train.head(4)

Unnamed: 0,Text,Labels,PII
0,Candidate economic character present money dau...,Address,Apt. 026
1,Film range sound. People age Apt. 476 that.,Address,Apt. 476
2,Back want myself class certain. Tree pretty ca...,Address,Suite 492
3,Bring guy 81627 Kimberly Squares Washingtonber...,Address,"81627 Kimberly Squares Washingtonberg, RI 13540"


In [57]:
# Show the last 20 rows of the reshaped frame (text + annotation columns).
train.tail(20)

Unnamed: 0,text,annotation
680,Talk while American suddenly 587-05-0700 parti...,"[(587-05-0700, SSN)]"
681,Camera wall continue top us clearly hot includ...,"[(896-11-9761, SSN)]"
682,Course college still loss scene. Series 530 12...,"[(530 12 8752, SSN)]"
683,243 53 6315 Movement near value number able ab...,"[(243 53 6315, SSN)]"
684,Pull key other nor 166-43-2545 save perform. L...,"[(166-43-2545, SSN)]"
685,Ready off score foot market protect. 829 49 9139,"[(829 49 9139, SSN)]"
686,Up course education involve thousand. Treat sc...,"[(211 72 2423, SSN)]"
687,Huge develop environmental president their. Ma...,"[(423 72 5067, SSN)]"
688,Possible both develop 595-23-4635 claim. Call ...,"[(595-23-4635, SSN)]"
689,Describe question cover suggest 344 81 3132 ac...,"[(344 81 3132, SSN)]"


In [7]:
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle

def matcher(string, pattern):
    '''
    Locate the first occurrence of ``pattern`` inside ``string``.

    Parameters
    ----------
    string : str
        Text to search.
    pattern : str
        Entity text to look for; surrounding whitespace is stripped first.

    Returns
    -------
    tuple
        ``(match_list, string)``. ``match_list`` is ``[(start, end)]`` (end
        exclusive) for the first occurrence, or ``[]`` when the pattern is
        blank or absent. The returned ``string`` is the input with that
        occurrence overwritten by ``"X"`` characters of the same length, so
        offsets stay valid and a later search cannot hit the same span again.
    '''
    match_list = []
    pattern = pattern.strip()

    # A blank pattern would "match" at offset 0 with zero width and produce a
    # bogus empty entity downstream, so report it as not found.
    if not pattern:
        return match_list, string

    # Plain substring search: str.find returns the earliest occurrence, the
    # same span a full-length longest-match search would report.
    start = string.find(pattern)
    if start != -1:
        end = start + len(pattern)
        match_list.append((start, end))
        string = string[:start] + "X" * len(pattern) + string[end:]

    return match_list, string
def mark_sentence(s, match_list):
    '''
    Tag the whitespace-separated tokens of ``s`` using the BIO scheme.

    Parameters
    ----------
    s : str
        The sentence to tag.
    match_list : iterable of (start, end, e_type)
        Character spans (end exclusive) and the entity type of each span.

    Returns
    -------
    dict
        Maps each distinct token to ``'O'``, ``'B-<e_type>'`` or
        ``'I-<e_type>'``, in order of first appearance.

    Notes
    -----
    Tokens are tagged by character offset, so a token that merely touches an
    entity (for example ``"476,"`` for the entity ``"Apt. 476"``) is tagged
    itself instead of a new pseudo-token being added to the result.
    Because the result is keyed by token text, a token that occurs several
    times in ``s`` appears only once and carries the last tag assigned to it.
    '''
    # Record every token together with its character span in the sentence.
    tokens = [(m.group(), m.start(), m.end()) for m in re.finditer(r'\S+', s)]
    word_dict = {token: 'O' for token, _, _ in tokens}

    for start, end, e_type in match_list:
        if start >= end:
            # Zero-width span: there is nothing to tag.
            continue
        prefix = 'B-'
        for token, tok_start, tok_end in tokens:
            # The token overlaps the span when the two ranges intersect.
            if tok_start < end and tok_end > start:
                word_dict[token] = prefix + e_type
                prefix = 'I-'
    return word_dict

def clean(text):
    '''
    Helper that inserts one space before each punctuation character so the
    punctuation is split off from the preceding word during tokenization.

    Parameters
    ----------
    text : str
        The text to process.

    Returns
    -------
    str
        ``text`` with exactly one space added in front of every character
        listed in ``filters``; all other characters are unchanged.
    '''
    filters = ["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"]
    # Build the result in a single pass. Calling text.replace() once per
    # occurrence would rewrite every occurrence each time, so a character that
    # appears k times would end up with k spaces in front of it.
    return "".join(" " + ch if ch in filters else ch for ch in text)


In [8]:
def _bio_rows(text, match_list):
    '''
    Return ``[(token, tag), ...]`` for every whitespace token of ``text``,
    in sentence order and keeping repeated tokens.
    '''
    rows = []
    for m in re.finditer(r'\S+', text):
        tag = 'O'
        for start, end, label in match_list:
            # Skip zero-width spans; tag the token if it overlaps the span.
            if start < end and m.start() < end and m.end() > start:
                # The token that contains the span start opens the entity.
                tag = ('B-' if m.start() <= start else 'I-') + label
                break
        rows.append((m.group(), tag))
    return rows


def create_data(df, filepath):
    '''
    Write ``df`` to ``filepath`` as one ``token tag`` line per whitespace
    token, with an empty line after each sentence.

    Parameters
    ----------
    df : object
        Must expose ``text`` and ``annotation`` attributes that can be
        iterated in parallel. Each annotation is an iterable of
        ``(entity_text, label)`` items.
    filepath : str
        Output file. It is opened in ``'w'`` mode (overwritten) and written
        as UTF-8; the encoding is set explicitly so the output does not
        depend on the platform default.

    Notes
    -----
    An entity whose text cannot be found in its sentence is skipped with a
    warning instead of aborting the export with an IndexError.
    '''
    import warnings

    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            match_list = []
            # Search a progressively masked copy so that two annotations with
            # the same text resolve to different occurrences; masking keeps
            # the length, so offsets remain valid for the original text.
            searchable = text
            for item in annotation:
                found, searchable = matcher(searchable, item[0])
                if not found:
                    warnings.warn(
                        'entity %r (%s) not found in text %r; skipped'
                        % (item[0], item[1], text)
                    )
                    continue
                match_list.append((found[0][0], found[0][1], item[1]))

            for token, tag in _bio_rows(text, match_list):
                f.write(token + ' ' + tag + '\n')
            f.write('\n')
            
            

In [9]:
def main(filepath='C:/Users/Yamini/Documents/NER/dev.txt', df=None):
    '''
    Export a frame to a tagged text file via ``create_data``.

    Parameters
    ----------
    filepath : str, optional
        Where to save the .txt file. Call once per split (train, test and
        dev file names) instead of editing the path in the source; the
        default is the path this notebook originally used.
    df : optional
        Frame to export; it must expose ``text`` and ``annotation``.
        Defaults to the notebook-level ``train`` frame.

    NOTE(review): with the defaults, the same ``train`` frame is written
    whichever file name is used, so the dev and test files would not be
    held-out data - confirm this is intended.
    '''
    if df is None:
        df = train
    ## creating the file.
    create_data(df, filepath)

if __name__ == '__main__':
    main()


In [10]:
#Reading the Corpus:

In [11]:
# imports
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# define columns: column 0 of each line is the token, column 1 is its NER tag
columns = {0 : 'text', 1 : 'ner'}
# directory where the data resides (the three split files below are read from it)
data_folder = 'C:/Users/Yamini/Documents/NER'
# initializing the corpus from the train / test / dev files inside data_folder
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file = 'train.txt',
                              test_file = 'test.txt',
                              dev_file = 'dev.txt')

2022-01-13 09:13:18,360 Reading data from C:\Users\Yamini\Documents\NER
2022-01-13 09:13:18,368 Train: C:\Users\Yamini\Documents\NER\train.txt
2022-01-13 09:13:18,368 Dev: C:\Users\Yamini\Documents\NER\dev.txt
2022-01-13 09:13:18,368 Test: C:\Users\Yamini\Documents\NER\test.txt


In [12]:
# Number of training sentences in the corpus, and the first training sentence
# printed with its NER tags as one tagging example.
print(len(corpus.train))
print(corpus.train[0].to_tagged_string('ner'))

700
Candidate economic character present money daughter Apt. <B-ADDRESS> 026 <I-ADDRESS> world well. Open analysis center.


In [40]:
# tag to predict
tag_type = 'ner'
# make tag dictionary from the corpus
# NOTE(review): the saved output echoes this source line, which is how Python
# renders a warning - presumably a deprecation notice from flair; check the
# installed flair version's recommended replacement before upgrading.
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)


In [41]:
# List of the available tags (tag -> index) built from the corpus; they
# correspond to the PII labels of the initial data.
tag_dictionary.item2idx

{b'O': 0,
 b'B-ADDRESS': 1,
 b'I-ADDRESS': 2,
 b'B-CREDITCARDNUMBER': 3,
 b'B-EMAIL': 4,
 b'B-NAME': 5,
 b'I-NAME': 6,
 b'B-PHONE_NUMBER': 7,
 b'B-PLATES': 8,
 b'I-PLATES': 9,
 b'B-SSN': 10,
 b'I-SSN': 11,
 b'<START>': 12,
 b'<STOP>': 13}

In [42]:
# Initialize the word embeddings that feed the tagger.
from flair.embeddings import WordEmbeddings, StackedEmbeddings

from typing import List
# Individual embeddings to combine. The printed model reports 100 dimensions
# for 'glove' and 300 for 'crawl'.
embedding_types : List['TokenEmbeddings'] = [
        WordEmbeddings('glove'),
        ## other embeddings
        WordEmbeddings('crawl'),
        # TODO: ELMoEmbeddings() was considered here because it reportedly improves model accuracy significantly.

        ]
# Stack the embeddings into one; the printed model shows 400 input features
# for the layer that consumes them.
embeddings : StackedEmbeddings = StackedEmbeddings(
                                 embeddings=embedding_types)

In [43]:
#Here we initialize a Sequence Tagger. We activate Conditional Random Fields with use_crf=True flag. 
#On the backend training is being done by bi-directional LSTM.

In [44]:
from flair.models import SequenceTagger

# Build the sequence tagger on top of the stacked embeddings.
# hidden_size=256 is the LSTM hidden size (the printed model shows a
# bidirectional LSTM(400, 256)); use_crf=True enables the CRF layer.
tagger : SequenceTagger = SequenceTagger(hidden_size=256,
                                       embeddings=embeddings,
                                       tag_dictionary=tag_dictionary,
                                       tag_type=tag_type,
                                       use_crf=True)
# Print the model architecture for inspection.
print(tagger)

SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): WordEmbeddings(
      'crawl'
      (embedding): Embedding(1000001, 300)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=400, out_features=400, bias=True)
  (rnn): LSTM(400, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=14, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)


In [45]:
#Here we train our model.

In [46]:
from flair.trainers import ModelTrainer
# Train the tagger on the corpus. The first argument is the output directory;
# the inference cell later loads best-model.pt from it.
# NOTE(review): the training log reports 700 train + 700 dev + 700 test
# sentences; if all three files were exported from the same frame, the
# reported test score is not measured on held-out data - confirm.
trainer : ModelTrainer = ModelTrainer(tagger, corpus)
trainer.train('resources/taggers/example-ner',
              learning_rate=0.1,
              mini_batch_size=32,
              max_epochs=150)

2022-01-13 09:56:00,479 ----------------------------------------------------------------------------------------------------
2022-01-13 09:56:00,487 Model: "SequenceTagger(
  (embeddings): StackedEmbeddings(
    (list_embedding_0): WordEmbeddings(
      'glove'
      (embedding): Embedding(400001, 100)
    )
    (list_embedding_1): WordEmbeddings(
      'crawl'
      (embedding): Embedding(1000001, 300)
    )
  )
  (word_dropout): WordDropout(p=0.05)
  (locked_dropout): LockedDropout(p=0.5)
  (embedding2nn): Linear(in_features=400, out_features=400, bias=True)
  (rnn): LSTM(400, 256, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=512, out_features=14, bias=True)
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2022-01-13 09:56:00,488 ----------------------------------------------------------------------------------------------------
2022-01-13 09:56:00,488 Corpus: "Corpus: 700 train + 700 dev + 700 test sentences"
2022-01-13 09:56:00,488 -----------------



2022-01-13 09:56:00,743 ----------------------------------------------------------------------------------------------------
2022-01-13 09:56:01,302 epoch 1 - iter 2/22 - loss 3.93779930 - samples/sec: 114.53 - lr: 0.100000
2022-01-13 09:56:01,809 epoch 1 - iter 4/22 - loss 3.14496055 - samples/sec: 126.22 - lr: 0.100000
2022-01-13 09:56:02,209 epoch 1 - iter 6/22 - loss 2.38903525 - samples/sec: 159.97 - lr: 0.100000
2022-01-13 09:56:02,659 epoch 1 - iter 8/22 - loss 1.90877115 - samples/sec: 147.26 - lr: 0.100000
2022-01-13 09:56:03,049 epoch 1 - iter 10/22 - loss 1.63348920 - samples/sec: 164.07 - lr: 0.100000
2022-01-13 09:56:03,470 epoch 1 - iter 12/22 - loss 1.47632219 - samples/sec: 152.25 - lr: 0.100000
2022-01-13 09:56:03,909 epoch 1 - iter 14/22 - loss 1.32397090 - samples/sec: 145.54 - lr: 0.100000
2022-01-13 09:56:04,277 epoch 1 - iter 16/22 - loss 1.20476542 - samples/sec: 174.31 - lr: 0.100000
2022-01-13 09:56:04,721 epoch 1 - iter 18/22 - loss 1.13077813 - samples/sec: 1

{'test_score': 0.9235167977126518,
 'dev_score_history': [0.045416316232127836,
  0.2702205882352941,
  0.4471698113207547,
  0.48785046728971965,
  0.5537583254043769,
  0.6123222748815166,
  0.6539923954372624,
  0.6438746438746439,
  0.6666666666666667,
  0.6615969581749049,
  0.6418527708850289,
  0.6666666666666667,
  0.6454248366013072,
  0.6469740634005764,
  0.6754221388367728,
  0.6717267552182162,
  0.6717267552182162,
  0.6740331491712708,
  0.6736045411542101,
  0.714501510574018,
  0.6935483870967742,
  0.6796875,
  0.7148120854826824,
  0.68801191362621,
  0.6730627306273063,
  0.7473524962178516,
  0.7481259370314842,
  0.7350180505415163,
  0.7538126361655774,
  0.7680115273775217,
  0.7546897546897546,
  0.7164612037708484,
  0.7385057471264368,
  0.7823571945047,
  0.7994248741912294,
  0.800578034682081,
  0.7706821480406386,
  0.768790264853257,
  0.7943786982248521,
  0.8065902578796561,
  0.7494521548575602,
  0.8287769784172663,
  0.8058608058608059,
  0.81112737

In [53]:
from flair.data import Sentence
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
# Load the trained model saved by the training run.
model = SequenceTagger.load('resources/taggers/example-ner/best-model.pt')
# Alternative checkpoint path kept for reference:
#'resources/taggers/example-pos/final-model.pt'
# Create an example sentence.
# NOTE(review): the saved output shows 'Apt.' split into 'Apt' and '.', while
# the training files keep 'Apt.' as a single whitespace token - confirm that
# tokenization at inference matches the training data.
sentence = Sentence('Candidate economic character present money daughter Apt. 026')
# Predict the tags (the sentence is annotated in place) and print them.
model.predict(sentence)
print(sentence.to_tagged_string('ner'))

2022-01-13 10:34:42,082 loading file resources/taggers/example-ner/best-model.pt
Candidate economic character present money daughter Apt <B-ADDRESS> . <I-ADDRESS> 026 <I-ADDRESS>
