In [1]:
import spacy
import pandas as pd

# Load the `en_core_web_sm` spaCy pipeline once; tokenize() and pos() both call this shared object.
nlp = spacy.load("en_core_web_sm")


# Function for tokenising text, keeping whitespace as separate tokens

In [2]:
#tokenize text

def tokenize(raw):
    """Split ``raw`` with the shared spaCy pipeline, keeping inter-token whitespace.

    Returns a flat list of strings in document order: each token's text,
    followed by that token's trailing whitespace string whenever it is
    non-empty.
    """
    pieces = []
    for tok in nlp(raw):
        trailing = tok.whitespace_
        # Only emit the whitespace entry when there actually is whitespace.
        pieces.extend([tok.text, trailing] if trailing else [tok.text])
    return pieces

# Function for obtaining the POS tag of each token

In [3]:
def pos(raw):
    """Map every token string of ``raw`` to its spaCy ``pos_`` tag.

    Keys are ``str(token)`` and values ``str(token.pos_)``. A token string
    that occurs more than once keeps the tag of its last occurrence, because
    later entries overwrite earlier ones.
    """
    return {str(tok): str(tok.pos_) for tok in nlp(raw)}

# Illustrative example on English Legal Train set

In [29]:

# Location of the English legal training split (home-relative; adjust to your checkout).
TRAIN_JSON = '~/github/diptesh/PR-AAAI22-SDU-ST1-AE/data/english/legal/train.json'
data = pd.read_json(TRAIN_JSON, encoding='utf8')

In [30]:
# Add a 'tokenized' column: tokenize() applied to every entry of 'text'.
data['tokenized'] =data['text'].apply(tokenize)

In [31]:
# Add a 'pos_dict' column: pos() applied to every entry of 'text'.
# Note: pos() runs the spaCy pipeline on each text again, in addition to the pass made by tokenize().
data['pos_dict'] = data['text'].apply(pos)

In [32]:
# Rename 'long-forms' to 'longforms': the row-wise lambdas further down read the column with
# attribute access (x.longforms), and a hyphenated name cannot be used as a Python attribute.
data = data.rename(columns={"long-forms": "longforms"})
data.head()

Unnamed: 0,text,acronyms,longforms,ID,tokenized,pos_dict
0,12).; Terms of reference A Correspondence Gro...,"[[194, 199]]","[[164, 192]]",1,"[12, ), ., ;, , , Terms, , of, , reference...","{'12': 'NUM', ')': 'PUNCT', '.': 'PUNCT', ';':..."
1,The comprehensive list of currently identifie...,"[[233, 238]]",[],2,"[ , The, , comprehensive, , list, , of, , ...","{' ': 'SPACE', 'The': 'DET', 'comprehensive': ..."
2,Subregional activities for development Legisl...,"[[142, 147]]","[[85, 140]]",3,"[ , Subregional, , activities, , for, , dev...","{' ': 'SPACE', 'Subregional': 'ADJ', 'activiti..."
3,OIOS recommended that Secretariat programmes t...,"[[239, 247], [142, 146], [0, 4]]","[[167, 237]]",4,"[OIOS, , recommended, , that, , Secretariat...","{'OIOS': 'PROPN', 'recommended': 'VERB', 'that..."
4,98. The Ministry of Education and Culture has...,"[[82, 86]]","[[71, 80]]",5,"[ , 98, ., , The, , Ministry, , of, , Educ...","{' ': 'SPACE', '98': 'NUM', '.': 'PUNCT', 'The..."


# The function for annotating with BIO:
 1. Long-forms are annotated as: B-LF, I-LF
 2. Acronyms are annotated as: B-AC (all sub-tokens are concatenated, e.g. the tokens of "un-women" are treated as one acronym)
 3. Everything else is annotated as: B-O

In [38]:
def bio(text,longforms,acronyms,tokens):
    """Label a token sequence with BIO tags for long-forms and acronyms.

    Character offsets are tracked by summing token lengths, so ``tokens``
    must contain every character of the text in order (whitespace included).

    Args:
        text: Raw sentence. Not used; kept so existing call sites still work.
        longforms: Iterable of ``[start, end]`` character spans of long-forms.
        acronyms: Iterable of ``[start, end]`` character spans of acronyms.
        tokens: Token strings in text order, whitespace strings included.

    Returns:
        A list of ``(token, label)`` tuples. A token starting exactly at a
        long-form ``start`` gets ``'B-LF'``, tokens strictly inside the span
        get ``'I-LF'``. All tokens of one acronym span are concatenated into
        a single ``'B-AC'`` entry. Every other emitted token gets ``'B-O'``.
        The separators ``' '``, ``'('`` and ``')'`` are dropped unless an
        acronym is being assembled.
    """
    separators = (' ', '(', ')')
    sent_labeled = []
    char_index = 0
    acr_acc = ''     # text collected so far for the acronym being assembled
    acr_end = None   # end offset of that acronym's span; None when none is open

    for w in tokens:
        # Separators are skipped only while no acronym is open. The separator
        # that directly follows an acronym is therefore still processed below,
        # where it closes the acronym and is itself emitted as 'B-O'.
        if acr_end is None and w in separators:
            char_index += len(w)
            continue

        labelled = False

        # Long-forms: the first matching span in the list wins.
        for span in longforms:
            if char_index == span[0]:
                sent_labeled.append((w, 'B-LF'))
                labelled = True
                break
            if span[0] < char_index < span[1]:
                sent_labeled.append((w, 'I-LF'))
                labelled = True
                break

        # Close the open acronym once the offset reaches the end of ITS OWN
        # span. Comparing against every span in `acronyms` would cut a
        # multi-token acronym short whenever another acronym ends earlier in
        # the text.
        if acr_end is not None and char_index >= acr_end:
            sent_labeled.append((acr_acc, 'B-AC'))
            acr_acc = ''
            acr_end = None

        for span in acronyms:
            if char_index == span[0] or span[0] < char_index < span[1]:
                acr_acc += w
                # A token inside a span also opens the acronym when none is
                # open (span start not aligned with a token boundary), so the
                # collected text is emitted rather than silently dropped.
                if acr_end is None:
                    acr_end = span[1]
                labelled = True
                break

        if not labelled:
            sent_labeled.append((w, 'B-O'))
        char_index += len(w)

    # An acronym that runs up to the last token has no following token to
    # trigger the close above, so emit it here.
    if acr_end is not None:
        sent_labeled.append((acr_acc, 'B-AC'))

    return sent_labeled

# The function for annotating with BIO and POS

In [4]:
def pos_bio(text,longforms,acronyms,tokens, pos_dict):
    """Label a token sequence with POS tags plus BIO tags for long-forms and acronyms.

    Character offsets are tracked by summing token lengths, so ``tokens``
    must contain every character of the text in order (whitespace included).

    Args:
        text: Raw sentence. Not used; kept so existing call sites still work.
        longforms: Iterable of ``[start, end]`` character spans of long-forms.
        acronyms: Iterable of ``[start, end]`` character spans of acronyms.
        tokens: Token strings in text order, whitespace strings included.
        pos_dict: Mapping from token string to POS tag. It is only read;
            tokens missing from it are tagged ``'PUNCT'``.

    Returns:
        A list of ``(token, pos, label)`` tuples. A token starting exactly at
        a long-form ``start`` gets ``'B-LF'``, tokens strictly inside the span
        get ``'I-LF'``. All tokens of one acronym span are concatenated into
        a single ``'B-AC'`` entry tagged ``'PROPN'``. Every other emitted
        token gets ``'B-O'``. The separators ``' '``, ``'('`` and ``')'`` are
        dropped unless an acronym is being assembled.
    """
    separators = (' ', '(', ')')
    missing_pos = 'PUNCT'    # tag used for tokens that have no entry in pos_dict
    acr_acc_pos = 'PROPN'    # tag given to every assembled acronym
    sent_labeled = []
    char_index = 0
    acr_acc = ''     # text collected so far for the acronym being assembled
    acr_end = None   # end offset of that acronym's span; None when none is open

    for w in tokens:
        # Separators are skipped only while no acronym is open. The separator
        # that directly follows an acronym is therefore still processed below,
        # where it closes the acronym and is itself emitted as 'B-O'.
        if acr_end is None and w in separators:
            char_index += len(w)
            continue

        labelled = False
        # Look the tag up without writing the fallback back into the caller's
        # dict, and use the same fallback for every label (B-LF included).
        w_pos = pos_dict.get(w, missing_pos)

        # Long-forms: the first matching span in the list wins.
        for span in longforms:
            if char_index == span[0]:
                sent_labeled.append((w, w_pos, 'B-LF'))
                labelled = True
                break
            if span[0] < char_index < span[1]:
                sent_labeled.append((w, w_pos, 'I-LF'))
                labelled = True
                break

        # Close the open acronym once the offset reaches the end of ITS OWN
        # span. Comparing against every span in `acronyms` would cut a
        # multi-token acronym short whenever another acronym ends earlier in
        # the text.
        if acr_end is not None and char_index >= acr_end:
            sent_labeled.append((acr_acc, acr_acc_pos, 'B-AC'))
            acr_acc = ''
            acr_end = None

        for span in acronyms:
            if char_index == span[0] or span[0] < char_index < span[1]:
                acr_acc += w
                # A token inside a span also opens the acronym when none is
                # open (span start not aligned with a token boundary), so the
                # collected text is emitted rather than silently dropped.
                if acr_end is None:
                    acr_end = span[1]
                labelled = True
                break

        if not labelled:
            sent_labeled.append((w, w_pos, 'B-O'))
        char_index += len(w)

    # An acronym that runs up to the last token has no following token to
    # trigger the close above, so emit it here.
    if acr_end is not None:
        sent_labeled.append((acr_acc, acr_acc_pos, 'B-AC'))

    return sent_labeled

In [92]:
# (rows, columns) of the frame, including the derived 'tokenized' and 'pos_dict' columns.
data.shape

(3564, 6)

# Apply the annotation functions to the dataframe row by row


In [94]:
# Label every row with bio(); axis=1 hands each row to the lambda.
bio_eng_train_legal = data.apply(
    lambda row: bio(row['text'], row['longforms'], row['acronyms'], row['tokenized']),
    axis=1,
)

In [95]:
# Label every row with pos_bio(); axis=1 hands each row to the lambda.
pos_bio_eng_train_legal = data.apply(
    lambda row: pos_bio(
        row['text'], row['longforms'], row['acronyms'], row['tokenized'], row['pos_dict']
    ),
    axis=1,
)

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 


# Save the BIO and POS+BIO lists to txt files, with each token and its annotation(s) on one line

In [98]:
# Preview the per-row lists of (token, label) tuples.
bio_eng_train_legal

0       [(12, B-O), (., B-O), (;, B-O), (Terms, B-O), ...
1       [(The, B-O), (comprehensive, B-O), (list, B-O)...
2       [(Subregional, B-O), (activities, B-O), (for, ...
3       [(OIOS, B-AC), ( , B-O), (recommended, B-O), (...
4       [(98, B-O), (., B-O), (The, B-O), (Ministry, B...
                              ...                        
3559    [(The, B-O), (Twenty, B-O), (-, B-O), (fourth,...
3560    [(UNCITRAL, B-AC), ( , B-O), (Model, B-O), (La...
3561    [(62, B-O), (., B-O), (Some, B-O), (of, B-O), ...
3562    [(It, B-O), (stressed, B-O), (the, B-O), (impo...
3563    [(44, B-O), (., B-O), (At, B-O), (the, B-O), (...
Length: 3564, dtype: object

In [99]:
# Preview the per-row lists of (token, pos, label) tuples.
pos_bio_eng_train_legal

0       [(12, NUM, B-O), (., PUNCT, B-O), (;, PUNCT, B...
1       [(The, DET, B-O), (comprehensive, ADJ, B-O), (...
2       [(Subregional, ADJ, B-O), (activities, NOUN, B...
3       [(OIOS, PROPN, B-AC), ( , SPACE, B-O), (recomm...
4       [(98, NUM, B-O), (., PUNCT, B-O), (The, DET, B...
                              ...                        
3559    [(The, DET, B-O), (Twenty, NUM, B-O), (-, PUNC...
3560    [(UNCITRAL, PROPN, B-AC), ( , PUNCT, B-O), (Mo...
3561    [(62, NUM, B-O), (., PUNCT, B-O), (Some, DET, ...
3562    [(It, PRON, B-O), (stressed, VERB, B-O), (the,...
3563    [(44, NUM, B-O), (., PUNCT, B-O), (At, ADP, B-...
Length: 3564, dtype: object

In [67]:
# Full labelling of the first row, to inspect the tags token by token.
bio_eng_train_legal[0]

[('12', 'B-O'),
 ('.', 'B-O'),
 (';', 'B-O'),
 ('Terms', 'B-O'),
 ('of', 'B-O'),
 ('reference', 'B-O'),
 ('A', 'B-O'),
 ('Correspondence', 'B-O'),
 ('Group', 'B-O'),
 ('coordinated', 'B-O'),
 ('by', 'B-O'),
 ('the', 'B-O'),
 ('expert', 'B-O'),
 ('of', 'B-O'),
 ('the', 'B-O'),
 ('United', 'B-O'),
 ('Kingdom', 'B-O'),
 (',', 'B-O'),
 ('and', 'B-O'),
 ('including', 'B-O'),
 ('representatives', 'B-O'),
 ('of', 'B-O'),
 ('affected', 'B-O'),
 ('industries', 'B-O'),
 ('and', 'B-O'),
 ('users', 'B-O'),
 ('of', 'B-O'),
 ('intermediate', 'B-LF'),
 ('bulk', 'I-LF'),
 ('containers', 'I-LF'),
 ("IBC's", 'B-AC'),
 (')', 'B-O'),
 (',', 'B-O'),
 ('shall', 'B-O'),
 ('examine', 'B-O'),
 ('current', 'B-O'),
 ('practices', 'B-O'),
 ('of', 'B-O'),
 ('UN', 'B-O'),
 ('composite', 'B-O'),
 ('IBC', 'B-O'),
 ('re', 'B-O'),
 ('-', 'B-O'),
 ('bottling', 'B-O'),
 ('and', 'B-O'),
 ('cross', 'B-O'),
 ('bottling', 'B-O'),
 ('in', 'B-O'),
 ('several', 'B-O'),
 ('countries', 'B-O'),
 ('.', 'B-O')]

In [10]:
# Write one "token label" line per labelled token. The encoding is set explicitly because the
# input was read as UTF-8; without it open() uses the platform's locale encoding, which can
# fail on or corrupt non-ASCII characters.
# NOTE(review): sentences are written back to back with no separator line between them.
with open('bio_eng_leg_train.txt', 'w', encoding='utf8') as outfile:
    for sentence in bio_eng_train_legal:
        for labelled_token in sentence:
            outfile.write(" ".join(map(str, labelled_token)) + '\n')

In [101]:
# Write one "token pos label" line per labelled token. The encoding is set explicitly because
# the input was read as UTF-8; without it open() uses the platform's locale encoding, which can
# fail on or corrupt non-ASCII characters.
# NOTE(review): sentences are written back to back with no separator line between them.
with open('pos_bio_eng_leg_train.txt', 'w', encoding='utf8') as outfile:
    for sentence in pos_bio_eng_train_legal:
        for labelled_token in sentence:
            outfile.write(" ".join(map(str, labelled_token)) + '\n')