In [1]:
import pandas as pd
import spacy, re
from spacy.attrs import ENT_IOB
from spacy.matcher import PhraseMatcher
from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.symbols import ORTH,LEMMA,POS
from pathlib import Path
from spacy.tokens import Span
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy.strings import StringStore
import json
from collections import Counter
import pickle

In [2]:
# Load the labelled training records: the file is read line by line and
# every line is decoded as its own JSON document.
jsonFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\data\\training\\2020-12-04_LablledStr.json"

recordDictBack = []
with open(jsonFile, 'r', encoding='utf-8') as input_file:
    recordDictBack.extend(json.loads(row) for row in input_file)

# Free-text description ('content') of every record.
descList = [record['content'] for record in recordDictBack]

# Tag list ('tagList') of every record, aligned with descList.
tagList = [record['tagList'] for record in recordDictBack]

### Munirah's processing pipeline and model

In [3]:
def change_case_gen_pattern(parsed_doc):
    """Rebuild a parsed document as a case-normalised, tidied string.

    Each token is handled on its own:

    * an all-caps token tagged NNP, NNS or NNPS is converted with
      ``str.capitalize()``;
    * a token whose lower-cased text contains one of the marker substrings
      ('name', 'bank', 'rm', ...) is lower-cased;
    * a token that is still entirely upper case is lower-cased;
    * the characters ``= * ( )`` are removed.

    Every processed token is followed by one space, then the joined string
    goes through the clean-up steps commented inline below. The final string
    (before stripping) is also handed to ``phrase_matcher`` together with a
    flag telling whether it mentions "passport".

    Args:
        parsed_doc: iterable of tokens exposing ``text`` and ``tag_``.

    Returns:
        str: the normalised text with leading/trailing whitespace stripped.
    """
    # Substrings that force the whole token to lower case (loop-invariant).
    lower_markers = ('name','cash','bank','inter','amlatfa','cdd','jalan','ic','cust','business','director', 'sole', 'proprietor','str', 'cdm','saving', 'rm', 'myr','place','branch')
    noun_tags = ("NNP", "NNS", "NNPS")

    pieces = []
    for token in parsed_doc:
        word = token.text

        # Change format of capitalized nouns into proper case.
        if token.tag_ in noun_tags and word.isupper():
            word = word.capitalize()

        # Drop a space that directly precedes punctuation inside the token.
        # Raw string: the pattern contains "\." which is not a valid escape
        # in a normal string literal.
        word = re.sub(r" (?=[\.,'!?:;])", "", word)

        # Lower-case marker tokens and anything still fully upper case.
        lowered = word.lower()
        if word.isupper() or any(marker in lowered for marker in lower_markers):
            word = lowered

        # Remove unnecessary punctuation.
        pieces.append(re.sub(r'[=*()]', r'', word.strip()))

    # One trailing space after every token (an empty document stays '').
    strg_2 = ''.join(piece + ' ' for piece in pieces)

    # Remove whitespace before certain symbols like <space><.> or <space><,>.
    strg_2 = re.sub(r'\s+([?.!,:"/\'])', r'\1', strg_2)
    strg_2 = strg_2.replace("Õ"," ").replace("ð"," ").replace("õ"," ")
    # A remaining double space (e.g. left by a token that became empty)
    # is turned into a ". " separator.
    strg_2 = strg_2.replace("  ",". ")
    # Remove the whitespace character that follows a run of '?' or '/'.
    strg_2 = re.sub(r'([?/])+\s', r'\1', strg_2)
    # Escape double quotes with a backslash.
    strg_2 = strg_2.replace('"','\\"')

    passport = 'passport' in strg_2.lower()
    phrase_matcher(strg_2, passport)

    return strg_2.strip()


# phrase matcher: shape-based patterns for ID-like strings
def phrase_matcher(strg, p_e):
    """Build a SHAPE-based PhraseMatcher and a StringStore of its labels.

    At present the function only constructs these objects: the code that
    would run the matcher over ``strg`` (and consult ``p_e``) is commented
    out below, so neither argument is read and nothing is returned.
    """
    nlp = English()

    # Example strings registered under each label, in registration order.
    examples_by_label = {
        u"STR_ID": (u"aa/025/s/2016/000019", u"aa/025/s/2016/000075"),
        u"PERSON_ID": (u"881102-08-5192",),
        # u"PERSON_ID_2": (u"661124085949", u"710905125067"),
        u"PASSPORT": (u"r711493", u"ma438972", u"a3894268"),
        u"PASSPORT_OR_NRIC": (u"706251339", u"720201106027", u"nric: 811018105265", u"710905125067"),
    }

    matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
    for label, examples in examples_by_label.items():
        matcher.add(label, None, *[nlp(example) for example in examples])

    stringstore = StringStore(list(examples_by_label))
    
#     doc = nlp(strg.lower())
#     for match_id, start, end in matcher(doc):
#         span = doc[start:end]
#     #         with open("D:\\Users\\mcazwan\\Desktop\\fisitti\\str-id-patterns.jsonl", "a") as text_file:
#     #             text_file.write('{\n"label": "STR_ID", [{"TEXT":' + span.text + '}]),\n')
        
#         #  only allow 12 characters for person_id
#         if match_id == stringstore[u"PERSON_ID"] or match_id == stringstore[u"PERSON_ID_2"]:
#             if(len(span.text.replace(' ','').replace('-','')) == 12):
#                 print("(person_id):", doc[start:end])
#         elif match_id == stringstore[u"PASSPORT"] and p_e == True:
#         # check pattern
#             if(7 <= len(span.text.replace(' ','').replace('-','')) <= 9):
#                 print("(passport):", doc[start:end])
#         elif match_id == stringstore[u"PASSPORT_OR_NRIC"]:
#             if(7 <= len(span.text.replace(' ','').replace('-','')) <= 9) and p_e == True:
#                 print("(passport-digit):", doc[start:end])
                
#             elif(len(span.text.replace(' ','').replace('-','')) == 12):
#                 ic=span.text.replace(' ','').replace('-','')
#                 if(int(ic[2:4]) <= 12 and int(ic[4:6]) <= 31 and int(ic[6:8]) <= 14):
#                     print("(nric-digit):", doc[start:end])
#         elif match_id == stringstore[u"STR_ID"]:
#             print("(str_id):", doc[start:end])
             

# Pipeline component: whitespace tokens must never carry an entity tag.
def fix_space_tags(doc):
    """Set the ENT_IOB value of every whitespace token to 2 ('O').

    The ENT_IOB values are exported with ``doc.to_array``, patched for the
    tokens whose ``is_space`` is true, reshaped to ``(len(doc), 1)`` and
    written back with ``doc.from_array``. The same ``doc`` is returned.
    """
    outside_code = 2  # IOB codes: 0 is None, I is 1, O is 2

    iob_codes = doc.to_array([ENT_IOB])
    space_positions = (position for position, tok in enumerate(doc) if tok.is_space)
    for position in space_positions:
        iob_codes[position] = outside_code

    iob_column = iob_codes.reshape((len(doc), 1))
    doc.from_array([ENT_IOB], iob_column)
    return doc

def prevent_sentence_boundaries(doc):
    """Mark every token that fails ``can_be_sentence_start`` as not a sentence start.

    Tokens that pass the check are left untouched. Returns the same ``doc``.
    """
    for tok in doc:
        if can_be_sentence_start(tok):
            continue
        tok.is_sent_start = False
    return doc


def can_be_sentence_start(token):
    """Return True when ``token`` is allowed to open a sentence.

    That is the case for the first token of the document, for a token that
    follows whitespace, and for a token that follows punctuation other than
    ':', '/' or '-'. Title-casing is deliberately not checked, so arbitrary
    title-cased tokens inside a sentence are ignored.
    """
    if token.i == 0:
        return True

    previous = token.nbor(-1)
    if previous.is_space:
        return True

    return bool(previous.is_punct and previous.text not in (':', '/', '-'))

# Base pipeline with NER disabled; the prediction cell uses it to parse the
# raw descriptions before they are re-cased.
spacyModel = 'en_core_web_md'
nlp = spacy.load(spacyModel, disable=['ner'])
nlp.add_pipe(prevent_sentence_boundaries, before="parser")

# Custom-trained NER pipeline, with the two helper components hooked in
# ahead of the NER and parser steps respectively.
munirahModel = 'D:\\Users\\figohjs\\Documents\\NLP\\Spacy\\NER\\trained-model\\NER_All_Labels_lg'
nlp2 = spacy.load(munirahModel)
nlp2.add_pipe(fix_space_tags, name="fix-ner", before="ner")
nlp2.add_pipe(prevent_sentence_boundaries, before="parser")

# Rule-based entity patterns, applied after the statistical NER.
# The ruler is built on nlp2 - the pipeline it is added to - so that its
# matchers and labels use the same vocab as the docs it will annotate
# (it was previously built on `nlp`, a different pipeline).
jsonlFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\Notebook\\patterns.jsonl"
new_ruler = EntityRuler(nlp2).from_disk(jsonlFile)
nlp2.add_pipe(new_ruler, after='ner')

In [4]:
# Display the first raw description as a quick check of the loaded data.
descList[0]

"CUSTOMER (ABDUL MUTALIB MAULA ABDUL RAHIM, IC NO. 880704525453) OPENED A SAVINGS ACCOUNT (NO. 7071379494) WITH KHOO HUN YEANG STREET, KUCHING CIMB BRANCH ON 21 AUGUST 2018. BASED ON THE BANK'S RECORD, CUSTOMER IS A SELF-EMPLOYED HAWKER.\r\r\n\r\r\nREVIEW ON THE ACCOUNT (COVERING THE PERIOD BETWEEN 21 AUGUST 2018 AND 31 MARCH 2019), TOTAL DEPOSITS RM 85,236.04 (26 COUNTS) AND TOTAL WITHDRAWALS RM 84,882.50 (66 COUNTS) WERE MADE TO CUSTOMER'S ACCOUNT. NOTICED THAT MAJORITY OF THE TRANSACTIONS WERE MADE IN MARCH 2019. THE CUSTOMER RECEIVED FUNDS FROM MULTIPLE INDIVIDUALS WHERE THE PURPOSE OF TRANSACTIONS IS UNKNOWN. THE FUND IS FOLLOWED BY IMMEDIATE WITHDRAWAL OR INSTANT TRANSFERS TO THE FOLLOWING PARTIES:\r\r\nSUE SWEE HOCK (ACCOUNT WITH ABMB)\r\r\nCYH STAR ENTERPRISE (ACCOUNT WITH HONGLEONG BANK AND PUBLIC BANK)\r\r\nPANG JUNG HS (ACCOUNT WITH PUBLIC BANK)\r\r\nMUHAMMAD JEFRI B (ACCOUNT WITH BMMB)\r\r\n\r\r\nTHE BANK NOTED THAT THERE IS A POLICE REPORT (PADUNGAN/002472/19) LODGED ON TH

In [6]:
# Prediction: parse the first 10 raw descriptions with the base pipeline.
# `n_threads` is no longer passed: it is deprecated in spaCy 2.1+ and has no
# effect there, so the previous value of 4000 did nothing.
doc = nlp.pipe(descList[:10], batch_size=10)

# Re-case every parsed description, then run it through the custom NER pipeline.
result = [nlp2(change_case_gen_pattern(parsed_doc)) for parsed_doc in doc]

# Keep only the PERSON / ORG entities, one list per description.
predResultMunirah = [
    [(entity, entity.label_) for entity in record.ents if entity.label_ in ('PERSON', 'ORG')]
    for record in result
]

# Convert entities and labels to plain strings for saving purposes.
predResultMunirah2 = [
    [(str(span), str(label)) for span, label in record_entities]
    for record_entities in predResultMunirah
]

In [7]:
# Display the (entity text, label) pairs predicted for the first description.
predResultMunirah2[0]

[('Abdul Mutalib Maula Abdul Rahim', 'PERSON'),
 ('Khoo Hun Yeang', 'PERSON'),
 ('Sue Swee Hock', 'PERSON'),
 ('Star enterprise', 'ORG'),
 ('Pang Jung Hs', 'PERSON'),
 ('Muhammad Jefri', 'PERSON'),
 ('Bmmb', 'ORG')]