### Load libraries

In [1]:
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_classification_report
import pickle
from sklearn.feature_extraction import DictVectorizer
import matplotlib.pyplot as plt
import json
import os
import re
import json
import itertools
from itertools import repeat

### Text Features

Text Features - CRF: 
<ul>
<li>bias</li> 
<li>lower case of term</li>
<li>if term is upper case</li>
<li>if term is sentence case</li>
<li>if term is digit</li>
<li>postag</li>
<li>last 2, 3 characters</li>
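<li>first 2 characters of the POS tag</li>
<li>lower case, sentence case, upper case, POS tag and first 2 characters of the POS tag of the preceding term, if the current term is not the beginning of the sentence</li>
<li>lower case, sentence case, upper case, POS tag and first 2 characters of the POS tag of the following term, if the current term is not the end of the sentence</li>
<li>BOS = True only for the first term of a sentence, and EOS = True only for the last term of a sentence</li>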
</ul>

first term/end term in a sentence - 15 features
other term - 19 features
other term (only term in a sentence) - 11 features

Text Features - CRF in our case: 
<ul>
<li>bias</li> 
<li>lower case of term</li>
<li>if term is upper case</li>
<li>if term is sentence case</li>
<li>if term is digit</li>
<li>last 2, 3 characters</li>
<li>lower case, sentence case and upper case flags of the preceding term (when there is one)</li>
<li>lower case, sentence case and upper case flags of the following term (when there is one)</li>
<li>beginning of sentence?</li>
<li>end of sentence?</li>
</ul>

### Data Preparation

In [2]:
#import training data: the file holds one JSON record per line
jsonFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\data\\training\\2020-12-04_LablledStr.json"

with open(jsonFile, 'r', encoding='utf-8') as input_file:
    recordDictBack = [json.loads(row) for row in input_file.readlines()]

#description text of every record
descList = [record['content'] for record in recordDictBack]

#tag list of every record, index-aligned with descList
tagList = [record['tagList'] for record in recordDictBack]

In [4]:
# number of training records loaded from the JSON file
len(descList)

614

In [3]:
#json file from NER label app contains duplicated record coz it s saved time by time
def cleanRecord(result):
    """Collapse records that share the same ``content`` into one record each.

    Parameters
    ----------
    result : list of dict
        Records carrying at least the keys ``'content'`` and ``'tagList'``.

    Returns
    -------
    list of dict
        One ``{'tagList': ..., 'content': ...}`` dict per distinct content,
        ordered by the first appearance of that content in ``result``.
        When a content occurs several times, the ``tagList`` of its LAST
        occurrence is kept (this is what the previous implementation did in
        practice, although its comment claimed "first entry").
    """
    latestTags = {}
    for record in result:
        # Re-assigning an existing key updates the value but keeps the key's
        # original position, giving a deterministic, first-seen ordering
        # instead of the arbitrary ordering of a set.
        latestTags[record['content']] = record['tagList']

    return [{'tagList': tagVal, 'content': contentVal}
            for contentVal, tagVal in latestTags.items()]

#from yan ling - extra 200 rows
yanLingJsonFolder = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\data\\training\\Labelled_200_20201110"

yanLingRecord = []
# os.listdir returns entries in arbitrary order; sorting makes the record
# order (and therefore which duplicate cleanRecord keeps) reproducible.
for yanLingFile in sorted(os.listdir(yanLingJsonFolder)):
    # Read as utf-8, the same encoding used for the main training file,
    # instead of relying on the platform's locale default.
    with open(os.path.join(yanLingJsonFolder, yanLingFile), 'r', encoding='utf-8') as input_file:
        for row in input_file:
            yanLingRecord.append(json.loads(row))

yanLingFinalRecord = cleanRecord(yanLingRecord)

In [9]:
#split the de-duplicated records into two index-aligned lists
descListYL = []   #all desc
tagListYL = []    #all tag
for record in yanLingFinalRecord:
    descListYL.append(record['content'])
    tagListYL.append(record['tagList'])

In [10]:
# number of distinct records left after cleanRecord removed duplicates
len(yanLingFinalRecord)

199

In [7]:
# peek at the first raw description
# NOTE(review): the stored output of this cell shows personal names and ID numbers;
# clear or redact the output before sharing the notebook.
descList[0]

"CUSTOMER (ABDUL MUTALIB MAULA ABDUL RAHIM, IC NO. 880704525453) OPENED A SAVINGS ACCOUNT (NO. 7071379494) WITH KHOO HUN YEANG STREET, KUCHING CIMB BRANCH ON 21 AUGUST 2018. BASED ON THE BANK'S RECORD, CUSTOMER IS A SELF-EMPLOYED HAWKER.\r\r\n\r\r\nREVIEW ON THE ACCOUNT (COVERING THE PERIOD BETWEEN 21 AUGUST 2018 AND 31 MARCH 2019), TOTAL DEPOSITS RM 85,236.04 (26 COUNTS) AND TOTAL WITHDRAWALS RM 84,882.50 (66 COUNTS) WERE MADE TO CUSTOMER'S ACCOUNT. NOTICED THAT MAJORITY OF THE TRANSACTIONS WERE MADE IN MARCH 2019. THE CUSTOMER RECEIVED FUNDS FROM MULTIPLE INDIVIDUALS WHERE THE PURPOSE OF TRANSACTIONS IS UNKNOWN. THE FUND IS FOLLOWED BY IMMEDIATE WITHDRAWAL OR INSTANT TRANSFERS TO THE FOLLOWING PARTIES:\r\r\nSUE SWEE HOCK (ACCOUNT WITH ABMB)\r\r\nCYH STAR ENTERPRISE (ACCOUNT WITH HONGLEONG BANK AND PUBLIC BANK)\r\r\nPANG JUNG HS (ACCOUNT WITH PUBLIC BANK)\r\r\nMUHAMMAD JEFRI B (ACCOUNT WITH BMMB)\r\r\n\r\r\nTHE BANK NOTED THAT THERE IS A POLICE REPORT (PADUNGAN/002472/19) LODGED ON TH

In [5]:
#first layer of text cleaning
def cleanText(text):
    """Apply the first layer of text cleaning to a description.

    Steps, in order:
      1. replace newlines with a space,
      2. drop a trailing ``"null "`` (case-insensitive),
      3. drop an ``rm`` / ``myr`` currency prefix standing in front of digits,
      4. replace a stand-alone ``" i.e. "`` with a single space,
      5. collapse every run of whitespace into one plain space.

    Parameters
    ----------
    text : str
        Raw description.

    Returns
    -------
    str
        Cleaned description whose tokens are separated by single spaces
        (a leading/trailing space, if present, is kept).
    """
    #special chars list (regex fragments). The dots are escaped so that only
    #the literal "i.e." matches, not any word shaped like "i?e?" (e.g. "item").
    #NOTE(review): this match is case-sensitive while the other patterns use
    #re.I - confirm whether "I.E." should be removed as well.
    scList = [r'\si\.e\.\s']

    #replace \n with ' '
    text = re.sub(r'\n', ' ', text)
    #remove null in end of sentence
    text = re.sub(r'null $', '', text, flags = re.I)
    #remove rm as training data does not have rm; \b keeps words that merely
    #end in "rm" (e.g. "TERM 12") intact
    text = re.sub(r'\b(rm|myr)\s*(\d+)', r'\2', text, flags = re.I)
    #remove special char; substitute a space so the neighbouring words are not glued together
    text = re.sub('|'.join(scList), ' ', text)
    #remove additional spaces; always emit a plain space so that a run ending in
    #\r or \t cannot survive and hide a token boundary from split(' ')
    text = re.sub(r'\s+', ' ', text)
    return text

In [6]:
def processLabel(text, tag):
    """Produce one BIO tag per space-separated token of ``text``.

    Only tags labelled ``PERSON`` or ``ORG`` are used; every other label is
    ignored. A token sequence is labelled ``B-xxx`` / ``I-xxx`` when, after
    removing ``( ) , .`` and surrounding whitespace, it equals a tagged entity
    normalised the same way. All remaining tokens are labelled ``O``.

    Parameters
    ----------
    text : str
        Cleaned description; tokens are obtained with ``text.split(' ')``.
    tag : list
        Items of the form ``[entity text, label, colour]``; only the first two
        positions are read.

    Returns
    -------
    list of str
        Tags aligned one-to-one with ``text.split(' ')``.
    """
    labelDict = {"PERSON":"per", "ORG":"org"}

    def normalise(rawText):
        #same punctuation stripping for tokens and entity names, otherwise a
        #name containing these characters could never equal the cleaned text
        return re.sub(r'\(|\)|\,|\.', '', rawText).strip()

    #normalised entity name -> BIO suffix. Names are stripped and their inner
    #whitespace collapsed because labelled names may carry stray spaces.
    #Re-assigning a repeated name keeps its first position but takes the
    #latest label.
    entityDict = {}
    for item in tag:
        if item[1] in labelDict:
            entityName = ' '.join(normalise(item[0]).split())
            #an empty name would otherwise "match" punctuation-only tokens
            if entityName:
                entityDict[entityName] = labelDict[item[1]]

    textList = text.split(' ')
    total = len(textList)
    tagResultList = []
    no = 0
    while no < total:
        cleanWord = normalise(textList[no])
        matchedLen = 0
        matchedLabel = None
        if cleanWord:
            for entityName, label in entityDict.items():
                #plain substring pre-check; the token is never used as a
                #regex, so characters such as * or + cannot raise re.error
                if cleanWord not in entityName:
                    continue
                wordLen = len(entityName.split(' '))
                #an entity cannot run past the end of the text
                if no + wordLen > total:
                    continue
                words = normalise(' '.join(textList[no:no+wordLen]))
                if words == entityName:
                    matchedLen = wordLen
                    matchedLabel = label
                    break

        if matchedLabel is None:
            tagResultList.append("O")
            no += 1
        else:
            #first token opens the entity, the remaining ones continue it
            tagResultList.append("B-"+matchedLabel)
            tagResultList.extend(["I-"+matchedLabel] * (matchedLen - 1))
            #jump over the tokens that were just labelled
            no += matchedLen
    return tagResultList

In [7]:
#label every description; the four result lists stay aligned with descList
tagFinalResult, tokenFinalResult = [], []
badResult = []
sentenceResult = []
for no, desc in enumerate(descList):
    cleanDesc = cleanText(desc)
    tokens = cleanDesc.split(' ')
    labels = processLabel(cleanDesc, tagList[no])
    tagFinalResult.append(labels)
    tokenFinalResult.append(tokens)
    #remember descriptions whose tag count differs from their token count
    if len(labels) != len(tokens):
        badResult.append(no)
    #one sentence id per token, numbered from 1
    sentenceResult.extend(["Sentence" + str(no + 1)] * len(tokens))

In [8]:
# BIO tags produced for the first description
tagFinalResult[0]

['O',
 'B-per',
 'I-per',
 'I-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'B-org',
 'I-org',
 'I-org',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'B-per',
 'I-per',
 'I-per',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O'

In [144]:
#flatten both list: one entry per token instead of one list per description
#NOTE: both names are rebound, so re-running this cell without re-running the
#labelling cell would split the tag/token strings into single characters
tagFinalResult = list(itertools.chain.from_iterable(tagFinalResult))
tokenFinalResult = list(itertools.chain.from_iterable(tokenFinalResult))

In [145]:
# the two flattened lists must have the same length (one tag per token)
len(tagFinalResult), len(tokenFinalResult)

(179124, 179124)

In [146]:
#one row per token: sentence id, token text and its BIO tag
df = pd.DataFrame(dict(Sentence = sentenceResult,
                       Token = tokenFinalResult,
                       Tag = tagFinalResult))
df.shape

(179124, 3)

In [147]:
#persist the token-level training table without the index column
csvFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\data\\training\\2021-01-14_InternalDataCRF.csv"
df.to_csv(path_or_buf = csvFile, index = False)

### Use External Data

In [None]:
#default features used in nltk
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Only the first element of each item of ``sent`` (the token) is read; it
    is converted with ``str``. POS-tag features are not produced.

    The dict holds a constant bias, the lower-cased token, its last three and
    last two characters and its upper-case / title-case / digit flags. For
    each existing neighbour it adds the neighbour's lower-cased form and its
    title-case and upper-case flags; ``BOS`` marks a token with no left
    neighbour and ``EOS`` a token with no right neighbour.
    """
    current = str(sent[i][0])

    features = {'bias': 1.0}
    features['word.lower()'] = current.lower()
    features['word[-3:]'] = current[-3:]
    features['word[-2:]'] = current[-2:]
    features['word.isupper()'] = current.isupper()
    features['word.istitle()'] = current.istitle()
    features['word.isdigit()'] = current.isdigit()

    #left context
    if i <= 0:
        #beginning of sentence
        features['BOS'] = True
    else:
        previous = str(sent[i-1][0])
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    #right context
    if i >= len(sent)-1:
        #end of sentence
        features['EOS'] = True
    else:
        following = str(sent[i+1][0])
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features


def sent2features(sent):
    """Return the feature dicts of every position in ``sent``, in order."""
    featureDicts = []
    for position in range(len(sent)):
        featureDicts.append(word2features(sent, position))
    return featureDicts

def sent2labels(sent):
    """Return the label (third element) of every (token, postag, label) item."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every (token, postag, label) item."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

def generateFeatures(descList):
    """Return, for each description, the feature dicts of its space-separated tokens.

    Every token is wrapped in a 1-tuple because the feature extraction reads
    the token from position 0 of each item.
    """
    return [sent2features([(token,) for token in desc.split(' ')])
            for desc in descList]

# Groups a token-level DataFrame by "Sentence #"; each sentence becomes a
# list of (word, pos, tag) tuples.
class sentence(object):
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False

        def agg(s):
            # row-wise (Word, POS, Tag) triples of one sentence group
            words = s['Word'].values.tolist()
            posTags = s['POS'].values.tolist()
            tags = s['Tag'].values.tolist()
            return list(zip(words, posTags, tags))

        self.grouped = self.df.groupby("Sentence #").apply(agg)
        self.sentences = list(self.grouped)

In [173]:
#Reading the csv file
csvFile = "D:/Users/figohjs/Documents/NLP/NER/data/raw/ner_dataset.csv"
df = pd.read_csv(csvFile, encoding = "ISO-8859-1")

#filling column of sentence #
#DataFrame.ffill() replaces the deprecated fillna(method = 'ffill') spelling
df = df.ffill()

#generate features
#NOTE(review): this relies on the (Word, POS, Tag) versions of `sentence` and
#`sent2labels` being the ones currently defined; the internal-data section
#further down redefines both names for (Token, Tag) data.
sentences = sentence(df)
#for each sentence, a list of tuples: (token, pos, tag)
# [('Iranian', 'JJ', 'B-gpe'),
#  ('officials', 'NNS', 'O'),]
allFullSentences = sentences.sentences

#prepare X and Y
#for each sentence, a list of dict: {features}
X = [sent2features(s) for s in allFullSentences]
y = [sent2labels(s) for s in allFullSentences]

#remove pos features as testing dataset doesnt possess them
X2 = [[{key:value for key,value in j.items() if 'postag' not in key} for j in i]
      for i in X]

# X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size = 0.3)

#model training
start = datetime.now()
crf2 = CRF(algorithm = 'lbfgs',
             c1 = 0.1,
             c2 = 0.1,
             max_iterations = 100,
             all_possible_transitions = False)
# crf2.fit(X_train, y_train)
crf2.fit(X2, y)
time = datetime.now() - start
print("Estimated time: %s"%time)

#save model in pickle format
filename = 'D:/Users/figohjs/Documents/NLP/NER/Model/2020-01-14_CRFmodel_Externalv1.sav'
#context manager so the file is flushed and closed even if pickling fails
with open(filename, 'wb') as modelFile:
    pickle.dump(crf2, modelFile)

Estimated time: 0:08:00.540370


### Use internal data

In [148]:
#Reading the csv file
csvFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\data\\training\\2021-01-14_InternalDataCRF.csv"
#The file is written by DataFrame.to_csv, whose default encoding is UTF-8, so it
#is read back as UTF-8. dtype = str and keep_default_na = False keep every token
#exactly as written: without them tokens such as "NA", "NULL", "N/A" or an empty
#token are parsed as NaN and later end up as the literal feature value 'nan'.
dfInternal = pd.read_csv(csvFile, encoding = "utf-8", dtype = str, keep_default_na = False)

In [149]:
#default features used in nltk
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    Only the first element of each item of ``sent`` (the token) is read; it
    is converted with ``str``. POS-tag features are not produced.

    The dict holds a constant bias, the lower-cased token, its last three and
    last two characters and its upper-case / title-case / digit flags. For
    each existing neighbour it adds the neighbour's lower-cased form and its
    title-case and upper-case flags; ``BOS`` marks a token with no left
    neighbour and ``EOS`` a token with no right neighbour.
    """
    current = str(sent[i][0])

    features = {'bias': 1.0}
    features['word.lower()'] = current.lower()
    features['word[-3:]'] = current[-3:]
    features['word[-2:]'] = current[-2:]
    features['word.isupper()'] = current.isupper()
    features['word.istitle()'] = current.istitle()
    features['word.isdigit()'] = current.isdigit()

    #left context
    if i <= 0:
        #beginning of sentence
        features['BOS'] = True
    else:
        previous = str(sent[i-1][0])
        features['-1:word.lower()'] = previous.lower()
        features['-1:word.istitle()'] = previous.istitle()
        features['-1:word.isupper()'] = previous.isupper()

    #right context
    if i >= len(sent)-1:
        #end of sentence
        features['EOS'] = True
    else:
        following = str(sent[i+1][0])
        features['+1:word.lower()'] = following.lower()
        features['+1:word.istitle()'] = following.istitle()
        features['+1:word.isupper()'] = following.isupper()

    return features


def sent2features(sent):
    """Return the feature dicts of every position in ``sent``, in order."""
    featureDicts = []
    for position in range(len(sent)):
        featureDicts.append(word2features(sent, position))
    return featureDicts

def sent2labels(sent):
    """Return the label (second element) of every (token, label) item."""
    labels = []
    for _token, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token (first element) of every (token, label) item."""
    tokens = []
    for token, _label in sent:
        tokens.append(token)
    return tokens

def generateFeatures(descList):
    """Return, for each description, the feature dicts of its space-separated tokens.

    Every token is wrapped in a 1-tuple because the feature extraction reads
    the token from position 0 of each item.
    """
    return [sent2features([(token,) for token in desc.split(' ')])
            for desc in descList]

# Groups a token-level DataFrame by "Sentence"; each sentence becomes a list
# of (token, tag) tuples.
class sentence(object):
    """Sentence view over a DataFrame with ``Sentence``, ``Token`` and ``Tag`` columns.

    Attributes
    ----------
    df : the DataFrame passed in.
    grouped : result of the group-wise aggregation, one list of
        ``(token, tag)`` tuples per sentence id.
    sentences : the same lists as a plain Python list.
    n_sent, empty : set to ``1`` and ``False`` respectively; not read by this class.
    """
    def __init__(self, df):
        self.n_sent = 1
        self.df = df
        self.empty = False
        agg = lambda s : [(w, t) for w, t in zip(s['Token'].values.tolist(),
                                                s['Tag'].values.tolist())]
        # sort = False keeps the sentences in their order of first appearance.
        # The default sorts the ids as strings ("Sentence10" before
        # "Sentence2"), which scrambles the order relative to the source data.
        self.grouped = self.df.groupby("Sentence", sort = False).apply(agg)
        self.sentences = list(self.grouped)

In [150]:
#group the internal token table into sentences
# featureList = generateFeatures(descList)
sentences = sentence(dfInternal)
#for each sentence, a list of tuples: (token, tag)
#(the internal data has no POS column, so there is no pos element here)
allFullSentences = sentences.sentences

In [151]:
#prepare X and Y
#X: for each sentence, one feature dict per token; y: the matching tag lists
X = list(map(sent2features, allFullSentences))
y = list(map(sent2labels, allFullSentences))

#remove pos features as testing dataset doesnt possess them
X2 = [
    [{key: value for key, value in tokenFeatures.items() if 'postag' not in key}
     for tokenFeatures in sentenceFeatures]
    for sentenceFeatures in X
]

# X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size = 0.3)

# X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size = 0.3)

In [152]:
#model training
start = datetime.now()
crf = CRF(algorithm = 'lbfgs',
             c1 = 0.1,
             c2 = 0.1,
             max_iterations = 100,
             all_possible_transitions = False)
crf.fit(X2, y)
time = datetime.now() - start
print("Estimated time: %s"%time)

#save model in pickle format
filename = 'D:/Users/figohjs/Documents/NLP/NER/Model/2021-01-14_CRFmodel_Internalv1.sav'
#context manager so the file is flushed and closed even if pickling fails
with open(filename, 'wb') as modelFile:
    pickle.dump(crf, modelFile)

Estimated time: 0:00:30.381431


In [162]:
# clean one description from the extra labelled set to try the model on it
cleanDesc = cleanText(descListYL[1])
cleanDesc

"NAME: INTERNATIONAL OIL DESIGN & CONSTRUCTION SDN BHD BUSINESS REGISTRATION NO: 829060W DATE OF INCORPORATION: 11/08/2008 ADDRESS: 1ST FLOOR WORK @CLEARWATER, CHANGKAT SEMANTAN, OFF JALAN SEMANTAN, 50490 DAMANSARA HEIGHTS, KUALA LUMPUR CONTACT NO: 0320959506, 0323822320 ACCOUNT NO: 8000298329 ACCOUNT OPENING DATE: 03/09/2010 ACCOUNT STATUS: ACTIVE HOME BRANCH: 1408 - KUALA LUMPUR MAIN LATEST BALANCE: 0.00 OTHER ACCOUNTS: 8007061034, 800000149040 (FCA USD), 800014315430 (FCA EUR) PCT1: M RAMANATHAN A/L S M MEYYAPPAN (NRIC NO: 600923106815) PCT2: A MAHESWARY A/P S ARJUNAN (NRIC NO: 660818106180) PCT3: MAZIAR MODARRES SADEGHI MAJID (PASSPORT NO: P95423858 - IRAN) PCT4: SEYEDABOLGHASEM SHEYKHOLESLAMI (PASSPORT NO: R26394391 - IRAN) PCT5: MAJID MALEK (PASSPORT NO: U27015144 - IRAN) INTERNATIONAL OIL DESIGN & CONSTRUCTION SDN BHD (IODC) INVOLVED IN OIL AND GAS INDUSTRIES AND PROVIDE SERVICES IN ENGINEERING, PROCUREMENT, CONSTRUCTION AND MANAGEMENT. IODC HAS BEEN BANKING WITH THE BANK SINCE 

In [156]:
#import stopword: one entry per line, newline characters removed
txtFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\Data\\training\\stopwords.txt"
with open(txtFile, 'r') as myfile:
    stopWordList = [row.replace('\n', '') for row in myfile.readlines()]

#import surname: one entry per line, newline characters removed
txtFile = "D:\\Users\\figohjs\\Documents\\NLP\\NER\\Data\\training\\surname.txt"
with open(txtFile, 'r') as myfile:
    surnameList = [row.replace('\n', '') for row in myfile.readlines()]

#tag dictionary: BIO tag suffix -> entity label used in the returned tuples
tagDict = {'org':'ORG', 'per':'PERSON', 'geo': 'GEO'}

def getNamedEntity(records, text):
    """Turn BIO tag sequences into lists of ``(entity text, label)`` tuples.

    Parameters
    ----------
    records : list of list of str
        One tag sequence per row (``'B-per'``, ``'I-org'``, ``'O'``, ...).
    text : list of list of str
        Tokens of every row; ``text[r][t]`` is the token tagged ``records[r][t]``.

    Returns
    -------
    list of list of tuple
        For each row, the entities in order of appearance after they have
        been passed through ``checkTuple`` (which may relabel or drop them).

    Notes
    -----
    Only tag types listed in ``tagDict`` form entities; any other tag ends
    the current entity. An ``I-`` tag that does not continue an entity of the
    same type (no preceding ``B-``, or a different type) starts a new entity
    instead of raising or being appended to an earlier, unrelated entity.
    """
    finalResult = []
    for noRow, row in enumerate(records):
        rowTokens = text[noRow]
        temp = []
        spanTokens = []   #tokens of the entity currently being built
        spanType = None   #its tag suffix, e.g. 'per'

        for noTerm, term in enumerate(row):
            prefix, _, tagType = term.partition('-')
            if prefix in ('B', 'I') and tagType in tagDict:
                continuesSpan = (prefix == 'I' and tagType == spanType)
                if not continuesSpan:
                    #a B- tag, or an I- tag that cannot extend the open
                    #entity: close what is open and start afresh
                    _closeSpan(spanTokens, spanType, temp)
                    spanTokens = []
                    spanType = tagType
                spanTokens.append(rowTokens[noTerm])
            else:
                #'O' or a tag type we do not extract
                _closeSpan(spanTokens, spanType, temp)
                spanTokens = []
                spanType = None

        #entity still open at the end of the row
        _closeSpan(spanTokens, spanType, temp)
        finalResult.append(temp)

    return finalResult

def _closeSpan(spanTokens, spanType, collected):
    """Append the finished entity, if any survives ``checkTuple``, to ``collected``."""
    if spanTokens:
        tempResult = checkTuple((' '.join(spanTokens), tagDict[spanType]))
        if tempResult:
            collected.append(tempResult)

def checkTuple(tupleResult):
    """Post-process one ``(entity text, label)`` tuple.

    * PERSON / GEO whose text contains "berhad" or "bhd" (case-insensitive)
      is relabelled ORG; any other GEO is dropped (``None`` is returned) and
      any other PERSON is returned unchanged.
    * A three-word ORG without "berhad"/"bhd" whose first word, lower-cased,
      is in ``surnameList`` is relabelled PERSON; other ORG tuples are
      returned unchanged.
    * Tuples with any other label are returned unchanged.
    """
    if tupleResult[1] in ['PERSON', 'GEO']:
        if re.search('berhad|bhd', tupleResult[0], flags = re.I):
            return (tupleResult[0], 'ORG')
        else:
            #filter out geo
            if tupleResult[1] == 'GEO':
                return None
            else:
                return tupleResult

    elif tupleResult[1] == 'ORG':
        #put chinese name back as label
        if len(tupleResult[0].split(' ')) == 3 and not re.search('berhad|bhd', tupleResult[0], flags = re.I):
            if tupleResult[0].split(' ')[0].lower() in surnameList:
                return (tupleResult[0], 'PERSON')
            else:
                return tupleResult
        else:
            return tupleResult
    else:
        return tupleResult

In [163]:
#features engineering: wrap every token in a 1-tuple, the item shape sent2features reads
desc2 = [(token,) for token in cleanDesc.split(' ')]
#the model predicts on a list of sentences, here a single one
descFeatures = [sent2features(desc2)]
y_pred = crf.predict(descFeatures)

In [164]:
# the cleaned description that was fed to the model
cleanDesc

"NAME: INTERNATIONAL OIL DESIGN & CONSTRUCTION SDN BHD BUSINESS REGISTRATION NO: 829060W DATE OF INCORPORATION: 11/08/2008 ADDRESS: 1ST FLOOR WORK @CLEARWATER, CHANGKAT SEMANTAN, OFF JALAN SEMANTAN, 50490 DAMANSARA HEIGHTS, KUALA LUMPUR CONTACT NO: 0320959506, 0323822320 ACCOUNT NO: 8000298329 ACCOUNT OPENING DATE: 03/09/2010 ACCOUNT STATUS: ACTIVE HOME BRANCH: 1408 - KUALA LUMPUR MAIN LATEST BALANCE: 0.00 OTHER ACCOUNTS: 8007061034, 800000149040 (FCA USD), 800014315430 (FCA EUR) PCT1: M RAMANATHAN A/L S M MEYYAPPAN (NRIC NO: 600923106815) PCT2: A MAHESWARY A/P S ARJUNAN (NRIC NO: 660818106180) PCT3: MAZIAR MODARRES SADEGHI MAJID (PASSPORT NO: P95423858 - IRAN) PCT4: SEYEDABOLGHASEM SHEYKHOLESLAMI (PASSPORT NO: R26394391 - IRAN) PCT5: MAJID MALEK (PASSPORT NO: U27015144 - IRAN) INTERNATIONAL OIL DESIGN & CONSTRUCTION SDN BHD (IODC) INVOLVED IN OIL AND GAS INDUSTRIES AND PROVIDE SERVICES IN ENGINEERING, PROCUREMENT, CONSTRUCTION AND MANAGEMENT. IODC HAS BEEN BANKING WITH THE BANK SINCE 

In [166]:
# hand-labelled tags of the same record, for comparison with the prediction
tagListYL[1]

[[' INTERNATIONAL OIL DESIGN & CONSTRUCTION SDN BHD', 'ORG', '#8ef'],
 [' INTERNATIONAL OIL DESIGN & CONSTRUCTION SDN BHD', 'ORG', '#8ef'],
 ['M RAMANATHAN A/L S M MEYYAPPAN', 'PERSON', '#faa'],
 [' A MAHESWARY A/P S ARJUNAN', 'PERSON', '#faa'],
 ['MAZIAR MODARRES SADEGHI MAJID', 'PERSON', '#faa'],
 ['SEYEDABOLGHASEM SHEYKHOLESLAMI', 'PERSON', '#faa'],
 ['MAJID MALEK', 'PERSON', '#faa'],
 ['DAMANSARA HEIGHTS', 'LOC', '#fea'],
 [' KUALA LUMPUR', 'LOC', '#fea'],
 [' KUALA LUMPUR', 'LOC', '#fea'],
 ['IRAN', 'LOC', '#fea'],
 ['IRAN', 'LOC', '#fea'],
 ['IRAN', 'LOC', '#fea'],
 ['IRAN', 'LOC', '#fea'],
 ['MAYBANK ', 'BANK', '#afa'],
 ['829060W', 'ID', '#aaf'],
 ['600923106815', 'ID', '#aaf'],
 ['U27015144', 'ID', '#aaf'],
 ['P95423858', 'ID', '#aaf'],
 ['8000298329', 'ACCOUNT', '#ddd'],
 ['8007061034', 'ACCOUNT', '#ddd'],
 [' 800000149040', 'ACCOUNT', '#ddd'],
 ['800014315430', 'ACCOUNT', '#ddd'],
 ['660818106180', 'ACCOUNT', '#ddd'],
 ['660818106180', 'ACCOUNT', '#ddd']]

In [165]:
# convert the predicted tag sequence into (entity text, label) tuples;
# the tokens are passed as a one-row list to line up with y_pred
predList = getNamedEntity(y_pred, [cleanDesc.split(' ')])
predList

[[('M RAMANATHAN A/L S M MEYYAPPAN', 'PERSON'),
  ('MAZIAR MODARRES SADEGHI MAJID', 'PERSON'),
  ('SEYEDABOLGHASEM SHEYKHOLESLAMI', 'PERSON'),
  ('MAJID MALEK', 'PERSON')]]