## Load Modules

In [3]:
import pandas as pd
from wordcloud import STOPWORDS
import re
import spacy
from datetime import datetime

## Load Munirah's model for NER

In [4]:
# Location of the saved spaCy pipeline used for named-entity recognition.
# NOTE(review): absolute, machine-specific path - edit before running elsewhere.
munirahModel = 'D://Users/figohjs/Documents/NLP/StrPrioritization/Notebook/Model/NER_All_Labels_lg_2'
# Loaded pipeline; processText() calls it to find the entities it strips out.
nerModel2 = spacy.load(munirahModel)

## Load File

In [6]:
# Interim CSV that the rest of the notebook works on.
# NOTE(review): absolute, machine-specific path - edit before running elsewhere.
file = 'D://Users/figohjs/Documents/NLP/StrPrioritization/Data/Interim/2020-02-04_ProcessedDF.csv'

# Later cells read the analystRule1..7, rule1..7, RECORD_ID and SUSPICION_DESC
# columns from this frame.
df = pd.read_csv(file)

# Row-level predicate used to build the true-positive (TP) column
def truePositive(x):
    """Return True when, for at least one rule number 1-7, both flags are truthy.

    ``x`` is a mapping-like row exposing ``analystRule1..7`` and ``rule1..7``.
    """
    # Build the full list first so every rule number is inspected.
    bothFlagged = [bool(x['analystRule%d' % ruleNo] and x['rule%d' % ruleNo])
                   for ruleNo in range(1, 8)]
    return any(bothFlagged)

# Row-level predicate used to build the false-negative (FN) column
def falseNegative(x):
    """Return True when some rule number 1-7 has a truthy analyst flag but a falsy rule flag.

    ``x`` is a mapping-like row exposing ``analystRule1..7`` and ``rule1..7``.
    """
    # Build the full list first so every rule number is inspected.
    analystOnly = [bool(x['analystRule%d' % ruleNo] and not x['rule%d' % ruleNo])
                   for ruleNo in range(1, 8)]
    return any(analystOnly)
    
def falsePositive(x):
    """Return True when some rule number 1-7 has a truthy rule flag but a falsy analyst flag.

    ``x`` is a mapping-like row exposing ``analystRule1..7`` and ``rule1..7``.
    """
    # Build the full list first so every rule number is inspected.
    ruleOnly = [bool(not x['analystRule%d' % ruleNo] and x['rule%d' % ruleNo])
                for ruleNo in range(1, 8)]
    return any(ruleOnly)
    
def noFlag(x):
    """Return True when the row's ``FN``, ``FP`` and ``TP`` values are all falsy."""
    anyOutcome = x['FN'] or x['FP'] or x['TP']
    return not anyOutcome
    
# Add one boolean outcome column per row by applying the predicates row-wise.
df['FN'] = df.apply(falseNegative, axis = 1)

df['TP'] = df.apply(truePositive, axis = 1)

df['FP'] = df.apply(falsePositive, axis = 1)

# Must run last: noFlag() reads the FN, FP and TP columns created above.
df['noFlag'] = df.apply(noFlag, axis = 1)

## Create dictionaries

In [7]:
# DataFrame index label -> RECORD_ID
IndexToReportID_Dict = df['RECORD_ID'].to_dict()

# DataFrame index label -> raw SUSPICION_DESC text
IndexToStrDesc_Dict = df['SUSPICION_DESC'].to_dict()

# Inverse lookup: RECORD_ID -> index label (for a repeated RECORD_ID the last index wins)
ReportIDToIndex_Dict = {
    reportId: rowIndex
    for rowIndex, reportId in IndexToReportID_Dict.items()
}

## Data Processing

In [25]:
def processText(textArray):
    """Clean a sequence of free-text descriptions.

    Every element is converted with ``str()`` and then passed through these
    steps, in order:

    1. Drop stop words (wordcloud ``STOPWORDS`` plus the extra words listed
       below). Tokens come from splitting on single spaces and are compared
       in lower case.
    2. Remove IC-number, date, phone-number and "RM" amount patterns.
    3. Replace the listed special characters and every digit with a space,
       then collapse runs of whitespace and trim both ends.
    4. Run the module-level ``nerModel2`` pipeline on the result and delete
       every occurrence of each detected entity's text.

    Parameters
    ----------
    textArray : iterable
        Items to clean; non-string items are stringified with ``str()``.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.

    Side effects
    ------------
    Prints the elapsed time in minutes. If the entity substitution raises
    ``re.error``, prints the text and its position and keeps the text with
    entities left in place.
    """
    start = datetime.now()

    # Replacement text for each pattern family. Empty means "delete the match";
    # set one to a placeholder token (e.g. 'ic', 'date', 'name') to mark instead.
    icRep = ''
    dateRep = ''
    phoneRep = ''
    amountRep = ''
    neRep = ''

    #stopwords list
    # Fix: a comma was missing after 'still', which silently fused it with
    # 'pada' into 'stillpada' so neither word was ever removed.
    # NOTE(review): ' even though ' can never match, because tokens are produced
    # by splitting on single spaces; kept so the list stays as the author wrote it.
    otherStopWords = ['also', 'via',  'within', ' even though ', 'on', 'please', 'still',
                     'pada', 'dan', 'sahaja', 'pula', 'juga',
                     'yang', 'terdapat', 'oleh', 'telah', 'adalah', 'sejak',
                     'since', 'might', 'o/b', 'e.g', 'a/l', 'a/p', 'i.e']
    # A set gives constant-time membership tests for every token.
    stopWords = set(STOPWORDS).union(otherStopWords)

    # Patterns are loop-invariant, so compile them once. Raw strings keep the
    # regex text identical while avoiding invalid-escape warnings.
    #regex - special characters + digits
    regexSpecialChar = re.compile(r'\/|\,|\:|\(|\)|\?|\*|\-|\[|\]|\.|\+|\&|\=|\d|\%')
    icPattern = re.compile(r'\d{6}[-]\d{1,2}[-]\d{4}')
    datePattern = re.compile(r'\d{1,2}[-\.\/]\d{1,2}[-\.\/]\d{4}')
    phonePattern = re.compile(r'01\d{8}')
    amountPattern = re.compile(r'RM *\d+[\,\.]*\d*[\,\.]*\d*K*|RM *\d+\,\d+\.\d+')
    whitespacePattern = re.compile(r'\s+')

    processedTextList = []
    for no, text in enumerate(textArray):
        #remove stopwords
        processedText = ' '.join([word for word in str(text).split(' ')
                                  if word.lower() not in stopWords])

        # Structured patterns go first, while their digits and separators are
        # still intact; the special-character pass below would destroy them.
        processedText = icPattern.sub(icRep, processedText)
        processedText = datePattern.sub(dateRep, processedText)
        processedText = phonePattern.sub(phoneRep, processedText)
        processedText = amountPattern.sub(amountRep, processedText)

        #remove special characters and digits
        # The pattern never matches a space, so one pass over the whole string
        # equals the earlier split / substitute-per-word / join.
        processedText = regexSpecialChar.sub(' ', processedText)
        #remove additional spaces, leading and trailing spaces
        processedText = whitespacePattern.sub(' ', processedText).strip()

        #denote name to named entity - need better ner model
        doc = nerModel2(processedText)
        # Escape each entity so characters such as '$', '^', '{' or '|' are
        # matched literally. Longest first, so a short entity cannot pre-empt a
        # longer one that starts with the same text.
        entityTexts = sorted({str(ent) for ent in doc.ents if str(ent)},
                             key=len, reverse=True)
        if entityTexts:
            nePattern = '|'.join([re.escape(entityText) for entityText in entityTexts])
            try:
                processedText = re.sub(nePattern, neRep, processedText)
            except re.error:
                # Best effort: report the offending record and keep its text.
                print(processedText + '\n')
                print(no)

        processedTextList.append(processedText)

    end = datetime.now()
    # total_seconds() covers the whole interval and is a float on every Python
    # version; .seconds excludes whole days.
    print("Time taken in minutes: %s" %round((end - start).total_seconds()/60, 2))
    return processedTextList

In [26]:
# Clean every SUSPICION_DESC value (6418 records per the author's note).
processedStr = processText(df['SUSPICION_DESC'].values)
# Rows flagged True Positive are treated as the "bad" STRs (418 per the author's note).

# Index labels of the rows where TP is True.
badIndex = list(df[df['TP']].index)

# Store the cleaned text next to the original description; processText returns
# one string per input item, in the same order.
df['SUSPICION_DESC_CLEAN'] = processedStr

Time taken in minutes: 5.27


## Save to csv

In [27]:
# Save the enriched frame as <today's date>_ProcessedDF.csv.
# NOTE(review): same folder and naming scheme as the input CSV loaded earlier, so
# a run on the input file's date would overwrite it. The path is also absolute
# and machine-specific.
filename = 'D://Users/figohjs/Documents/NLP/StrPrioritization/Data/Interim/%s_ProcessedDF.csv'%datetime.now().strftime('%Y-%m-%d')
# index = False leaves the DataFrame index out of the file.
df.to_csv(filename, index = False)