In [50]:
#Load module
import pandas as pd
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from nltk.lm import Vocabulary
import numpy as np
from nltk.tokenize import word_tokenize
import re
import itertools

In [46]:
#read processed file
filename = 'D://Users/figohjs/Documents/NLP/StrPrioritization/Data/Interim/2020-04-24_ProcessedDF.csv'
df = pd.read_csv(filename)

#define column names
truePositiveCol = 'TP'
textCol = 'SUSPICION_DESC_CLEAN'
recordIdCol = 'RECORD_ID'

In [73]:
#bad index:index for bad report
badIndex = list(df[df['TP']].index)

#build vocabulary to determine vector size
tokenizedList = [re.sub('\s+',' ',str(i)).split(' ') for i in df[textCol].values]
flatTokenizedList = list(itertools.chain(*tokenizedList))
vocab = Vocabulary(flatTokenizedList)

#create dictionaries
IndexToReportID_Dict = df[recordIdCol].to_dict()
IndexToStrDesc_Dict = df[textCol].to_dict()
ReportIDToIndex_Dict = {j:i for i,j in IndexToReportID_Dict.items()}

#docs to tag
docs = df[textCol].values

#docs to tag
#fillna with ''
df[textCol].fillna('', inplace = True)
docs = df[textCol].values

In [89]:
#Load module
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime


class docEmbedding():
    def __init__(self, **kwargs):
        #predefine prameters
        self.Doc2VecModel = Doc2Vec(**kwargs)
        #where to save model
        self.ModelFolder = 'D:/Users/figohjs/Documents/NLP/StrPrioritization/Streamlit/Model'
        #model name
        self.ModelName = self.ModelFolder + '/' + datetime.now().strftime('%Y-%m-%d') + '_' + 'Doc2Vec.model'
        #indices are based on list of tagged document
        self.BadStrIndex = kwargs['bad_index']
        self.Threshold = kwargs['threshold']
        self.VectorSize = kwargs['vector_size']
        self.DM = kwargs['dm']
        self.MaxEpoch = kwargs['epoch']

    def tagDocument(self, ContentList):
        #tag document
        self.TaggedDocList = [TaggedDocument(doc[1], tags = [doc[0]]) 
                              for doc in zip(self.BadStrIndex, [i.split() for i in ContentList])]
    
    def trainModel(self):
        #build vocab
        self.Doc2VecModel.build_vocab(self.TaggedDocList)
        
        for epoch in range(self.MaxEpoch):
            print('Iteration - %s'%epoch+1)
            #total_examples = total of document
            self.Doc2VecModel.train(self.TaggedDocList,
                                    total_examples = self.Doc2VecModel.corpus_count,
                                    epochs = epoch)
        #self.Doc2VecModel.alpha -= 0.002
        self.Doc2VecModel.save(self.ModelName)
        print('%s is saved'%self.ModelName)
        
    def loadModel(self, TrainedModel):
        self.Doc2VecModel = gensim.models.doc2vec.Doc2Vec.load(TrainedModel)
        
    def findSimilarDocForBadDoc(self, NewDoc, n = 10):
        NewVec = self.Doc2VecModel.infer_vector(NewDoc.split())
        similarRepList = []
        similarScoreList = []

        #loop through every bad report
        for index in self.BadStrIndex:
            similarScore = cosine_similarity(self.Doc2VecModel.docvecs[index].reshape(1, self.VectorSize),
                                             NewVec.reshape(1, self.VectorSize))
            #if similarity score exceeds threshold
            if similarScore >= self.Threshold:
                similarRepList.append(str(index))
                similarScoreList.append(str(similarScore[0][0]))

        #if not similar to any bad report, (a, b, c)

        #a - 1 if has similar to any bad report
        #b - list of reports 
        if len(similarRepList) == 0:
            return (0, '', '')
        else:
            return (1, ','.join(similarRepList), ','.join(similarScoreList))

In [90]:
#hyperparameters for neural network
#use distributed memory in default: predict word given context
dm = 1
epoch = 10
#use google recommended vector size
vectorSize = int(round((len(vocab) - 1)**(1/4), 0))
#learning rate
alpha = 0.025
minAlpha = alpha #no decreasing rate
#minimum frequency of token
minCount = 1
#true positive as bad reports
badIndex = list(df[df['TP']].index)
#threshold of similarity score to be considered as similar
threshold = 0.75

In [87]:
#instantiate a class named similarDoc
paramDict = {'min_count': minCount, 'vector_size': vectorSize,
             'alpha':alpha, 'min_alpha':alpha, 'bad_index':badIndex,
             'threshold':threshold, 'dm':dm, 'epoch':epoch}

nlpSimilarity = docEmbedding(**paramDict)

#tag documents
nlpSimilarity.tagDocument(docs)

#train model
nlpSimilarity.trainModel()

Iteration - 0
Iteration - 1
Iteration - 2
Iteration - 3
Iteration - 4
Iteration - 5
Iteration - 6
Iteration - 7
Iteration - 8
Iteration - 9
D:/Users/figohjs/Documents/NLP/StrPrioritization/Streamlit/Model/2020-04-27_Doc2Vec.model is saved


In [92]:
#load trained model
trainedModel = 'D:/Users/figohjs/Documents/NLP/StrPrioritization/Streamlit/Model/2020-04-27_Doc2Vec.model'
nlpSimilarity.loadModel(trainedModel)

In [96]:
start = datetime.now()

#use trained model to find if there is report similar with any bad reports
similarList = []
indexList = []
scoreList = []

dfSubset = df[~df['TP']].copy()

#use row which is not true positive i.e not bad report
for no in range(dfSubset.shape[0]):
    info = dfSubset['SUSPICION_DESC_CLEAN'].values[no]
    similarBool, index, score = nlpSimilarity.findSimilarDocForBadDoc(info)
    similarList.append(similarBool)
    indexList.append(index)
    scoreList.append(score)
pandaDict = {'similarBool':similarList,
            'reportIndexList':indexList,
            'scoreList':scoreList}
dfResult = pd.DataFrame(pandaDict)
dfResult = pd.concat([dfResult, df.reset_index()], axis = 1)
dfResult.fillna('', inplace = True)

end = datetime.now()

In [104]:
print("Time taken in minutes: %s" %round((start - end).seconds//3600, 2))

Time taken in minutes: 23


In [97]:
dfResult.shape

(6418, 29)