### Import modules

In [48]:
import pandas as pd
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import math
from timeit import default_timer as timer
from IPython.display import display, Math, Latex
from sklearn.feature_extraction.text import TfidfVectorizer 
import itertools
from gensim.summarization import keywords
import enchant

#british dict
d = enchant.Dict("en_GB")
StrCol = "ACHIEVEMENT"

### Read data

In [49]:
# filename = "D:/Users/figohjs/Documents/JMIS/Data/2019_preprocessed.csv"
# filename = "D:/Users/figohjs/Documents/JMIS/Data/preprocessed_bykra.csv"
filename = "D:/Users/figohjs/Documents/JMIS/Data/toprocess_v1.csv"

df = pd.read_csv(filename)

In [50]:
startTime = timer()

# Extract word vectors from word embedding - 100 dim
embeddingFile = "./Embedding/glove.6B.100d.txt"

#{'word1':embedding in (100,), 'word2':embedding in (100,)}
word_embeddings = {}
with open(embeddingFile, encoding='utf-8') as myFile:
    for line in myFile:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

#get embedding for each word for each sentence
#get final embedding by summing all corresponding embedding - each sentence one sentence embedding (100,)
sentence_vectors = []
for sentence in df[StrCol].values:
    totalWords = len(sentence.split())
    #get average score of embedding
    #if d sentence is not an empty string
    if len(sentence) != 0:
        v = sum([word_embeddings.get(word, np.zeros((100,))) for word in sentence.split()])\
                /(totalWords + 0.001)
    else:
        v = np.zeros((100,))
    sentence_vectors.append(v)
    
endTime = timer()
print("Total time: %0.4fs" % (endTime - startTime))

Total time: 12.3772s


In [81]:
#text - text array
def generateScores(text):
    #contruct similarity matrix
    totalNoSentences = len(text)
    sim_mat = np.zeros([totalNoSentences, totalNoSentences])
    indexArray = np.arange(totalNoSentences)
    
    for i in indexArray:
        for j in indexArray:
            if i != j:
                sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), 
                                                  sentence_vectors[j].reshape(1,100))[0,0]
    
    #use pagerank algo
    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)
    
    #sort sentence based on scores
    ranked_sentences = sorted(((scores[indexArray[no]],s) for no, s in enumerate(text)), reverse=True)
    return ranked_sentences

def getScoredSentences(textArray):
    startTime = timer()
    topSentenceList = []
    for no, textList in enumerate(textArray):
        scoreArray = generateScores(textList)   
        topSentenceList.append(scoreArray)
    
    endTime = timer()
    print("Total time: %0.4fs" % (endTime - startTime))
    
    return topSentenceList

def getTopSentence(sentenceArray, threshold):
    resultList = []
    for sentence in sentenceArray:
        topN = math.ceil(len(sentence)*threshold)
        topSentences = [i[1].strip() for i in sentence[:topN] if i[1]!='']
        resultList.append('|'.join(topSentences))
    return resultList

def generateNGram(text, ngram = 2):
    listOfTuples = [list(nltk.ngrams(i.split(' '), ngram)) for i in text]
    flattenListOfTuples = list(itertools.chain(*listOfTuples))
    textNGram = ['_'.join(i) for i in flattenListOfTuples]
    
    return textNGram

def generateTopKeywords(textArray, n = 10):
    resultList = []
    for text in textArray:
        tfidf_Vectorizer = TfidfVectorizer()
        #dim: number of doc X number of unique terms
        tfIdf_Text = tfidf_Vectorizer.fit_transform(text)

        dfTemp = pd.DataFrame(tfIdf_Text.toarray(), columns = tfidf_Vectorizer.get_feature_names())
        resultDict = dict(sorted(list(zip(dfTemp.columns, dfTemp.max(axis = 1))),
                           key = lambda x:x[1], reverse = True)[:n])
        keywordsList = list(resultDict.keys())
        resultList.append(keywordsList)
        
    return resultList

def generateKeywordsGensim(textArray, n = 10):
    resultList = []
    for text in textArray:
        cleanText = re.sub(' +',  ' ',re.sub('\n|\|', ' ',str(text))).strip()
        try:
            words = keywords(cleanText, pos_filter = 'NN', lemmatize = True, words = n)
        except:
            words = keywords(cleanText, pos_filter = 'NN', lemmatize = True, words = None)
        words = re.sub("\n", ",", words)
        resultList.append(words)    
    return resultList

def filterKeywordsGensim(textArray):
    resultList = []
    
    for text in textArray:
        tempList = []
        for i in text.split(','):    
            if i!='':
                if len(i.split(' ')) == 1 and not d.check(i):
                    tempList.append(i)
                elif len(i.split(' '))>1:
                    tempList.append(i)
        words = ','.join(tempList)
        resultList.append(words)
    
    return resultList

In [60]:
idList = list(set(df['ID'].values))
textResult = [i.split('.') for i in df[StrCol].values]

dfResult = pd.DataFrame()
dfResult['ID'] = df['ID'].values
dfResult['GROUP'] = df['GROUP'].values
dfResult['ScoredSentence'] = getScoredSentences(textResult)
dfResult['TopSentences'] = getTopSentence(dfResult['ScoredSentence'].values, 0.3)
# dfResult['Sentences'] = dfResult['ScoredSentence'].map(lambda x:'\n'.join([i[1] for i in x]))
dfResult.head()

Total time: 140.1332s


Unnamed: 0,ID,GROUP,ScoredSentence,TopSentences
0,1768,ACHIEVEMENT_1,"[(0.14932332329095063, contributed staff deve...",contributed staff development continuously loo...
1,1768,ACHIEVEMENT_2,"[(0.12957036616532078, provided scenario anal...",provided scenario analysis risks growth|resear...
2,1768,ACHIEVEMENT_3,"[(0.14932332329095063, collaborated world ban...",collaborated world bank team come analysis pol...
3,1768,ACHIEVEMENT_4,"[(0.1748074454489123, led preparation documen...",led preparation documents publications externa...
4,1768,ACHIEVEMENT_5,"[(0.25825011432154193, jek business plan ddin...",jek business plan ddincharge team yeam haris|j...


In [63]:
filename = '2020-12-22_ProcessedKRA.csv'
dfResult.to_csv(filename, index = False)

In [82]:
textResult = [[j.strip() if j!='' else 'nan' for j in i.split('|')] for i in dfResult['TopSentences'].values]
# dfResult['Top10Keywords'] = generateTopKeywords(textResult)

dfResult['Top15KeywordsGensim'] = generateKeywordsGensim(textResult)

dfResult['TopKeywordsGensim'] = filterKeywordsGensim(dfResult['Top15KeywordsGensim'].values)

In [83]:
dfResult.query('Top15KeywordsGensim == ""')['TopSentences'].values

array(['mla requests referred preparation', '|rating sold performance',
       '|rating sold performance', 'rating sold performance',
       'rating sold performance', '|rating sold performance',
       '|rating marginal performance', '|rating sold performance',
       'rating sold performance', '|rating sold performance',
       'reviewed aa applications within required time frame',
       'final presentation hod november',
       'continuing professional development',
       'attended following training',
       'undertake research whistleblowing ombudsman', 'management update',
       'attended jpw transformation workshop',
       'standardised qr will rolled end',
       'staffdevelopment|selfdevelopment',
       'ensured interest bank affiliates protected',
       'negative feedback stakeholders management', 'tides', 'attended',
       'realised capital gain rmm', 'ii|reinsreinst',
       'staff development|selfdevelopment', '|f|faa||aif',
       'attended mfrs budgeting courses',

In [87]:
filename = '2020-12-22_ProcessedKRAwithKeywords.csv'
dfResult.to_csv(filename, index = False)

### Validation

In [2]:
import pandas as pd

filename = "Data/res_forcomparison_v2.xlsx"
dfValidation = pd.read_excel(filename)

In [3]:
dfValidation.columns

Index(['REF_ID', 'ID', 'GROUP', 'ScoredSentence', 'TopSentences',
       'Top15KeywordsGensim', 'TopKeywordsGensim', 'KEYWORDS',
       'KEYWORDS_ACHIEVEMENT', 'ACHIEVEMENT'],
      dtype='object')

In [13]:
print(dfValidation.loc[700, "KEYWORDS"])

myknp, video


In [14]:
print(dfValidation.loc[700, "KEYWORDS_ACHIEVEMENT"])

myknp akpk


In [15]:
print(dfValidation.loc[700, "TopKeywordsGensim"])

fis akpk,launching video myknp,creative ideas building
