<b><h1>Metamap Evaluation tool V1 - part 2</h1></b>
This tool will help to evaluate and compare different metamap behavioural options.
Part 2 - This part contains the code for loading the trained phrase and word2vec models, reading the MetaMap-processed files and the gold standard, and finally running the evaluation on the different MetaMap options.

<h3>Step 0 - Load database parameters</h3>

In [1]:
from configparser import ConfigParser
class database_params:
    """Read database connection settings from an INI file.

    Args:
        filename: Path of the INI file to read.
        section: Name of the section that holds the connection settings.
    """

    def __init__(self, filename='/zfs/dzrptlab/CDSS/config/database.ini',
                 section='postgresql'):
        self.filename = filename
        self.section = section

    def config(self):
        """Return the options of the configured section as a dict.

        Returns:
            dict mapping each option name of the section to its string value.

        Raises:
            Exception: if the section is not present in the file. A missing
                or unreadable file ends up here too, because
                ``ConfigParser.read`` silently skips files it cannot open.
        """
        parser = ConfigParser()
        parser.read(self.filename)

        if not parser.has_section(self.section):
            # Use the instance attributes: the bare names ``section`` and
            # ``filename`` do not exist in this scope.
            raise Exception('Section {0} not found in the {1} file'.format(
                self.section, self.filename))

        return dict(parser.items(self.section))

<h3>Step 1: Get the gold standard papers</h3>
The gold standard can also be stored in a file. In our case everything is stored in PostgreSQL, so we retrieve the data from there.

In [2]:
import psycopg2
class get_paper_ids:
    """Fetch the distinct paper ids that have gold standard terms.

    Args:
        conn: An open database connection exposing ``cursor()``.
    """

    def __init__(self, conn):
        self.conn = conn

    def get_dump(self):
        """Return the rows of distinct ``primary_paper_id`` values.

        Returns:
            The list returned by ``cursor.fetchall()``, or an empty list when
            the query fails (the error is printed, not raised).
        """
        sql = 'select distinct primary_paper_id from gold_standard_terms'
        # Bound up front so a failed query returns an empty result instead of
        # raising UnboundLocalError at the ``return`` below.
        gs_terms = []
        cursor = self.conn.cursor()

        try:
            cursor.execute(sql)
            gs_terms = cursor.fetchall()
        # psycopg2.Error derives from Exception, so this single clause still
        # covers database errors.
        except Exception as error:
            print("Error while fetching data from PostgreSQL", error)
        finally:
            cursor.close()

        return gs_terms

In [3]:

database_conn_obj = database_params()
# Obtain the configuration parameters
params = database_conn_obj.config()
# Connect to the PostgreSQL database
conn = psycopg2.connect(**params)

try:
    get_current_gs_dump_obj = get_paper_ids(conn)
    paper_ids = get_current_gs_dump_obj.get_dump()
finally:
    # Release the connection even when fetching the ids fails.
    conn.close()

OperationalError: FATAL:  could not open file "global/pg_filenode.map": Stale file handle


In [None]:
# Each fetched row is a sequence whose first element is the paper id;
# keep the ids as strings.
paper_ids_list = [str(paper[0]) for paper in paper_ids]

<h3> Step 2: Load the trained model from part 1</h3>
Phrase and word2vec model

In [None]:
from gensim.models.phrases import Phrases
from gensim.models import Word2Vec

class load_models:
    """Load the saved word2vec and phrase models from disk.

    Args:
        word2vec_path: Path handed to ``Word2Vec.load``.
        phrase_path: Path handed to ``Phrases.load``.
    """

    def __init__(self, word2vec_path='corpus_word2vec.model', phrase_path='my_phrase_model.pkl'):
        self.phrase_path = phrase_path
        self.word2vec_path = word2vec_path

    def load(self):
        """Return ``(word2vec model, phrase model)`` loaded from the stored paths."""
        phrases = Phrases.load(self.phrase_path)
        word_vectors = Word2Vec.load(self.word2vec_path)
        return word_vectors, phrases


In [None]:
# Load both models from the default paths of load_models
# ('corpus_word2vec.model' and 'my_phrase_model.pkl'). The two resulting
# globals are used by the later cells.
load_models_obj=load_models()
w2v_model, phrase_model = load_models_obj.load()

<h3> Step 3: Text preprocessing </h3>
Just like part 1

In [None]:
import spacy
import unidecode
from word2number import w2n
from pycontractions import Contractions
import gensim.downloader as api
import json

class text_preprocessing:
    """Turn raw text into a list of cleaned string tokens.

    The constructor loads the spaCy ``en_core_web_sm`` pipeline and the
    ``glove-twitter-25`` vectors used by ``Contractions``, and takes 'no' and
    'not' off spaCy's stop-word list so they are kept in the output.
    """

    def __init__(self):

        self.nlp = spacy.load('en_core_web_sm')

        # Choose model accordingly for contractions function
        self.model = api.load("glove-twitter-25")
        # model = api.load("glove-twitter-100")
        # model = api.load("word2vec-google-news-300")

        self.cont = Contractions(kv_model=self.model)
        self.cont.load_models()

        # exclude words from spacy stopwords list
        deselect_stop_words = ['no', 'not']
        for w in deselect_stop_words:
            self.nlp.vocab[w].is_stop = False

    def remove_whitespace(self, text):
        """Strip the text and collapse every whitespace run to one space."""
        return " ".join(text.strip().split())

    def remove_accented_chars(self, text):
        """Remove accented characters from text, e.g. café --> cafe."""
        return unidecode.unidecode(text)

    def expand_contractions(self, text):
        """Expand shortened words, e.g. don't to do not."""
        return list(self.cont.expand_texts([text], precise=True))[0]

    def call_preprocessing(self, text, accented_chars=True, contractions=True,
                           convert_num=True, extra_whitespace=True,
                           lemmatization=True, lowercase=True, punctuations=True,
                           remove_num=True, special_chars=True,
                           stop_words=True):
        """Preprocess text; every step is enabled by default.

        Args:
            text: The raw text to clean.
            accented_chars: Replace accented characters.
            contractions: Expand contractions.
            convert_num: Convert number words (tokens tagged NUM) to digits.
                Only reachable when ``remove_num`` is false.
            extra_whitespace: Collapse repeated whitespace.
            lemmatization: Replace tokens by their lemma.
            lowercase: Lower-case the text before tokenising.
            punctuations: Drop tokens tagged PUNCT.
            remove_num: Drop tokens tagged NUM or made only of numeric chars.
            special_chars: Drop tokens tagged SYM.
            stop_words: Drop stop words (numbers are exempt from this step).

        Returns:
            list of str: the tokens that survived, in document order.
        """
        if extra_whitespace:
            text = self.remove_whitespace(text)
        if accented_chars:
            text = self.remove_accented_chars(text)
        if contractions:
            text = self.expand_contractions(text)
        if lowercase:
            text = text.lower()

        doc = self.nlp(text)  # tokenise text
        clean_text = []

        for token in doc:
            # --- removal steps: the first matching rule drops the token ---
            if stop_words and token.is_stop and token.pos_ != 'NUM':
                continue
            if punctuations and token.pos_ == 'PUNCT':
                continue
            if special_chars and token.pos_ == 'SYM':
                continue
            if remove_num and (token.pos_ == 'NUM' or token.text.isnumeric()):
                continue

            # --- rewriting steps for tokens that are kept ---
            edit = token.text
            if convert_num and token.pos_ == 'NUM':
                try:
                    # Stored as text: callers join the tokens with ' ', which
                    # fails on the int that word_to_num returns.
                    edit = str(w2n.word_to_num(token.text))
                except ValueError:
                    # Not a parsable number word; keep the raw token rather
                    # than aborting the whole document.
                    edit = token.text
            # NOTE(review): "-PRON-" looks like the spaCy 2.x placeholder
            # lemma for pronouns - confirm against the installed version.
            elif lemmatization and token.lemma_ != "-PRON-":
                edit = token.lemma_

            if edit != "":
                clean_text.append(edit)
        return clean_text



<h3> Step 4: Defining Metamap parser </h3>

In [None]:
import json

class parse_metamap_json_new:
    """Parser for MetaMap JSON output files.

    Args:
        text_preprocessing_obj: Object whose ``call_preprocessing(text)``
            returns the cleaned tokens of a matched candidate string.
    """

    def __init__(self, text_preprocessing_obj):
        self.text_preprocessing_obj = text_preprocessing_obj

    def get_CandidatePreferred(self, file):
        """Collect the preferred names of all mapping candidates in a file.

        The input file should be in JSON format, i.e. the output of MetaMap.

        Nested levels that are walked:

        -AllDocuments
        --Document
        ---Utterances
        ----Phrases
        -----Mappings
        ------MappingCandidates
        -------CandidatePreferred

        Returns:
            dict keyed by PMID (the PMID of the document's last utterance, or
            0 when the document has no utterances). Each value maps the
            preprocessed matched text to
            ``{CandidatePreferred: [CandidateCUI, CandidateScore]}``; the
            first occurrence of a preferred name wins.
        """
        with open(file) as f:
            data = json.load(f)

        result_by_pmid = {}

        for document in data['AllDocuments']:
            pmid = 0
            matched_to_preferred = {}
            for utterance in document['Document']['Utterances']:
                pmid = utterance['PMID']
                for phrase in utterance['Phrases']:
                    for mapping in phrase['Mappings']:
                        for candidate in mapping['MappingCandidates']:
                            tokens = self.text_preprocessing_obj.call_preprocessing(
                                candidate['CandidateMatched'])
                            matched_text = ' '.join(tokens)
                            # Nothing left after cleaning: ignore the candidate
                            if matched_text == '':
                                continue
                            preferred_name = candidate['CandidatePreferred']
                            preferred_entries = matched_to_preferred.setdefault(matched_text, {})
                            if preferred_name not in preferred_entries:
                                preferred_entries[preferred_name] = [
                                    candidate['CandidateCUI'],
                                    candidate['CandidateScore'],
                                ]

            result_by_pmid[pmid] = matched_to_preferred
        return result_by_pmid

<h3>Step 5: Gold standard dump </h3>
This will help us to get all gold standard terms from the database
<ol>
    <li>define get gold standard dump class</li>
    <li>get the dump</li>
    <li>preprocess gold standard using text preprocessing class</li>
    <li>form phrases using loaded phrase model</li>
</ol>


In [None]:
import psycopg2
class get_gs_dump:
    """Fetch every (paper id, gold standard term) pair from the database.

    Args:
        conn: An open database connection exposing ``cursor()``.
    """

    def __init__(self, conn):
        self.conn = conn

    def get_dump(self):
        """Return all ``(primary_paper_id, gs_term)`` rows.

        Returns:
            The list returned by ``cursor.fetchall()``, or an empty list when
            the query fails (the error is printed, not raised).
        """
        sql = 'select primary_paper_id,gs_term from gold_standard_terms'
        # Bound up front so a failed query returns an empty result instead of
        # raising UnboundLocalError at the ``return`` below.
        gs_terms = []
        cursor = self.conn.cursor()

        try:
            cursor.execute(sql)
            gs_terms = cursor.fetchall()
        # psycopg2.Error derives from Exception, so this single clause still
        # covers database errors.
        except Exception as error:
            print("Error while fetching data from PostgreSQL", error)
        finally:
            cursor.close()

        return gs_terms

In [None]:
#get dump
database_conn_obj = database_params()
# Obtain the configuration parameters
params = database_conn_obj.config()
# Connect to the PostgreSQL database
conn = psycopg2.connect(**params)

try:
    get_gs_dump_obj = get_gs_dump(conn)
    gs_dump = get_gs_dump_obj.get_dump()
finally:
    # Release the connection even when fetching the dump fails.
    conn.close()

# Group the gold standard terms by paper id (ids are kept as strings).
gs_dump_dict = {}
for item in gs_dump:
    gs_dump_dict.setdefault(str(item[0]), []).append(item[1])

In [None]:
#preprocess gold standard
# The preprocessor must exist before the loop below can use it; it is
# expensive to build because the constructor loads the spaCy and GloVe models.
text_preprocessing_obj = text_preprocessing()

preprocessed_gold_standard = {}

for paper_id, gs_list in gs_dump_dict.items():
    # Keep only the terms that still have tokens after cleaning.
    cleaned_terms = []
    for term in gs_list:
        clean = text_preprocessing_obj.call_preprocessing(term)
        if clean:
            cleaned_terms.append(clean)
    # Papers whose terms were all cleaned away are left out entirely.
    if cleaned_terms:
        preprocessed_gold_standard[paper_id] = cleaned_terms

In [None]:
#pass it through phrases model
# Run every token list through the phrase model, keeping terms grouped by paper.
gold_standard_phrases = {
    pmid: [phrase_model[term] for term in terms]
    for pmid, terms in preprocessed_gold_standard.items()
}

In [None]:
#get unique gold standard

unique_gold_standard = {}
for pmid, terms in gold_standard_phrases.items():
    # dict.fromkeys drops repeated terms while keeping first-seen order
    distinct_terms = dict.fromkeys(' '.join(term) for term in terms)
    unique_gold_standard[pmid] = [joined.split(' ') for joined in distinct_terms]
    

<h3> Step 6: Get mean vector class </h3>
Supporting class for calculating the vector of a term by taking the mean of all word vectors present in that term

In [None]:
import numpy as np
class get_mean_vector_class:
    """Helper that averages the word vectors of a multi-word term."""

    def __init__(self):
        pass

    def get_mean_vector(self, word2vec_model, words, dims=(150,)):
        """Return the mean of the vectors of ``words``.

        Args:
            word2vec_model: Model exposing ``wv.key_to_index`` and
                ``wv.get_vector`` (gensim 4 style).
            words: The tokens of one term.
            dims: Shape of the accumulator; must match the model's vectors.

        Returns:
            The mean vector as a numpy array, the string
            ``"vocab not present"`` when any word is out of vocabulary, or
            the string ``"nothing"`` when ``words`` is empty.
        """
        keyed_vectors = word2vec_model.wv

        # A single out-of-vocabulary word disqualifies the whole term.
        # In gensim 3.8 the lookup was ``word in word2vec_model.wv.vocab``.
        for word in words:
            if word not in keyed_vectors.key_to_index:
                return "vocab not present"

        if len(words) < 1:
            return "nothing"

        total = np.zeros(dims)
        for word in words:
            # Read from the model that was passed in, not from a global.
            total = total + keyed_vectors.get_vector(word)
        return total / len(words)

<h3> Step 7: Read all the metamap processed files</h3> 
This step includes reading metamap processed files, and passing the text through text preprocessing and phrase model.

In [None]:
#get list of all files: use your directory
import glob
# Collect every path directly inside the directory (pattern ends in '/*').
metamap_out_files=glob.glob('/zfs/dzrptlab/CDSS/data/metamap_data/metamap_different_options/test_5_27_41_abs/*')

In [None]:
#multiprocessing class for preprocessing and forming phrases of all the metamap files text

import glob
from multiprocessing import Pool, Manager
import itertools

class preprocess_and_phrases:
    """Parse MetaMap output files in parallel and apply the phrase model.

    Args:
        metamap_out_files: Paths of the MetaMap output files. When omitted,
            the default directory is globbed at instantiation time.
        pool_size: Number of worker processes.
    """

    DEFAULT_METAMAP_GLOB = '/zfs/dzrptlab/CDSS/data/metamap_data/metamap_different_options/test_5_27_41_abs/*'

    def __init__(self, metamap_out_files=None, pool_size=5):
        # Resolve the default here rather than in the signature: a default
        # argument is evaluated once, when the class is defined.
        if metamap_out_files is None:
            metamap_out_files = glob.glob(self.DEFAULT_METAMAP_GLOB)
        self.metamap_out_files = metamap_out_files
        self.pool_size = pool_size

    def f(self, file, results):
        """Process one file and store its phrases under the file's base name.

        Relies on the module-level ``parse_metamap_json_new_obj`` and
        ``phrase_model`` objects.
        """
        option_name = file.split('/')[-1]
        print('\n' + option_name)

        # Parsing also runs the text preprocessing on every matched candidate
        parsed = parse_metamap_json_new_obj.get_CandidatePreferred(file)

        # phrases
        phrases_by_pmid = {}
        for pmid, candidates in parsed.items():
            phrases_by_pmid[pmid] = [
                phrase_model[candidate.split(' ')] for candidate in candidates
            ]

        results[option_name] = phrases_by_pmid

    def func(self, a_b):
        """Unpack a ``(file, results)`` tuple so ``Pool.map`` can call ``f``."""
        return self.f(*a_b)

    def run(self):
        """Process all files and return ``{option name: {pmid: phrases}}``.

        Returns:
            A plain dict copied out of the shared dict, so the result stays
            usable after the manager process has been shut down.
        """
        # Both context managers stop their helper processes on exit.
        with Manager() as manager:
            results = manager.dict()
            with Pool(processes=self.pool_size) as pool:
                pool.map(self.func,
                         zip(self.metamap_out_files, itertools.repeat(results)))
            return dict(results)

In [None]:
#define your own pool size
# preprocess_and_phrases.f looks up this parser as a module-level name, so it
# has to exist before the workers start. It reuses the text preprocessor that
# the gold standard cells already rely on.
parse_metamap_json_new_obj = parse_metamap_json_new(text_preprocessing_obj)

preprocess_and_phrases_obj = preprocess_and_phrases(pool_size=10)
metamap_different_options = preprocess_and_phrases_obj.run()

<h3> Step 8: Define Similarity function </h3>
Function for cosine similarity

In [None]:
class calc_cosine_similarity_class:
    """Helper that computes the cosine similarity of two vectors."""

    def __init__(self):
        pass

    def calc_cosine_similarity(self, vA, vB):
        """Return dot(vA, vB) divided by the product of both vector lengths."""
        length_a = np.sqrt(np.dot(vA, vA))
        length_b = np.sqrt(np.dot(vB, vB))
        dot_product = np.dot(vA, vB)
        return dot_product / (length_a * length_b)

In [None]:
#instantiate the objects
# Module-level helper instances; evaluate_results_top_n looks them up by
# these exact names.
get_mean_vector_class_obj = get_mean_vector_class()
calc_cosine_similarity_class_obj = calc_cosine_similarity_class()

<h3> Step 9: Run evaluation </h3>
This will have different functions required to do the evaluation between metamap files and gold standard.
At the end it prints the results on the screen and also generates a CSV file with the option names and their results (reported as fractions between 0 and 1).

In [None]:
import bisect

#1-n mappings
def evaluate_results_top_n(metamap_different_options, gold_standard_phrases, top_n):
    """Find, for every MetaMap candidate, its ``top_n`` closest gold terms.

    Relies on the module-level ``w2v_model``, ``get_mean_vector_class_obj``
    and ``calc_cosine_similarity_class_obj`` objects.

    Args:
        metamap_different_options: ``{option name: {pmid: [candidate tokens]}}``.
        gold_standard_phrases: ``{pmid: [gold term tokens]}``; must contain
            every pmid that occurs in the MetaMap data.
        top_n: Number of best matches kept per candidate (expected >= 1).

    Returns:
        ``{option name: {pmid: {candidate text: [scores, gold terms]}}}``
        where ``scores`` is sorted ascending and ``gold terms`` is aligned
        with it. Candidates or gold terms without a usable vector are skipped.
    """
    final_dict = {}
    for option_name, documents in metamap_different_options.items():
        print('\nMetamap option: ' + option_name)

        document_dict = {}
        for pmid, matched_candidates in documents.items():
            # Vectorise the gold terms once per document instead of once per
            # candidate. get_mean_vector signals "no vector" with a string.
            gold_vectors = []
            for gold in gold_standard_phrases[pmid]:
                gold_vector = get_mean_vector_class_obj.get_mean_vector(w2v_model, gold)
                if isinstance(gold_vector, str):
                    continue
                gold_vectors.append((' '.join(gold), gold_vector))

            terms_cosine_score_dict = {}
            for candidate in matched_candidates:
                term_vector = get_mean_vector_class_obj.get_mean_vector(w2v_model, candidate)
                if isinstance(term_vector, str):
                    continue
                candidate_key = ' '.join(candidate)

                for gold_key, gold_vector in gold_vectors:
                    cos_sim_score = calc_cosine_similarity_class_obj.calc_cosine_similarity(
                        term_vector, gold_vector)

                    entry = terms_cosine_score_dict.get(candidate_key)
                    if entry is None:
                        terms_cosine_score_dict[candidate_key] = [[cos_sim_score], [gold_key]]
                        continue

                    scores, gold_terms = entry
                    # Once full, only a score above the current minimum
                    # (the first element) can enter the list.
                    if len(scores) >= top_n and not cos_sim_score > scores[0]:
                        continue

                    # Insert into both lists at the same index so that they
                    # stay aligned.
                    insertion_point = bisect.bisect(scores, cos_sim_score)
                    scores.insert(insertion_point, cos_sim_score)
                    gold_terms.insert(insertion_point, gold_key)

                    # Drop the lowest entries when the list overflows.
                    overflow = len(scores) - top_n
                    if top_n > 0 and overflow > 0:
                        del scores[:overflow]
                        del gold_terms[:overflow]

            document_dict[pmid] = terms_cosine_score_dict

        final_dict[option_name] = document_dict
    return final_dict

In [None]:
# Keep the 3 best gold standard matches per candidate, scored against the
# de-duplicated gold standard.
answer_dict=evaluate_results_top_n(metamap_different_options, unique_gold_standard, 3)

In [None]:
def calc_eval_metrics_by_gs_top_n(input_dict, total_mappings, gold_set,
                                  exact_threshold=0.85, similar_threshold=0.65):
    """Classify candidate/gold pairs of one document by cosine score.

    Args:
        input_dict: ``{candidate text: [scores, gold terms]}`` with both
            lists aligned index by index.
        total_mappings: Denominator for the exact and similar ratios.
        gold_set: Set of gold terms. It is modified in place: every gold term
            with an exact or similar match is removed.
        exact_threshold: Scores at or above this count as exact.
        similar_threshold: Scores strictly between this value and
            ``exact_threshold`` count as similar.

    Returns:
        Tuple of: exact ratio, similar ratio (both rounded to 2 decimals, 0.0
        when ``total_mappings`` is 0), the remaining ``gold_set``,
        ``{candidate: [gold term, score]}`` for exact matches, the same for
        similar matches, the set of exactly matched gold terms, the set of
        similarly matched gold terms, and the set of gold terms seen in pairs
        below the similar threshold.
    """
    exact = 0
    similar = 0
    exact_set = {}
    similar_set = {}

    exact_set_by_gs = set()
    similar_set_by_gs = set()
    missing_gs_set_by_gs = set()

    for k, v in input_dict.items():
        for score, gold_term in zip(v[0], v[1]):
            if score >= exact_threshold:
                exact += 1
                exact_set[k] = [gold_term, round(score, 2)]
                exact_set_by_gs.add(gold_term)
                gold_set.discard(gold_term)
            elif similar_threshold < score < exact_threshold:
                similar += 1
                similar_set[k] = [gold_term, round(score, 2)]
                similar_set_by_gs.add(gold_term)
                gold_set.discard(gold_term)
            else:
                missing_gs_set_by_gs.add(gold_term)

    # A document without any candidate has no mappings to divide by.
    if total_mappings:
        exact_ratio = round(exact / total_mappings, 2)
        similar_ratio = round(similar / total_mappings, 2)
    else:
        exact_ratio = 0.0
        similar_ratio = 0.0

    return (exact_ratio, similar_ratio, gold_set, exact_set, similar_set,
            exact_set_by_gs, similar_set_by_gs, missing_gs_set_by_gs)

In [None]:
import pandas as pd
def run_eval_metrics_by_gs_top_n(input_dict, top_n, gold_standard_phrases, myfile, out_csv):
    """Report gold standard coverage per MetaMap option and save a CSV summary.

    For every option and document the share of gold terms with an exact
    match, with only a similar match, and with no match is printed and
    written to ``myfile``. The per-option totals are written to ``out_csv``.

    Args:
        input_dict: Output of ``evaluate_results_top_n``.
        top_n: The ``top_n`` that was used to build ``input_dict``.
        gold_standard_phrases: ``{pmid: [gold term tokens]}``.
        myfile: Writable text file object that receives the detailed log.
        out_csv: Path of the CSV file to write.
    """
    def _ratio(count, total):
        # Avoid ZeroDivisionError for documents or options without gold terms.
        return round(count / total, 2) if total else 0.0

    options = []
    overall_exact_by_gs_list = []
    overall_similar_by_gs_list = []
    overall_missing_by_gs_list = []

    for option_name, documents in input_dict.items():
        options.append(option_name)
        print('\nMetamap option: ' + option_name)
        myfile.write('\nMetamap option: %s\n' % option_name)

        overall_exact_by_gs = 0
        overall_similar_by_gs = 0
        overall_missing_gold_by_gs = 0
        overall_gold = 0

        for pmid, candidates in documents.items():
            gold_set = {' '.join(gold) for gold in gold_standard_phrases[pmid]}

            # Pass a copy: the metrics helper removes matched terms from the
            # set it receives, and the full set is still needed below.
            (_, _, missing_gold, exact_set, similar_set,
             exact_set_by_gs, similar_set_by_gs, _) = calc_eval_metrics_by_gs_top_n(
                candidates, len(candidates) * top_n, gold_set.copy())

            # A gold term with an exact match is not counted as similar too.
            similar_only_by_gs = similar_set_by_gs.difference(exact_set_by_gs)
            gold_count = len(gold_set)

            summary = (
                f'\nFor PMID {pmid} ---- Exact matches by gs: '
                f'{_ratio(len(exact_set_by_gs), gold_count)}'
                f', Similar matches by gs: {_ratio(len(similar_only_by_gs), gold_count)}'
                f', missing gold terms (TN): {_ratio(len(missing_gold), gold_count)}'
            )
            print(summary)
            myfile.write(summary)
            myfile.write('\nExact matched metamap to gold standard pairs: ' + str(exact_set))
            myfile.write('\nSimilar matched metamap to gold standard pairs: ' + str(similar_set))
            myfile.write('\nMissing gold standard: ' + str(missing_gold))

            overall_exact_by_gs += len(exact_set_by_gs)
            overall_similar_by_gs += len(similar_only_by_gs)
            overall_missing_gold_by_gs += len(missing_gold)
            overall_gold += gold_count

        exact_ratio = _ratio(overall_exact_by_gs, overall_gold)
        similar_ratio = _ratio(overall_similar_by_gs, overall_gold)
        missing_ratio = _ratio(overall_missing_gold_by_gs, overall_gold)

        overall_summary = ('Overall ---- exact by gs % =' + str(exact_ratio)
                           + ', similar by gs % = ' + str(similar_ratio)
                           + ', missing gold terms (TN) % = ' + str(missing_ratio))
        print(overall_summary)
        # Start on a new line so the summary is not glued to the previous
        # "Missing gold standard" entry in the log.
        myfile.write('\n' + overall_summary)

        overall_exact_by_gs_list.append(exact_ratio)
        overall_similar_by_gs_list.append(similar_ratio)
        overall_missing_by_gs_list.append(missing_ratio)

    df = pd.DataFrame({
        'options': options,
        'overall_exact_by_gs_list': overall_exact_by_gs_list,
        'overall_similar_by_gs_list': overall_similar_by_gs_list,
        'overall_missing_by_gs_list': overall_missing_by_gs_list,
    })

    df.to_csv(out_csv, index=False)

In [None]:
# The with-block closes the log file even when the evaluation raises.
with open('2_option_top_3_cosine_by_gs.log', 'w') as myfile:
    run_eval_metrics_by_gs_top_n(answer_dict, 3, gold_standard_phrases, myfile, '2_option_top_3_cosine_by_gs_results.csv')