In [23]:
import pandas as pd

In [24]:
!pip install elasticsearch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [25]:
from __future__ import unicode_literals
from elasticsearch import Elasticsearch

import json

def get_elastic():
    return Elasticsearch(
        [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}], 
        timeout=30, 
        max_retries=1, 
        retry_on_timeout=False
        )

# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)

def query_search(q_dic, index_name, n_search=1000, elastic=None):
    '''
    q_dic: query dictionary in Elastic format ready to search 
    index_name: index in Elastic where to search
    '''
    if elastic is None:
        elastic = get_elastic()
    #body = json.dumps(q_dic, ensure_ascii=False)
    search_result = elastic.search(index=index_name, body=q_dic, size=n_search, from_=0)

    return search_result

def result2list_unique(result_dict, field):
    lst = []
    if result_dict['hits']['total']['value'] > 0:
        for hit in result_dict['hits']['hits']:
            if hit['_source'][field] not in lst: 
                lst.append(hit['_source'][field])
    return lst

def result2list_(result_dict, field):
    lst = []
    if result_dict['hits']['total']['value'] > 0:
        for hit in result_dict['hits']['hits']:
            lst.append(hit['_source'][field])
    return lst


def get_umls2sab_query(source_id, target_sab, lang):
    '''
    source_id: UMLS ID (CUI) to search
    target_sab: ontology mapped to this ID
    '''
    q_dic = {
        "query": { 
            "bool": { 
            "must": [
                {"match": {"CODE": source_id}},
                {"match": {"SAB": target_sab}}, 
                {"match": {"LAT": lang}}
                ]
                    }
                }
            }
    return q_dic

def query_and_search(code, index): 
    result_list = []
    q_dic = get_umls2sab_query(code, 'SCTSPA', 'SPA')
    result = query_search(q_dic, index)
    if result['hits']['total']['value'] > 0:
        for hit in result['hits']['hits']: 
            result_list.append(hit['_source'])

    return result_list

def get_umlstext_query(term, index, lang):
    '''
    source_id: UMLS ID (CUI) to search
    target_sab: ontology mapped to this ID
    '''
    q_dic = {
        "query": { 
            "bool": { 
            "must": [
                {"match": {
                  "STR": term
                }}, 
                {"match": {"LAT": lang}}
                ]
                    }
                }
            }
    return q_dic

def query_and_search_text(term, index): 
    result_list = []
    q_dic = get_umlstext_query(term, 'UMLS', 'ENG')
    result = query_search(q_dic, index)
    if result['hits']['total']['value'] > 0:
        for hit in result['hits']['hits']: 
            result_list.append(hit['_source'])

    return result_list

In [19]:
with open("vicuna_links/bg_test_multi_conll_result_0813v113b.txt", "r") as file1:
    lines = file1.readlines()

In [20]:
lines[0:10]

['дългогодишна B-MEDTERM\n',
 'хипертония I-MEDTERM\n',
 ', O\n',
 'има O\n',
 'често O\n',
 'главоболие B-MEDTERM\n',
 ', O\n',
 'сърцебиене B-MEDTERM\n',
 'и O\n',
 'лесно O\n']

In [21]:
conll_entities = []
current_entity = ''
for line in lines:
    line = line.replace('\n', '')
    if len(line) > 0:
        line_parts = line.split(' ')
        if line_parts[1].startswith('B-'):
            if current_entity != '':
                conll_entities.append(current_entity)
            current_entity = str(line_parts[0])
        elif line_parts[1].startswith('I-'):
            current_entity += ' ' + str(line_parts[0])
        else:
            if current_entity != '':
                conll_entities.append(current_entity)
                current_entity = ''
    else:
        if current_entity != '':
            conll_entities.append(current_entity)
            current_entity = ''
                
if current_entity != '':
    conll_entities.append(current_entity)
    current_entity = ''

In [22]:
len(conll_entities)

370

In [23]:
conll_entities[0:10]

['дългогодишна хипертония',
 'главоболие',
 'сърцебиене',
 'заморяване',
 'стягане в сърдечната област',
 'сърцебиене',
 'гадене',
 'карцином на простатата',
 'сухота',
 'сутрин']

In [43]:
import pandas as pd

df_terms = pd.read_csv('bg_test_0928v113b_llamaentities_nofuzzy_linked_sap.tsv', sep='\t', header=0)

In [44]:
df_terms.shape

(329, 9)

In [4]:
#df_terms['links'] = '' #df_terms['code']

In [45]:
pred_unique = set(df_terms['text_segment'].unique())

In [None]:
#true_unique = set(df_gold['span'].unique())

In [34]:
len(pred_unique.intersection(true_unique))

119

In [36]:
len(true_unique)

232

In [37]:
119/232

0.5129310344827587

In [10]:
links = []
for index, row in df_terms.iterrows():
    results = query_and_search_text(row['translation'], 'umls')
    link = ''
    if len(results) > 0:        
        link = 'UMLS:' + results[0]['CUI']
    
    links.append(link)

  return Elasticsearch(
  search_result = elastic.search(index=index_name, body=q_dic, size=n_search, from_=0)


In [46]:
#df_terms['links'] = links
df_terms['links'] = df_terms['code'] 
df_terms.head()

Unnamed: 0,document_id,sentence_id,doc_start_index,doc_end_index,sentence_entity_start,sentence_entity_end,text_segment,translation,code,links
0,0,0,0,239,0,23,дългогодишна хипертония,headache,UMLS:C0018681,UMLS:C0018681
1,0,0,0,239,36,46,главоболие,palpitations,UMLS:C0030252,UMLS:C0030252
2,0,0,0,239,49,59,сърцебиене,dizziness,UMLS:C0042571,UMLS:C0042571
3,0,0,0,239,137,147,сърцебиене,tightness in the chest,UMLS:C0232292,UMLS:C0232292
4,0,0,0,239,62,78,лесно заморяване,nausea,UMLS:C0027497,UMLS:C0027497


In [47]:
df_test = df_terms

In [48]:
df_test['filename'] = ''
df_test['mark'] = 'T1'
df_test['label'] = 'ENFERMEDAD'
df_test['semantic_rel'] = 'EXACT'
df_test['off0'] = df_test['sentence_entity_start']
df_test['off1'] = df_test['sentence_entity_end']
df_test['span'] = df_test['text_segment']
df_test['code'] = df_test['links']

for index, row in df_test.iterrows():
    df_test.at[index, 'filename'] = str(row['document_id']) + '_' + str(row['sentence_id'])

In [49]:
df_test_new = df_test.drop(['document_id', 'sentence_id', 'doc_start_index', 'doc_end_index', 'sentence_entity_start', 'sentence_entity_end', 'text_segment', 'links'], axis=1)

In [50]:
df_test_new = df_test_new[['filename','mark','label','off0','off1',
                                           'span', 'code', 'semantic_rel']]

In [51]:
df_test_new[df_test_new['span'] == 'no-match'].index
df_test_new = df_test_new.drop(df_test_new[df_test_new['span'] == 'no-match'].index)

In [52]:
df_test_new['add'] = False
df_test_new['sort_column'] = df_test_new['span'].str.len()
df_test_new = df_test_new.sort_values(by=['filename', 'sort_column'], ascending=False)

In [53]:
df_test_groups = df_test_new.groupby(by=["filename"])

for group in df_test_groups:
    added_indices = []
    for index, row in group[1].iterrows():
        if row['off0'] not in added_indices and row['off1'] not in added_indices:
            added_indices.extend(range(row['off0'], row['off1']))
            df_test_new.at[index, 'add'] = True

In [54]:
df_test_new = df_test_new[df_test_new['add']]
df_test_new = df_test_new.sort_values(by=['filename', 'off0', 'off1'])
df_test_new = df_test_new[['filename','mark','label','off0','off1',
                                           'span', 'code', 'semantic_rel']]
df_test_new.head()

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0018681,EXACT
1,0_0,T1,ENFERMEDAD,36,46,главоболие,UMLS:C0030252,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0042571,EXACT
4,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,UMLS:C0027497,EXACT
5,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,UMLS:C0376358,EXACT


In [55]:
for index, row in df_test_new.iterrows():
    if row['span'].endswith('.') or row['span'].endswith(','):
        df_test_new.at[index, 'span'] = row['span'][0:-1]
        df_test_new.at[index, 'off1'] = row['off1']-1

In [56]:
df_test_new.shape

(306, 8)

In [57]:
df_test_new.to_csv('vicuna_links/distemist_evaluation_library/toy-data/subtask-linking/bg_test_entities_llama_sap_linked.tsv', sep='\t', index=False)

In [65]:
df_gold = pd.read_csv('bg_gold_standard_entities-translated.tsv', sep='\t', header=0)
#df_gold = pd.read_csv('bg_gold_standard_entities.tsv', sep='\t', header=0)
df_gold.head()

Unnamed: 0,filename,span,off0,off1,translation,code
0,0,дългогодишна хипертония,0,23,long -standing hypertension,
1,0,често главоболие,30,46,frequent headache,
2,0,сърцебиене,49,59,palpitations,
3,0,лесно заморяване,62,78,ease,
4,0,стягане в сърдечната област,107,134,tightening in the heart area,


In [66]:
df_gold.shape

(305, 6)

In [68]:
df_gold_manaul.shape

(305, 8)

In [69]:
for index, row in df_gold_manaul.iterrows():    
    df_gold.at[index, 'off0'] = row['off0']
    df_gold.at[index, 'off1'] = row['off1']
    
df_gold.head()

Unnamed: 0,filename,span,off0,off1,translation,code
0,0,дългогодишна хипертония,0,23,long -standing hypertension,
1,0,често главоболие,30,46,frequent headache,
2,0,сърцебиене,49,59,palpitations,
3,0,лесно заморяване,62,78,ease,
4,0,стягане в сърдечната област,107,134,tightening in the heart area,


In [28]:
links = []
texts = []
for index, row in df_gold.iterrows():
    results = query_and_search_text(row['translation'], 'umls')
    link = ''
    text = ''
    if len(results) > 0:        
        link = 'UMLS:' + results[0]['CUI']
        text = results[0]['STR']
    
    links.append(link)
    texts.append(text)

  return Elasticsearch(
  search_result = elastic.search(index=index_name, body=q_dic, size=n_search, from_=0)


In [29]:
df_gold['code'] = links
df_gold['code_text'] = texts
df_gold.head()

Unnamed: 0,filename,span,off0,off1,translation,code,code_text
0,0,дългогодишна хипертония,0,23,long -standing hypertension,UMLS:C0444344,Stride long-standing
1,0,често главоболие,30,46,frequent headache,UMLS:C0948396,Frequent headache
2,0,сърцебиене,49,59,palpitations,UMLS:C0030252,Palpitations
3,0,лесно заморяване,62,78,ease,UMLS:C1331418,Ease
4,0,стягане в сърдечната област,107,134,tightening in the heart area,UMLS:C0423604,Tightening pain


In [30]:
df_gold.to_csv('bg_gold_standard_entities-translated_codes.tsv', sep='\t', index=False)

In [197]:
df_gold = pd.read_csv('bg_gold_standard_entities-manual_translated.tsv', sep='\t', header=0)
#df_gold = pd.read_csv('bg_gold_standard_entities-manual.tsv', sep='\t', header=0)

df_gold.head(30)

Unnamed: 0,filename,mark,label,off0,off1,span,true_code,semantic_rel,translation
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0020538,EXACT,long -standing hypertension
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,UMLS:C0948396,EXACT,frequent headache
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT,palpitations
3,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,UMLS:C1331418,EXACT,ease
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,UMLS:C0232292,EXACT,tightening in the heart area
5,0_0,T1,ENFERMEDAD,137,147,сърцебиене,UMLS:C0030252,EXACT,palpitations
6,0_0,T1,ENFERMEDAD,150,156,гадене,UMLS:C0027497,EXACT,nausea
7,0_0,T1,ENFERMEDAD,167,189,карцином на простатата,UMLS:C0496923,EXACT,prostate
8,0_0,T1,ENFERMEDAD,224,236,химиотерапия,UMLS:C0392920,EXACT,chemotherapy
9,11_0,T1,ENFERMEDAD,0,14,захарен диабет,UMLS:C0011847,EXACT,diabetes


In [198]:
df_gold = df_gold.sort_values(by=['filename', 'off0', 'off1'])

In [199]:
df_gold.shape

(305, 9)

In [200]:
df_gold_manaul = df_gold_manaul.sort_values(by=['filename', 'off0', 'off1'])

In [201]:
df_gold_manaul.shape

(305, 8)

In [202]:
df_gold_manaul.head(30)

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,,EXACT
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,,EXACT
3,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,,EXACT
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,,EXACT
5,0_0,T1,ENFERMEDAD,137,147,сърцебиене,,EXACT
6,0_0,T1,ENFERMEDAD,150,156,гадене,,EXACT
7,0_0,T1,ENFERMEDAD,167,189,карцином на простатата,,EXACT
8,0_0,T1,ENFERMEDAD,224,236,химиотерапия,,EXACT
115,10_0,T1,ENFERMEDAD,14,24,главоболие,,EXACT


In [203]:
df_gold.head(30)

Unnamed: 0,filename,mark,label,off0,off1,span,true_code,semantic_rel,translation
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0020538,EXACT,long -standing hypertension
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,UMLS:C0948396,EXACT,frequent headache
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT,palpitations
3,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,UMLS:C1331418,EXACT,ease
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,UMLS:C0232292,EXACT,tightening in the heart area
5,0_0,T1,ENFERMEDAD,137,147,сърцебиене,UMLS:C0030252,EXACT,palpitations
6,0_0,T1,ENFERMEDAD,150,156,гадене,UMLS:C0027497,EXACT,nausea
7,0_0,T1,ENFERMEDAD,167,189,карцином на простатата,UMLS:C0496923,EXACT,prostate
8,0_0,T1,ENFERMEDAD,224,236,химиотерапия,UMLS:C0392920,EXACT,chemotherapy
84,10_0,T1,ENFERMEDAD,14,24,главоболие,UMLS:C0018681,EXACT,headache


In [185]:
#for index, row in df_gold.iterrows():    
#    filter_file = df_gold_manaul['filename'] == row['filename']
#    filter_span = df_gold_manaul['span'] == row['span']
#    true_rows = df_gold_manaul[filter_file & filter_span]
    
#    if true_rows.shape[0] == 0:
#        print(row)
#    true_row = true_rows.iloc[0]
#    df_gold.at[index, 'off0'] = true_row['off0']
#    df_gold.at[index, 'off1'] = true_row['off1']
    
#df_gold.head(30)

In [186]:
#df_gold['filename'] = df_gold['filename'].astype(str)

In [187]:
#for index, row in df_gold.iterrows():
#    new_val = row['filename'] + '_0'
#    df_gold.at[index, 'filename'] = new_val

In [207]:
!pip install Levenshtein


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [209]:
import Levenshtein

def select_best_candidate(text, results):
    max_ratio = 0
    max_candidate = results[0]['CUI']
    max_candidate_text = results[0]['STR']
    
    for result in results:
        ratio = Levenshtein.ratio(text, result['STR'])
        
        if ratio > max_ratio:
            max_ratio = ratio
            max_candidate = result['CUI']
            max_candidate_text = result['STR']
            
    return max_candidate, max_candidate_text

In [None]:
links = []
texts = []
for index, row in df_gold.iterrows():
    results = query_and_search_text(row['translation'], 'umls')
    link = ''
    text = ''
    if len(results) > 0:        
        #link = 'UMLS:' + results[0]['CUI']
        link, text = select_best_candidate(row['translation'], results)
        link = 'UMLS:' + link
        #text = results[0]['STR']
    
    links.append(link)
    texts.append(text)

In [219]:
df_gold['code'] = links
df_gold['code_text'] = texts

In [220]:
df_gold['mark'] = 'T1'
df_gold['label'] = 'ENFERMEDAD'
df_gold['semantic_rel'] = 'EXACT'


In [221]:
df_gold = df_gold[['filename', 'mark', 'label', 'off0', 'off1', 'span', 'code', 'semantic_rel']]

In [222]:
df_gold = df_gold.sort_values(by=['filename', 'off0', 'off1'])

In [223]:
df_gold.head(30)

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,,EXACT
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,,EXACT
3,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,,EXACT
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,,EXACT
5,0_0,T1,ENFERMEDAD,137,147,сърцебиене,,EXACT
6,0_0,T1,ENFERMEDAD,150,156,гадене,,EXACT
7,0_0,T1,ENFERMEDAD,167,189,карцином на простатата,,EXACT
8,0_0,T1,ENFERMEDAD,224,236,химиотерапия,,EXACT
84,10_0,T1,ENFERMEDAD,14,24,главоболие,,EXACT


In [216]:
#df_gold.to_csv('bg_gold_standard_entities-manual_norm.tsv', sep='\t', index=False)
df_gold.to_csv('bg_gold_standard_entities-predicted_lev_norm.tsv', sep='\t', index=False)

In [100]:
#df_gold_manaul = pd.read_csv('bg_text_manual.csv', header=0)

df_gold_manaul = pd.read_csv('bg_gold_standard_entities.tsv', sep='\t', header=0)
df_gold_manaul.head()

Unnamed: 0,filename,span,off0,off1
0,0,дългогодишна хипертония,0,23
1,0,често главоболие,30,46
2,0,сърцебиене,49,59
3,0,лесно заморяване,62,78
4,0,стягане в сърдечната област,107,134


In [54]:
#df_gold_manaul['span'] = list(df_gold['span'].values)

In [37]:
#df_gold_manaul = df_gold_manaul[['filename', 'span', 'off0', 'off1', 'true_code']]

In [38]:
#df_gold_manaul = df_gold_manaul.rename(columns={"true_code": "code"})

In [101]:
df_gold_manaul.head()
#filename	mark	label	off0	off1	span	code	semantic_rel

Unnamed: 0,filename,span,off0,off1
0,0,дългогодишна хипертония,0,23
1,0,често главоболие,30,46
2,0,сърцебиене,49,59
3,0,лесно заморяване,62,78
4,0,стягане в сърдечната област,107,134


In [102]:
df_gold_manaul['mark'] = 'T1'
df_gold_manaul['label'] = 'ENFERMEDAD'
df_gold_manaul['semantic_rel'] = 'EXACT'


In [103]:
df_gold_manaul['code'] = ''

In [104]:
df_gold_manaul = df_gold_manaul[['filename', 'mark', 'label', 'off0', 'off1', 'span', 'code', 'semantic_rel']]

In [105]:
df_gold_manaul['filename'] = df_gold_manaul['filename'].astype(str)

In [106]:
for index, row in df_gold_manaul.iterrows():
    #new_val = row['filename'].replace('0', '') + '0'
    #if new_val.startswith('_'):
    #    new_val = '0' + new_val
    new_val = row['filename'] + '_0'
    df_gold_manaul.at[index, 'filename'] = new_val

In [107]:
df_gold_manaul = df_gold_manaul.sort_values(by=['filename', 'off0', 'off1'])

In [108]:
df_gold_manaul.head(30)

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,,EXACT
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,,EXACT
3,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,,EXACT
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,,EXACT
5,0_0,T1,ENFERMEDAD,137,147,сърцебиене,,EXACT
6,0_0,T1,ENFERMEDAD,150,156,гадене,,EXACT
7,0_0,T1,ENFERMEDAD,167,189,карцином на простатата,,EXACT
8,0_0,T1,ENFERMEDAD,224,236,химиотерапия,,EXACT
115,10_0,T1,ENFERMEDAD,14,24,главоболие,,EXACT


In [109]:
df_gold_manaul.to_csv('bg_gold_standard_entities-manual_ner.tsv', sep='\t', index=False)

In [16]:
df_gold_manual_norm = pd.read_csv('bg_gold_standard_entities-manual_norm.tsv', sep='\t', header=0)
df_gold_manual_norm.head()

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0020538,EXACT
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,UMLS:C0948396,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT
3,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT
4,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,UMLS:C1331418,EXACT


In [17]:
df_gold_pred_norm = pd.read_csv('vicuna_links/distemist_evaluation_library/toy-data/subtask-linking/bg_gold_standard_entities-predicted_lev_norm.tsv', sep='\t', header=0)
df_gold_pred_norm.head()

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0597290,EXACT
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,UMLS:C0948396,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT
3,0_0,T1,ENFERMEDAD,62,78,лесно заморяване,UMLS:C1331418,EXACT
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,UMLS:C0011974,EXACT


In [18]:
df_gold_pred_norm['off0'] = df_gold_manual_norm['off0']
df_gold_pred_norm['off1'] = df_gold_manual_norm['off1']

In [19]:
df_gold_pred_norm.head()

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0597290,EXACT
1,0_0,T1,ENFERMEDAD,30,46,често главоболие,UMLS:C0948396,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT
3,0_0,T1,ENFERMEDAD,49,59,лесно заморяване,UMLS:C1331418,EXACT
4,0_0,T1,ENFERMEDAD,62,78,стягане в сърдечната област,UMLS:C0011974,EXACT


In [20]:
df_gold_pred_norm.to_csv('vicuna_links/distemist_evaluation_library/toy-data/subtask-linking/bg_gold_standard_entities-predicted_lev_norm.tsv', sep='\t', index=False)

In [27]:
df_pred_sap = pd.read_csv('vicuna_links/distemist_evaluation_library/toy-data/subtask-linking/bg_test_entities_sap_link.tsv', sep='\t', header=0)

In [29]:
df_pred_sap.head(50)

Unnamed: 0,filename,mark,label,off0,off1,span,code,semantic_rel
0,0_0,T1,ENFERMEDAD,0,23,дългогодишна хипертония,UMLS:C0745114,EXACT
1,0_0,T1,ENFERMEDAD,36,46,главоболие,UMLS:C0018681,EXACT
2,0_0,T1,ENFERMEDAD,49,59,сърцебиене,UMLS:C0030252,EXACT
3,0_0,T1,ENFERMEDAD,68,78,заморяване,UMLS:C0232292,EXACT
4,0_0,T1,ENFERMEDAD,107,134,стягане в сърдечната област,UMLS:C0027497,EXACT
5,0_0,T1,ENFERMEDAD,137,147,сърцебиене,UMLS:C0042571,EXACT
6,0_0,T1,ENFERMEDAD,150,156,гадене,UMLS:C0376358,EXACT
7,0_0,T1,ENFERMEDAD,167,189,карцином на простатата,UMLS:C0013217,EXACT
8,10_0,T1,ENFERMEDAD,14,24,главоболие,UMLS:C0018681,EXACT
9,10_0,T1,ENFERMEDAD,27,40,виене на свят,UMLS:C0042571,EXACT


In [41]:
df_pred_sap_nonum = df_pred_sap[~df_pred_sap['span'].str.contains("1|10|19|20|3|4|5|6|7|8|9|0")]

In [42]:
df_pred_sap_nonum.to_csv('vicuna_links/distemist_evaluation_library/toy-data/subtask-linking/bg_test_entities_sap_link_nonum.tsv', sep='\t', index=False)