In [102]:
import re
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

from BERT_utility import BERT_utility

import random
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [2]:
def get_concepts_from_file(file_path,file_name,source):
    
    list_of_concepts = list()
    
    file = open(file_path, 'r',encoding="utf8",errors = 'ignore') 
    Lines = file.readlines() 
    for line in Lines: 
        entry = line.strip()
        regular_exp_con = 'c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)\|\|t="(.*)"'
        pattern_search = re.search(regular_exp_con, entry, re.IGNORECASE)

        if pattern_search:
            concept_dict = dict()
            concept_dict['source'] = source
            concept_dict['file_name'] = file_name 
            concept_dict['text'] = pattern_search.group(1)
            concept_dict['line_number'] = int(pattern_search.group(2))
            concept_dict['begin_word_num'] = int(pattern_search.group(3))
            concept_dict['end_word_num'] = int(pattern_search.group(4))
            concept_dict['concept_type'] = pattern_search.group(5)
            list_of_concepts.append(concept_dict)
    
    return list_of_concepts

def get_assertions_from_file(file_path,file_name,source):
    
    list_of_assertions = list()
    
    file = open(file_path, 'r',encoding="utf8",errors = 'ignore') 
    Lines = file.readlines() 
    
    for line in Lines: 
        entry = line.strip()
        regular_exp_con = 'c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)\|\|t=".*"\|\|a="(.*)"'
        pattern_search = re.search(regular_exp_con, entry, re.IGNORECASE)

        if pattern_search:
            assertion_dict = dict()
            assertion_dict['source'] = source
            assertion_dict['file_name'] = file_name 
            assertion_dict['text'] = pattern_search.group(1)
            assertion_dict['line_number'] = int(pattern_search.group(2))
            assertion_dict['begin_word_num'] = int(pattern_search.group(3))
            assertion_dict['end_word_num'] = int(pattern_search.group(4))
            assertion_dict['assertion_type'] = pattern_search.group(5)
            list_of_assertions.append(assertion_dict)
    
    return list_of_assertions

def get_relations_from_file(file_path,file_name,source):
    
    list_of_relations = list()
    
    file = open(file_path, 'r',encoding="utf8",errors = 'ignore') 
    Lines = file.readlines() 
    
    for line in Lines: 
        entry = line.strip()
        regular_exp_con = 'c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)\|\|r="(.*)"\|\|c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)'
        pattern_search = re.search(regular_exp_con, entry, re.IGNORECASE)

        if pattern_search:
            relation_dict = dict()
            relation_dict['source'] = source
            relation_dict['file_name'] = file_name 
            relation_dict['from_text'] = pattern_search.group(1)
            relation_dict['from_line_number'] = int(pattern_search.group(2))
            relation_dict['from_begin_word_num'] = int(pattern_search.group(3))
            relation_dict['from_end_word_num'] = int(pattern_search.group(4))
            relation_dict['relation_type'] = pattern_search.group(5)
            relation_dict['to_text'] = pattern_search.group(6)
            relation_dict['to_line_number'] = int(pattern_search.group(7))
            relation_dict['to_begin_word_num'] = int(pattern_search.group(8))
            relation_dict['to_end_word_num'] = int(pattern_search.group(9))
            list_of_relations.append(relation_dict)
    
    return list_of_relations

def create_pos_dict_concept(concept):
    positions = dict()
    
    for index,row in concept.iterrows():
        for i in range(row['begin_word_num'],row['end_word_num']+1):
            positions[str(row['line_number'])+":"+str(i)] = row['concept_type']
    
    return positions

def create_pos_dict_assertion(assertion):
    positions = dict()
    
    for index,row in assertion.iterrows():
        for i in range(row['begin_word_num'],row['end_word_num']+1):
            positions[str(row['line_number'])+":"+str(i)] = row['assertion_type']
    
    return positions

In [3]:
data_file_path = os.path.dirname(os.getcwd()) + r'\Data\concept_assertion_relation_training_data' 
beth_file_path = data_file_path + r'\beth'
partners_file_path = data_file_path + r'\partners'
test_data_file_path = os.path.dirname(os.getcwd()) + r'\Data\reference_standard_for_test_data'
test_data_texts_path =  os.path.dirname(os.getcwd()) + r'\Data\test_data'

list_of_all_concepts = list()

for file in os.listdir(beth_file_path+r'\concept'):
    file_path = os.path.join(beth_file_path+r'\concept', file)
    list_of_all_concepts.extend(get_concepts_from_file(file_path,file[:-4],'beth'))

for file in os.listdir(partners_file_path+r'\concept'):
    file_path = os.path.join(partners_file_path+r'\concept', file)
    list_of_all_concepts.extend(get_concepts_from_file(file_path,file[:-4],'partners'))

for file in os.listdir(test_data_file_path+r'\concepts'):
    file_path = os.path.join(test_data_file_path+r'\concepts', file)
    list_of_all_concepts.extend(get_concepts_from_file(file_path,file[0:-4],'test_data'))

In [4]:
concept_df = pd.DataFrame(list_of_all_concepts)

In [5]:
concept_df.head()

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,concept_type
0,beth,record-105,left basilar atelectasis,55,6,8,problem
1,beth,record-105,ventral hernia,143,1,2,problem
2,beth,record-105,htn,26,0,0,problem
3,beth,record-105,spontaneous echo contrast,68,1,3,problem
4,beth,record-105,cath,21,0,0,test


In [6]:
concept_df.groupby(['concept_type']).size()

concept_type
problem      19665
test         13833
treatment    14188
dtype: int64

In [7]:
list_of_all_assertions = list()

for file in os.listdir(beth_file_path+r'\ast'):
    file_path = os.path.join(beth_file_path+r'\ast', file)
    list_of_all_assertions.extend(get_assertions_from_file(file_path,file[:-4],'beth'))

for file in os.listdir(partners_file_path+r'\ast'):
    file_path = os.path.join(partners_file_path+r'\ast', file)
    list_of_all_assertions.extend(get_assertions_from_file(file_path,file[:-4],'partners'))

for file in os.listdir(test_data_file_path+r'\ast'):
    file_path = os.path.join(test_data_file_path+r'\ast', file)
    list_of_all_assertions.extend(get_assertions_from_file(file_path,file[0:-4],'test_data'))

In [8]:
assertion_df = pd.DataFrame(list_of_all_assertions)

In [9]:
assertion_df.head()

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,assertion_type
0,beth,record-105,left basilar atelectasis,55,6,8,present
1,beth,record-105,ventral hernia,143,1,2,present
2,beth,record-105,htn,26,0,0,present
3,beth,record-105,spontaneous echo contrast,68,1,3,absent
4,beth,record-105,80% lm lesion,21,6,8,present


In [10]:
assertion_df.groupby(['assertion_type']).size()

assertion_type
absent                           4190
associated_with_someone_else      220
conditional                       221
hypothetical                      827
possible                          961
present                         13246
dtype: int64

In [11]:
list(set(list(assertion_df['assertion_type'])))

['conditional',
 'present',
 'possible',
 'associated_with_someone_else',
 'hypothetical',
 'absent']

In [12]:
list_of_all_relations = list()

for file in os.listdir(beth_file_path+r'\rel'):
    file_path = os.path.join(beth_file_path+r'\rel', file)
    list_of_all_relations.extend(get_relations_from_file(file_path,file[0:-4],'beth'))

for file in os.listdir(partners_file_path+r'\rel'):
    file_path = os.path.join(partners_file_path+r'\rel', file)
    list_of_all_relations.extend(get_relations_from_file(file_path,file[0:-4],'partners'))

for file in os.listdir(test_data_file_path+r'\rel'):
    file_path = os.path.join(test_data_file_path+r'\rel', file)
    list_of_all_relations.extend(get_relations_from_file(file_path,file[0:-4],'test_data'))

In [13]:
relation_df = pd.DataFrame(list_of_all_relations)

In [14]:
relation_df.head()

Unnamed: 0,source,file_name,from_text,from_line_number,from_begin_word_num,from_end_word_num,relation_type,to_text,to_line_number,to_begin_word_num,to_end_word_num
0,beth,record-105,cath,21,0,0,TeRP,80% lm lesion,21,6,8
1,beth,record-105,pefusion imaging,19,6,7,TeRP,perfusion defects,19,12,13
2,beth,record-105,drugs,12,8,8,TrCP,known allergies,12,5,6
3,beth,record-105,metal plate,26,7,8,TrAP,gsw,26,11,11
4,beth,record-105,creams,145,14,14,TrNAP,any incisions,145,20,21


In [15]:
relation_df.groupby(['relation_type']).size()

relation_type
PIP      2203
TeCP      504
TeRP     3053
TrAP     2617
TrCP      526
TrIP      203
TrNAP     174
TrWP      133
dtype: int64

In [None]:
sentence_map = dict()

#Treatment improves medical problem
sentence_map['TrIP'] = ["{0} improves {1}.", "{1} can be treated by {0}."]

#Treatment worsens medical problem
sentence_map['TrWP'] = ["{0} worsens {1}.", "{0} does not improve {1}.", "{0} does not cure {1}."]

#Treatment causes medical problem
sentence_map['TrCP'] = ["{0} causes {1}.", "{0} results in {1}.", "{1} is a result of {0}."]

#Treatment is administered for the medical problem
sentence_map['TrAP'] = ["{0} is prescribed for {1}.", "{0} is administered for {1}."]

#Treatment is not administered because of medical problem
sentence_map['TrNAP'] = ["{0} can not be prescribed due to {1}.", "{0} is not administered due to {1}."]

#Treatment is not administered because of medical problem
sentence_map['TrNRP'] = ["{0} and {1} has no relation."]

#Test reveals medical problem
sentence_map['TeRP'] = ["{0} reveals {1}.", "{0} indicates {1}."]

#Test conducted to investigate medical problem
sentence_map['TeCP'] = ["{0} is conducted to check {1}.", "{0} is performed to investigate {1}."]

#Test and problem has no relation
sentence_map['TeNRP'] = ["{0} and {1} has no relation."]

#Medical problem indicates medical problem
sentence_map['PIP'] = ["{1} can cause {0}.", "{0} is a result of {1}"]

#Medical problem indicates medical problem
sentence_map['PNP'] = ["{0} and {1} has no relation."]

In [103]:
nsp_utility = BERT_utility()

relation_encoding_list = list()
relation_label_list = list()

no_rel_count = 0
rel_count = 0
exist_rec_list = list()

all_relations_dict = {"treatment":['TrIP','TrWP','TrCP','TrAP',"TrNRP"], "test":['TeRP','TeCP','TeNRP'], "problem":['PIP','PNP']}
no_relations_dict = {"treatment":"TrNRP", "test":"TeNRP", "problem":"PNP"}

all_problems = concept_df[concept_df['concept_type']=='problem']

for index,row in all_problems.iterrows():
    all_other_entities = concept_df[(concept_df['file_name']==row['file_name'])&(concept_df['source']==row['source'])&(concept_df['line_number']==row['line_number'])&(concept_df['begin_word_num']!=row['begin_word_num'])]
    
    for entity_index,entity_row in all_other_entities.iterrows():
        
        if(key_to_check in exist_rec_list):
            continue
        else:
            exist_rec_list.append(row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"+str(row["begin_word_num"])+"#"+str(entity_row["begin_word_num"]))
        
        relation_df_record = relation_df[(relation_df['file_name']==row['file_name'])&(relation_df['source']==row['source'])&(relation_df['from_line_number']==row['line_number'])&(((relation_df['from_begin_word_num']==row['begin_word_num']) & (relation_df['to_begin_word_num']==entity_row['begin_word_num'])) |((relation_df['from_begin_word_num']==entity_row['begin_word_num']) & (relation_df['to_begin_word_num']==row['begin_word_num'])))]
        
        key_to_check = row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"+str(entity_row["begin_word_num"])+"#"+str(row["begin_word_num"])
        
        if(len(relation_df_record)==0):
            relation = no_relations_dict[row["concept_type"]]
        else:
            relation = relation_df_record.iloc[0]["relation_type"]
        
        remaining_relations = [entry for entry in all_relations_dict[entity_row['concept_type']] if entry != relation]
        
        first_sentence = text_df[text_df['file_name']==row['file_name']].iloc[0]['text'].split("\n")[row['line_number']-1]
        
        for entry in sentence_map[relation]:
            second_sentence = entry.format(entity_row['text'],row['text'])
            
            relation_encoding_list.append(nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence))
            relation_label_list.append(1)
        
        for no_relation in remaining_relations:
            for entry in sentence_map[no_relation]:
                second_sentence = entry.format(entity_row['text'],row['text'])

                relation_encoding_list.append(nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence))
                relation_label_list.append(0)

KeyboardInterrupt: 

In [17]:
no_rel_count

19935

In [18]:
rel_count

9410

In [19]:
len(relation_df)

9413

In [20]:
list_of_all_text = list()
        
for file in tqdm(os.listdir(beth_file_path+r'\txt')):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(beth_file_path+r'\txt', file)
    file_name = file[0:-4]
    oFile = open(file_path, 'r')
    line = oFile.read()
    
    new_text = dict()
    new_text['source'] = 'beth'
    new_text['file_name'] = file_name
    new_text['text'] = line
    
    list_of_all_text.append(new_text)
    
for file in tqdm(os.listdir(partners_file_path+r'\txt')):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(partners_file_path+r'\txt', file)
    file_name = file[0:-4]
    oFile = open(file_path, 'r')
    line = oFile.read()
    
    new_text = dict()
    new_text['source'] = 'partners'
    new_text['file_name'] = file_name
    new_text['text'] = line
    
    list_of_all_text.append(new_text)
    
    
for file in tqdm(os.listdir(test_data_texts_path)):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(test_data_texts_path, file)
    file_name = file.strip(".txt")
    oFile = open(file_path, 'r')
    line = oFile.read()
    
    new_text = dict()
    new_text['source'] = 'test_data'
    new_text['file_name'] = file_name
    new_text['text'] = line
    
    list_of_all_text.append(new_text)

100%|████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 3700.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 97/97 [00:00<00:00, 8076.06it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 256/256 [00:00<00:00, 8827.65it/s]


In [21]:
text_df = pd.DataFrame(list_of_all_text)

In [22]:
all_files = list(text_df['file_name'])

In [23]:
training_files = random.sample(all_files,int(0.8*len(all_files)))

test_files = [entry for entry in all_files if entry not in training_files]

In [29]:
class_map = {"blank":0,"problem":1,"test":2,"treatment":3}

encoding_list = list()
label_list = list()
utility = BERT_utility(get_encodings=True, get_embeddings=False)

for file_name in training_files:
    text = text_df[text_df['file_name']==file_name].iloc[0]['text']
    all_lines = text.split("\n")
    file_concepts = concept_df[concept_df['file_name']==file_name]
    positions = create_pos_dict_concept(file_concepts)
    
    prior_sentence_index = -1
    
    word_list = utility.process_string_finetune(text,0)
    encoding = utility.encoding_list
    
    for i in range(len(all_lines)):
        labels = [0] * len(encoding[i])
        fil_word_list = [word for word in word_list if word["sentence_index"]==i+1] 
        for entry in fil_word_list:
            key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
            if(key in positions):
                for token_position in entry["bert_token_positions"]:
                    labels[token_position] = class_map[positions[key]]
        label_list.append(labels)
    
    encoding_list.extend(encoding)

In [32]:
pickle.dump(training_files,open("C:/Users/itsma/Documents/CS 6120 Project/training_files.pkl","wb"))
pickle.dump(test_files,open("C:/Users/itsma/Documents/CS 6120 Project/test_files.pkl","wb"))
pickle.dump(encoding_list,open("C:/Users/itsma/Documents/CS 6120 Project/input_ids.pkl","wb"))
pickle.dump(label_list,open("C:/Users/itsma/Documents/CS 6120 Project/label.pkl","wb"))

In [None]:
encoding_list = list()
label_list = list()
utility = BERT_utility()

for file in tqdm(os.listdir(beth_file_path+r'\txt')):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(beth_file_path+r'\txt', file)
    file_name = file.strip(".txt")
    oFile = open(file_path, 'r')
    line = oFile.read()
    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='beth')]
    positions = create_pos_dict(file_concepts)
    
    prior_sentence_index = -1
    
    word_list, encoding = utility.process_string_finetune(line,0)
    
    for i in range(len(all_lines)):
        labels = [0] * len(encoding[i])
        fil_word_list = [word for word in word_list if word["sentence_index"]==i+1] 
        for entry in fil_word_list:
            key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
            if(key in positions):
                for token_position in entry["bert_token_positions"]:
                    labels[token_position] = class_map[positions[key]]
        label_list.append(labels)
    
    encoding_list.extend(encoding)

for file in tqdm(os.listdir(partners_file_path+r'\txt')):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(partners_file_path+r'\txt', file)
    file_name = file.strip(".txt")
    oFile = open(file_path, 'r')
    line = oFile.read()
    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='partners')]
    positions = create_pos_dict(file_concepts)
    
    prior_sentence_index = -1
    
    word_list, encoding = utility.process_string_finetune(line,0)
    
    for i in range(len(all_lines)):
        labels = [0] * len(encoding[i])
        
        fil_word_list = [word for word in word_list if word["sentence_index"]==i+1] 
        for entry in fil_word_list:
            key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
            if(key in positions):
                for token_position in entry["bert_token_positions"]:
                    labels[token_position] = class_map[positions[key]]
        label_list.append(labels)
    
    encoding_list.extend(encoding)
    
for file in tqdm(os.listdir(partners_file_path+r'\txt')):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(partners_file_path+r'\txt', file)
    file_name = file.strip(".txt")
    oFile = open(file_path, 'r')
    line = oFile.read()
    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='partners')]
    positions = create_pos_dict(file_concepts)
    
    prior_sentence_index = -1
    
    word_list, encoding = utility.process_string_finetune(line,0)
    
    for i in range(len(all_lines)):
        labels = [0] * len(encoding[i])
        
        fil_word_list = [word for word in word_list if word["sentence_index"]==i+1] 
        for entry in fil_word_list:
            key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
            if(key in positions):
                for token_position in entry["bert_token_positions"]:
                    labels[token_position] = class_map[positions[key]]
        label_list.append(labels)
    
    encoding_list.extend(encoding)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

In [None]:
inputs

In [None]:
encodings = tokenizer.encode("Hello, my dog is cute",add_special_tokens = True)

In [None]:
encodings

In [None]:
[0,1,0,2,0,3,1,0]

In [None]:
tokenizer.convert_ids_to_tokens(encodings)

In [None]:
input_ids = torch.tensor(encodings).long().unsqueeze(0)
        
outputs = model(input_ids,token_type_ids=None)

In [None]:
len(outputs[0][0][5].data.numpy())

In [99]:
from imp import reload

In [100]:
import BERT_utility

In [101]:
reload(BERT_utility)

<module 'BERT_utility' from 'C:\\Users\\itsma\\Documents\\CS 6120 Project\\CS6120\\Code\\BERT_utility.py'>

In [96]:
utility = BERT_utility()
#word_list = utility.process_string_finetune(line,0)

TypeError: 'module' object is not callable

In [59]:
all_words_list = list()

utility = BERT_utility(get_embeddings=True,get_encodings=False,use_finetuned_model=True)

for file_name in tqdm(training_files):
    text = text_df[text_df['file_name']==file_name].iloc[0]['text']
    all_lines = text.split("\n")
    
    file_concepts = concept_df[concept_df['file_name']==file_name]
    concept_positions = create_pos_dict_concept(file_concepts)
    
    file_assertions = assertion_df[assertion_df['file_name']==file_name]
    assertion_positions = create_pos_dict_assertion(file_assertions)
    
    word_list = utility.process_string_finetune(text,0)
    
    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        if(key in concept_positions):
            entry.update({"concept":concept_positions[key]})
            if(concept_positions[key]=='problem'):
                if(key in assertion_positions):
                    entry.update({"assertion":assertion_positions[key]})
                else:
                    entry.update({"assertion":"blank"})
            else:
                entry.update({"assertion":"blank"})
        else:
            entry.update({"concept":"blank"})
            entry.update({"assertion":"blank"})
    
    all_words_list.extend(word_list)

100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [21:13<00:00,  3.75s/it]


In [67]:
all_words_list_test = list()

utility = BERT_utility(get_embeddings=True,get_encodings=False,use_finetuned_model=True)

for file_name in tqdm(test_files):
    text = text_df[text_df['file_name']==file_name].iloc[0]['text']
    all_lines = text.split("\n")
    
    file_concepts = concept_df[concept_df['file_name']==file_name]
    concept_positions = create_pos_dict_concept(file_concepts)
    
    file_assertions = assertion_df[assertion_df['file_name']==file_name]
    assertion_positions = create_pos_dict_assertion(file_assertions)
    
    word_list = utility.process_string_finetune(text,0)
    
    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        if(key in concept_positions):
            entry.update({"concept":concept_positions[key]})
            if(concept_positions[key]=='problem'):
                if(key in assertion_positions):
                    entry.update({"assertion":assertion_positions[key]})
                else:
                    entry.update({"assertion":"blank"})
            else:
                entry.update({"assertion":"blank"})
        else:
            entry.update({"concept":"blank"})
            entry.update({"assertion":"blank"})
    
    all_words_list_test.extend(word_list)

100%|██████████████████████████████████████████████████████████████████████████████████| 86/86 [06:21<00:00,  4.44s/it]


In [60]:
word_df = pd.DataFrame(all_words_list)

In [61]:
word_df.columns

Index(['word', 'keyword_vector', 'sentence_index', 'word_index', 'concept',
       'assertion'],
      dtype='object')

In [62]:
filt_df = word_df[word_df['concept']=='problem']

In [63]:
X_assertion = np.vstack(list(filt_df["keyword_vector"]))                                
y_assertion =list(filt_df["assertion"]) 

In [81]:
X_concept = np.vstack(list(word_df["keyword_vector"]))                                
y_concept =list(word_df["concept"]) 

In [82]:
clf_concept = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_concept, y_concept)



In [83]:
clf_concept.score(X_concept, y_concept)

0.980297626442755

In [65]:
clf_assertion = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_assertion, y_assertion)



In [66]:
clf_assertion.score(X_assertion, y_assertion)

0.9565322515648926

In [68]:
word_df_test = pd.DataFrame(all_words_list_test)

In [86]:
X_concept_test = np.vstack(list(word_df_test["keyword_vector"]))                                
y_concept_test =list(word_df_test["concept"]) 

In [87]:
clf_concept.score(X_concept_test, y_concept_test)

0.9805519175716554

In [88]:
y_predict_concept = clf_concept.predict(X_concept_test)
f1_score(y_concept_test,y_predict_concept,average='macro')

0.9636040579362416

In [69]:
filt_df_test = word_df_test[word_df_test['concept']=='problem']

X_assertion_test = np.vstack(list(filt_df_test["keyword_vector"]))                                
y_assertion_test =list(filt_df_test["assertion"])

In [70]:
clf_assertion.score(X_assertion_test, y_assertion_test)

0.928894883900183

In [74]:
y_predict_assertion = clf_assertion.predict(X_assertion_test)

In [75]:
f1_score(y_assertion_test, y_predict_assertion, average='macro')

0.7205219683677596

In [77]:
labels_assertion = ['hypothetical',
 'conditional',
 'associated_with_someone_else',
 'possible',
 'present',
 'absent']

confusion_matrix(y_assertion_test,y_predict_assertion,labels_assertion)

array([[ 203,    0,    0,    2,   50,   13],
       [   0,   20,    0,    0,   74,    0],
       [   0,    0,   39,    0,   26,    6],
       [  14,    1,    0,  309,  207,   16],
       [   9,   13,    1,  115, 7639,   66],
       [   6,    1,    2,   15,  101, 1431]], dtype=int64)

In [None]:
for file in tqdm(os.listdir(partners_file_path+r'\txt')):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(partners_file_path+r'\txt', file)
    file_name = file.strip(".txt")
    oFile = open(file_path, 'r')
    line = oFile.read()
    
    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='partners')]
    concept_positions = create_pos_dict_concept(file_concepts)
    
    file_assertions = assertion_df[(assertion_df['file_name']==file_name)&(assertion_df['source']=='partners')]
    assertion_positions = create_pos_dict_assertion(file_assertions)
    
    word_list = utility.process_string_finetune(line,0)
    
    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        if(key in concept_positions):
            entry.update({"concept":concept_positions[key]})
            if(concept_positions[key]=='problem'):
                if(key in assertion_positions):
                    entry.update({"assertion":assertion_positions[key]})
                else:
                    entry.update({"assertion":"blank"})
            else:
                entry.update({"assertion":"blank"})
        else:
            entry.update({"concept":"blank"})
            entry.update({"assertion":"blank"})
    
    all_words_list.extend(word_list)

In [None]:
len(all_words_list)

In [None]:
words_df = pd.DataFrame(all_words_list)

In [None]:
len(words_df[words_df['concept']=='problem'])

In [None]:
set(list(words_df['concept']))

In [None]:
test_dict = {"a":"1", "b":"2"}

In [None]:
X_concept = np.vstack(list(words_df["keyword_vector"]))                                
y_concept = words_df["concept"]  

In [None]:
X_assertion = np.vstack(list(filt_df["keyword_vector"]))                                
y_assertion =list(filt_df["assertion"]) 

In [None]:
filt_df = words_df[words_df['assertion']!='blank'].copy()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf_concept = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_concept, y_concept)

In [None]:
count = 0
for i in range(len(y_assertion)):
    
    if(y_assertion[i] not in ['absent', 'associated_with_someone_else', 'conditional', 'hypothetical', 'possible', 'present']):
        count += 1
print(count)

In [None]:
y_assertion

In [None]:
clf_concept.score(X_concept,y_concept)

In [None]:
np.any(np.isnan(X_assertion))

In [None]:
clf_assertion = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_assertion, y_assertion)

In [None]:
clf_assertion.score(X_assertion,y_assertion)

In [None]:
test_data_file_path = os.path.dirname(os.getcwd()) + r'\Data\reference_standard_for_test_data' 

list_of_all_test_concepts = list()

for file in os.listdir(test_data_file_path+r'\concepts'):
    file_path = os.path.join(test_data_file_path+r'\concepts', file)
    list_of_all_test_concepts.extend(get_concepts_from_file(file_path,file.strip(".con"),'test_data'))

In [None]:
test_concept_df = pd.DataFrame(list_of_all_test_concepts)

In [None]:
test_concept_df.head()

In [None]:
test_data_file_path = os.path.dirname(os.getcwd()) + r'\Data\reference_standard_for_test_data' 

list_of_all_test_assertions = list()

for file in os.listdir(test_data_file_path+r'\ast'):
    file_path = os.path.join(test_data_file_path+r'\ast', file)
    list_of_all_test_assertions.extend(get_assertions_from_file(file_path,file[0:-4],'test_data'))

In [None]:
test_assertion_df = pd.DataFrame(list_of_all_test_assertions)

In [None]:
test_assertion_df.head()

In [None]:
all_words_list_test = list()
test_data_texts_path =  os.path.dirname(os.getcwd()) + r'\Data\test_data'
for file in tqdm(os.listdir(test_data_texts_path)):
    if(not file.endswith(".txt")):
        continue
    file_path = os.path.join(test_data_texts_path, file)
    
    file_name = file.strip(".txt")
    
    oFile = open(file_path, 'r')
    
    line = oFile.read()
    
    all_lines = line.split("\n")
    
    file_concepts = test_concept_df[(test_concept_df['file_name']==file_name)&(test_concept_df['source']=='test_data')]
    concept_positions = create_pos_dict_concept(file_concepts)
    
    file_assertions = test_assertion_df[(test_assertion_df['file_name']==file_name)&(test_assertion_df['source']=='test_data')]
    assertion_positions = create_pos_dict_assertion(file_assertions)
    
    word_list = utility.process_string_finetune(line,0)
    
    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
        
        if(key in concept_positions):
            entry.update({"concept":concept_positions[key]})
            
            if(concept_positions[key]=='problem'):
                if(key in assertion_positions):
                    entry.update({"assertion":assertion_positions[key]})
                else:
                    entry.update({"assertion":"blank"})
            else:
                entry.update({"assertion":"blank"})
        else:
            entry.update({"concept":"blank"})
            entry.update({"assertion":"blank"})
    
    all_words_list_test.extend(word_list)

In [None]:
test_word_df = pd.DataFrame(all_words_list_test)

In [None]:
test_filt_df = test_word_df[test_word_df['assertion']!='blank'].copy()

In [None]:
X_test_concept = np.vstack(list(test_word_df["keyword_vector"]))                                
y_test_concept = test_word_df["concept"]  

In [None]:
X_test_assertion = np.vstack(list(test_filt_df["keyword_vector"]))                                
y_test_assertion = test_filt_df["assertion"]  

In [None]:
clf_assertion.score(X_test_assertion,y_test_assertion)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_true = y_test
y_predict = clf.predict(X_test)

In [None]:
y_predict_assertion = clf_assertion.predict(X_test_assertion)

In [None]:
f1_score(y_test_assertion, y_predict_assertion, average='macro')

In [None]:
labels_assertion = ['hypothetical',
 'conditional',
 'associated_with_someone_else',
 'possible',
 'present',
 'absent']

In [None]:
labels = ['blank', 'problem', 'test', 'treatment']

In [None]:
confusion_matrix(y_test_assertion,y_predict_assertion,labels_assertion)

In [89]:
"{1} can be treated {0}".format("Tylenol","headache")

'headache can be treated Tylenol'

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_true, y_predict, average='micro')

In [None]:
[0]*5

In [None]:
tokenizer.convert_ids_to_tokens([101, 2381, 1997, 2556, 7355, 1024, 102])

In [None]:
label_list[87]

In [None]:
encoding_list[87]

In [None]:
len(encoding_list)

In [None]:
import pickle

In [None]:
finetuned_model = pickle.load(open("C:/Users/itsma/Documents/CS 6120 Project/CS6120/Model/finetuned_model.pkl","rb"))

In [None]:
encodings = tokenizer.encode("Hello, my dog is cute",add_special_tokens = True)

In [None]:
finetuned_model.cpu()

In [None]:
input_ids = torch.tensor(encodings).long().unsqueeze(0)
        
outputs = finetuned_model(input_ids,token_type_ids=None)

In [None]:
filt = [entry for entry in label_list if 3 in entry]

In [48]:
bert_model = pickle.load(open("C:/Users/itsma/Documents/CS 6120 Project/CS6120/Model/finetuned_model.pkl","rb"))
bert_model.cpu()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
import pickle

In [44]:
encodings = bert_tokenizer.encode("i love my country",add_special_tokens = True)

In [45]:
from transformers import BertTokenizer, BertConfig

from BERT_config import bert_config
import torch

In [46]:
bert_tokenizer = BertTokenizer.from_pretrained(bert_config['ncbi_base_path'])

In [49]:
input_ids = torch.tensor(encodings).long().unsqueeze(0)

outputs = bert_model(input_ids,token_type_ids=None)

In [52]:
len(outputs[1][12][0])

6

In [None]:
input_ids