In [1]:
import re
import os
import pandas as pd
from tqdm import tqdm
import numpy as np

from BERT_utility import BERT_utility

import random
import torch
import pickle
from keras.preprocessing.sequence import pad_sequences

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

Using TensorFlow backend.


In [2]:
def get_concepts_from_file(file_path,file_name,source):
    """Parse one concept annotation file into a list of record dicts.

    A line is kept when it matches
    ``c="<text>" <line>:<begin> <line>:<end>||t="<type>"`` (case-insensitive);
    every other line is skipped.

    Args:
        file_path: Path of the annotation file to read (decoded as UTF-8,
            undecodable bytes are dropped).
        file_name: Value stored under ``'file_name'`` in every record.
        source: Value stored under ``'source'`` in every record.

    Returns:
        A list of dicts with the keys ``source``, ``file_name``, ``text``,
        ``line_number``, ``begin_word_num``, ``end_word_num`` and
        ``concept_type``, in file order. The line number is taken from the
        start position; the end position only contributes its word index.

    Raises:
        ValueError: if a matched numeric field is empty (the pattern uses
            ``[0-9]*``, which also matches zero digits).
    """
    # Raw string: "\|" is not a valid escape sequence in a plain str literal.
    # Compiled once instead of being rebuilt for every line.
    concept_pattern = re.compile(
        r'c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)\|\|t="(.*)"',
        re.IGNORECASE,
    )

    list_of_concepts = list()

    # The context manager closes the handle; the original left one open per file.
    with open(file_path, 'r', encoding="utf8", errors='ignore') as annotation_file:
        for line in annotation_file:
            pattern_search = concept_pattern.search(line.strip())
            if not pattern_search:
                continue

            list_of_concepts.append({
                'source': source,
                'file_name': file_name,
                'text': pattern_search.group(1),
                'line_number': int(pattern_search.group(2)),
                'begin_word_num': int(pattern_search.group(3)),
                'end_word_num': int(pattern_search.group(4)),
                'concept_type': pattern_search.group(5),
            })

    return list_of_concepts

def get_assertions_from_file(file_path,file_name,source):
    """Parse one assertion annotation file into a list of record dicts.

    A line is kept when it matches
    ``c="<text>" <line>:<begin> <line>:<end>||t="<type>"||a="<assertion>"``
    (case-insensitive); every other line is skipped. The concept type in
    ``t="..."`` is matched but not stored.

    Args:
        file_path: Path of the annotation file to read (decoded as UTF-8,
            undecodable bytes are dropped).
        file_name: Value stored under ``'file_name'`` in every record.
        source: Value stored under ``'source'`` in every record.

    Returns:
        A list of dicts with the keys ``source``, ``file_name``, ``text``,
        ``line_number``, ``begin_word_num``, ``end_word_num`` and
        ``assertion_type``, in file order.

    Raises:
        ValueError: if a matched numeric field is empty (the pattern uses
            ``[0-9]*``, which also matches zero digits).
    """
    # Raw string: "\|" is not a valid escape sequence in a plain str literal.
    # Compiled once instead of being rebuilt for every line.
    assertion_pattern = re.compile(
        r'c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)\|\|t=".*"\|\|a="(.*)"',
        re.IGNORECASE,
    )

    list_of_assertions = list()

    # The context manager closes the handle; the original left one open per file.
    with open(file_path, 'r', encoding="utf8", errors='ignore') as annotation_file:
        for line in annotation_file:
            pattern_search = assertion_pattern.search(line.strip())
            if not pattern_search:
                continue

            list_of_assertions.append({
                'source': source,
                'file_name': file_name,
                'text': pattern_search.group(1),
                'line_number': int(pattern_search.group(2)),
                'begin_word_num': int(pattern_search.group(3)),
                'end_word_num': int(pattern_search.group(4)),
                'assertion_type': pattern_search.group(5),
            })

    return list_of_assertions

def get_relations_from_file(file_path,file_name,source):
    """Parse one relation annotation file into a list of record dicts.

    A line is kept when it matches
    ``c="<from>" <line>:<begin> <line>:<end>||r="<type>"||c="<to>" <line>:<begin> <line>:<end>``
    (case-insensitive); every other line is skipped.

    Args:
        file_path: Path of the annotation file to read (decoded as UTF-8,
            undecodable bytes are dropped).
        file_name: Value stored under ``'file_name'`` in every record.
        source: Value stored under ``'source'`` in every record.

    Returns:
        A list of dicts with the keys ``source``, ``file_name``,
        ``from_text``, ``from_line_number``, ``from_begin_word_num``,
        ``from_end_word_num``, ``relation_type``, ``to_text``,
        ``to_line_number``, ``to_begin_word_num`` and ``to_end_word_num``,
        in file order.

    Raises:
        ValueError: if a matched numeric field is empty (the pattern uses
            ``[0-9]*``, which also matches zero digits).
    """
    # Raw string: "\|" is not a valid escape sequence in a plain str literal.
    # Compiled once instead of being rebuilt for every line.
    relation_pattern = re.compile(
        r'c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)\|\|r="(.*)"\|\|c="(.*)" ([0-9]*):([0-9]*) [0-9]*:([0-9]*)',
        re.IGNORECASE,
    )

    list_of_relations = list()

    # The context manager closes the handle; the original left one open per file.
    with open(file_path, 'r', encoding="utf8", errors='ignore') as annotation_file:
        for line in annotation_file:
            pattern_search = relation_pattern.search(line.strip())
            if not pattern_search:
                continue

            list_of_relations.append({
                'source': source,
                'file_name': file_name,
                'from_text': pattern_search.group(1),
                'from_line_number': int(pattern_search.group(2)),
                'from_begin_word_num': int(pattern_search.group(3)),
                'from_end_word_num': int(pattern_search.group(4)),
                'relation_type': pattern_search.group(5),
                'to_text': pattern_search.group(6),
                'to_line_number': int(pattern_search.group(7)),
                'to_begin_word_num': int(pattern_search.group(8)),
                'to_end_word_num': int(pattern_search.group(9)),
            })

    return list_of_relations

def create_pos_dict_concept(concept):
    """Map every annotated word position to its concept type.

    Args:
        concept: DataFrame with the columns ``line_number``,
            ``begin_word_num``, ``end_word_num`` and ``concept_type``.

    Returns:
        dict keyed by ``"<line_number>:<word_index>"`` for each word index
        from ``begin_word_num`` to ``end_word_num`` inclusive; the value is
        the row's ``concept_type``. Later rows overwrite earlier ones on
        the same key.
    """
    word_labels = {}

    for _, record in concept.iterrows():
        # +1 because end_word_num is the index of the last word of the span.
        word_span = range(record['begin_word_num'], record['end_word_num'] + 1)
        word_labels.update({
            str(record['line_number']) + ":" + str(word_index): record['concept_type']
            for word_index in word_span
        })

    return word_labels

def create_pos_dict_assertion(assertion):
    """Map every annotated word position to its assertion type.

    Args:
        assertion: DataFrame with the columns ``line_number``,
            ``begin_word_num``, ``end_word_num`` and ``assertion_type``.

    Returns:
        dict keyed by ``"<line_number>:<word_index>"`` for each word index
        from ``begin_word_num`` to ``end_word_num`` inclusive; the value is
        the row's ``assertion_type``. Later rows overwrite earlier ones on
        the same key.
    """
    word_labels = {}

    for _, record in assertion.iterrows():
        # +1 because end_word_num is the index of the last word of the span.
        word_span = range(record['begin_word_num'], record['end_word_num'] + 1)
        word_labels.update({
            str(record['line_number']) + ":" + str(word_index): record['assertion_type']
            for word_index in word_span
        })

    return word_labels

In [3]:
# Corpus folders, resolved relative to the parent of the current working directory.
# os.path.join picks the platform separator; the original hard-coded "\" and so
# only produced valid paths on Windows (where the result is unchanged).
_data_root = os.path.join(os.path.dirname(os.getcwd()), 'Data')
data_file_path = os.path.join(_data_root, 'concept_assertion_relation_training_data')
beth_file_path = os.path.join(data_file_path, 'beth')
partners_file_path = os.path.join(data_file_path, 'partners')
test_data_file_path = os.path.join(_data_root, 'reference_standard_for_test_data')
test_data_texts_path = os.path.join(_data_root, 'test_data')

list_of_all_concepts = list()

# (directory of concept files, value stored in the `source` column).
# The test set folder is named "concepts" (plural), unlike the training folders.
concept_dirs = [
    (os.path.join(beth_file_path, 'concept'), 'beth'),
    (os.path.join(partners_file_path, 'concept'), 'partners'),
    (os.path.join(test_data_file_path, 'concepts'), 'test_data'),
]

for concept_dir, source in concept_dirs:
    for file in os.listdir(concept_dir):
        file_path = os.path.join(concept_dir, file)
        # splitext drops whatever extension the file has; the original
        # `file[:-4]` silently assumed a dot plus exactly three characters.
        list_of_all_concepts.extend(get_concepts_from_file(file_path, os.path.splitext(file)[0], source))

In [4]:
concept_df = pd.DataFrame(list_of_all_concepts)

In [5]:
concept_df.head()

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,concept_type
0,beth,record-105,left basilar atelectasis,55,6,8,problem
1,beth,record-105,ventral hernia,143,1,2,problem
2,beth,record-105,htn,26,0,0,problem
3,beth,record-105,spontaneous echo contrast,68,1,3,problem
4,beth,record-105,cath,21,0,0,test


In [6]:
concept_df[(concept_df['file_name']=='record-105')&(concept_df['source']=='beth')&(concept_df['line_number']==143)&(concept_df['begin_word_num']!=1)]

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,concept_type
10,beth,record-105,right facial plating,143,6,8,treatment
11,beth,record-105,htn,143,0,0,problem
31,beth,record-105,right nephrectomy,143,3,4,treatment


In [7]:
concept_df.groupby(['concept_type']).size()

concept_type
problem      19665
test         13833
treatment    14188
dtype: int64

In [8]:
list_of_all_assertions = list()

# (directory of assertion files, value stored in the `source` column).
# os.path.join replaces the hard-coded "\" separator of the original.
assertion_dirs = [
    (os.path.join(beth_file_path, 'ast'), 'beth'),
    (os.path.join(partners_file_path, 'ast'), 'partners'),
    (os.path.join(test_data_file_path, 'ast'), 'test_data'),
]

for assertion_dir, source in assertion_dirs:
    for file in os.listdir(assertion_dir):
        file_path = os.path.join(assertion_dir, file)
        # splitext drops whatever extension the file has; the original
        # `file[:-4]` silently assumed a dot plus exactly three characters.
        list_of_all_assertions.extend(get_assertions_from_file(file_path, os.path.splitext(file)[0], source))

In [9]:
assertion_df = pd.DataFrame(list_of_all_assertions)

In [10]:
assertion_df.head()

Unnamed: 0,source,file_name,text,line_number,begin_word_num,end_word_num,assertion_type
0,beth,record-105,left basilar atelectasis,55,6,8,present
1,beth,record-105,ventral hernia,143,1,2,present
2,beth,record-105,htn,26,0,0,present
3,beth,record-105,spontaneous echo contrast,68,1,3,absent
4,beth,record-105,80% lm lesion,21,6,8,present


In [12]:
assertion_df.groupby(['assertion_type']).size()

assertion_type
absent                           4190
associated_with_someone_else      220
conditional                       221
hypothetical                      827
possible                          961
present                         13246
dtype: int64

In [13]:
list(set(list(assertion_df['assertion_type'])))

['hypothetical',
 'conditional',
 'absent',
 'possible',
 'present',
 'associated_with_someone_else']

In [14]:
list_of_all_relations = list()

# (directory of relation files, value stored in the `source` column).
# os.path.join replaces the hard-coded "\" separator of the original.
relation_dirs = [
    (os.path.join(beth_file_path, 'rel'), 'beth'),
    (os.path.join(partners_file_path, 'rel'), 'partners'),
    (os.path.join(test_data_file_path, 'rel'), 'test_data'),
]

for relation_dir, source in relation_dirs:
    for file in os.listdir(relation_dir):
        file_path = os.path.join(relation_dir, file)
        # splitext drops whatever extension the file has; the original
        # `file[0:-4]` silently assumed a dot plus exactly three characters.
        list_of_all_relations.extend(get_relations_from_file(file_path, os.path.splitext(file)[0], source))

In [15]:
relation_df = pd.DataFrame(list_of_all_relations)

In [16]:
relation_df.head()

Unnamed: 0,source,file_name,from_text,from_line_number,from_begin_word_num,from_end_word_num,relation_type,to_text,to_line_number,to_begin_word_num,to_end_word_num
0,beth,record-105,cath,21,0,0,TeRP,80% lm lesion,21,6,8
1,beth,record-105,pefusion imaging,19,6,7,TeRP,perfusion defects,19,12,13
2,beth,record-105,drugs,12,8,8,TrCP,known allergies,12,5,6
3,beth,record-105,metal plate,26,7,8,TrAP,gsw,26,11,11
4,beth,record-105,creams,145,14,14,TrNAP,any incisions,145,20,21


In [17]:
relation_df.groupby(['relation_type']).size()

relation_type
PIP      2203
TeCP      504
TeRP     3053
TrAP     2617
TrCP      526
TrIP      203
TrNAP     174
TrWP      133
dtype: int64

In [18]:
list_of_all_text = list()

# (directory holding the raw .txt notes, value stored in the `source` column)
text_dirs = [
    (os.path.join(beth_file_path, 'txt'), 'beth'),
    (os.path.join(partners_file_path, 'txt'), 'partners'),
    (test_data_texts_path, 'test_data'),
]

for text_dir, source in text_dirs:
    for file in tqdm(os.listdir(text_dir)):
        if(not file.endswith(".txt")):
            continue
        file_path = os.path.join(text_dir, file)

        # Cut the suffix by length. The test-data loop used file.strip(".txt"),
        # which removes any leading/trailing '.', 't' or 'x' characters
        # (e.g. "text-1.txt" -> "ext-1") instead of removing the extension.
        file_name = file[:-len(".txt")]

        # NOTE(review): no encoding is given, so the platform default is used,
        # whereas the annotation files are read as UTF-8 with errors ignored.
        with open(file_path, 'r') as oFile:
            line = oFile.read()

        list_of_all_text.append({
            'source': source,
            'file_name': file_name,
            'text': line,
        })

100%|█████████████████████████████████████████████████████████████████████████████████| 74/74 [00:00<00:00, 200.59it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 97/97 [00:00<00:00, 215.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 256/256 [00:01<00:00, 212.88it/s]


In [19]:
text_df = pd.DataFrame(list_of_all_text)

In [23]:
all_files = list(text_df['file_name'])

# NOTE(review): absolute, machine-specific path; pickle.load can execute
# arbitrary code, so only load files produced by this project.
with open("C:/Users/itsma/Documents/CS 6120 Project/training_files.pkl","rb") as training_files_handle:
    training_files = pickle.load(training_files_handle)

# Set lookup keeps the filter linear instead of scanning training_files per entry.
_training_file_set = set(training_files)
test_files = [entry for entry in all_files if entry not in _training_file_set]

In [24]:
# Candidate "second sentence" templates per relation type (several phrasings each).
# Where templates are filled further down, {0} receives the other entity's text
# and {1} the problem's text.
# NOTE(review): the next cell rebinds `sentence_map` from scratch, so this
# multi-template version (including the TrNRP/TeNRP/PNP keys) is discarded when
# the notebook is run top to bottom.
sentence_map = dict()

#Treatment improves medical problem
sentence_map['TrIP'] = ["{0} improves {1}.", "{1} can be treated by {0}."]

#Treatment worsens medical problem
sentence_map['TrWP'] = ["{0} worsens {1}.", "{0} does not improve {1}.", "{0} does not cure {1}."]

#Treatment causes medical problem
sentence_map['TrCP'] = ["{0} causes {1}.", "{0} results in {1}.", "{1} is a result of {0}."]

#Treatment is administered for the medical problem
sentence_map['TrAP'] = ["{0} is prescribed for {1}.", "{0} is administered for {1}."]

#Treatment is not administered because of medical problem
sentence_map['TrNAP'] = ["{0} can not be prescribed due to {1}.", "{0} is not administered due to {1}."]

#Treatment and medical problem have no relation
sentence_map['TrNRP'] = ["{0} and {1} has no relation."]

#Test reveals medical problem
sentence_map['TeRP'] = ["{0} reveals {1}.", "{0} indicates {1}."]

#Test conducted to investigate medical problem
sentence_map['TeCP'] = ["{0} is conducted to check {1}.", "{0} is performed to investigate {1}."]

#Test and problem has no relation
sentence_map['TeNRP'] = ["{0} and {1} has no relation."]

#Medical problem indicates medical problem
sentence_map['PIP'] = ["{1} can cause {0}.", "{0} is a result of {1}"]

#Medical problem and medical problem have no relation
sentence_map['PNP'] = ["{0} and {1} has no relation."]

In [25]:
# One "second sentence" template per relation type. Where templates are filled
# further down, {0} receives the other entity's text and {1} the problem's text.
sentence_map = dict()

#Treatment improves medical problem
sentence_map['TrIP'] = ["{0} improves {1}."]

#Treatment worsens medical problem
sentence_map['TrWP'] = ["{0} worsens {1}."]

#Treatment causes medical problem
sentence_map['TrCP'] = ["{0} causes {1}."]

#Treatment is administered for the medical problem
sentence_map['TrAP'] = ["{0} is administered for {1}."]

#Treatment is not administered because of medical problem
sentence_map['TrNAP'] = ["{0} is not administered due to {1}."]

#Generic "has relation" statement. The relation cell below uses this single
#template for every entity pair and encodes related / unrelated in the 0/1 label.
sentence_map['NoREL'] = ["{0} and {1} has relation."]

#Test reveals medical problem
sentence_map['TeRP'] = ["{0} reveals {1}."]

#Test conducted to investigate medical problem
sentence_map['TeCP'] = ["{0} is conducted to investigate {1}."]

#Medical problem indicates medical problem
sentence_map['PIP'] = ["{1} indicates {0}."]

In [27]:
# Build binary "has relation" training pairs: for every problem concept in a
# training file, pair it with each other concept on the same line, embed
# (sentence, "<entity> and <problem> has relation.") and label it 1 when the
# relation table contains the pair, else 0.
nsp_utility = BERT_utility()

relation_encoding_list = list()
relation_label_list = list()

no_rel_count = 0
rel_count = 0
# A set gives O(1) membership; the original list was scanned for every pair.
exist_rec_list = set()

all_relations_dict = {"treatment":['TrIP','TrWP','TrCP','TrAP','TrNAP','NoREL'], "test":['TeRP','TeCP','NoREL'], "problem":['PIP','NoREL']}

all_problems = concept_df[(concept_df['concept_type']=='problem') & (concept_df['file_name'].isin(training_files))]

# The same template is used for related and unrelated pairs (the original had
# two branches that both selected it); only the label differs.
has_relation_template = sentence_map['NoREL'][0]

for index,row in all_problems.iterrows():
    same_line_other_start = (
        (concept_df['file_name']==row['file_name'])
        & (concept_df['source']==row['source'])
        & (concept_df['line_number']==row['line_number'])
        & (concept_df['begin_word_num']!=row['begin_word_num'])
    )
    all_other_entities = concept_df[same_line_other_start]

    pair_prefix = row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"

    for entity_index,entity_row in all_other_entities.iterrows():

        # Skip a pair whose mirror image (entity first, problem second) was already emitted.
        key_to_check = pair_prefix+str(entity_row["begin_word_num"])+"#"+str(row["begin_word_num"])
        if(key_to_check in exist_rec_list):
            continue
        exist_rec_list.add(pair_prefix+str(row["begin_word_num"])+"#"+str(entity_row["begin_word_num"]))

        # A relation row matches when it lies on the same line and links the
        # two start positions in either direction.
        same_sentence = (
            (relation_df['file_name']==row['file_name'])
            & (relation_df['source']==row['source'])
            & (relation_df['from_line_number']==row['line_number'])
        )
        problem_to_entity = (
            (relation_df['from_begin_word_num']==row['begin_word_num'])
            & (relation_df['to_begin_word_num']==entity_row['begin_word_num'])
        )
        entity_to_problem = (
            (relation_df['from_begin_word_num']==entity_row['begin_word_num'])
            & (relation_df['to_begin_word_num']==row['begin_word_num'])
        )
        relation_df_record = relation_df[same_sentence & (problem_to_entity | entity_to_problem)]

        if(len(relation_df_record)==0):
            relation = 'NoREL'
        else:
            relation = relation_df_record.iloc[0]["relation_type"]

        # NOTE(review): the text lookup filters on file_name only, while the
        # concept/relation lookups also filter on source.
        first_sentence = text_df[text_df['file_name']==row['file_name']].iloc[0]['text'].split("\n")[row['line_number']-1]
        second_sentence = has_relation_template.format(entity_row['text'],row['text'])

        # Compute the encoding first and append both lists together so a
        # failure here cannot leave labels and encodings out of step.
        pair_encoding = nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence)
        relation_encoding_list.append(pair_encoding)
        relation_label_list.append(0 if relation=='NoREL' else 1)
        

In [35]:
# Build per-pair relation classification inputs: for every problem concept in
# a training file, pair it with each other concept on the same line, mask both
# mentions in the sentence, add the neighbouring lines as context and tokenize.
# Produces three parallel lists: relation_encoding_list, relation_label_list
# and relation_type_list (concept type of the second entity).
#
# NOTE(review): this cell reads `relation_label_dict` and `bert_tokenizer`,
# which are defined in cells further down the notebook; on a fresh
# top-to-bottom run it fails with NameError unless those cells run first.
nsp_utility = BERT_utility()

relation_encoding_list = list()
relation_label_list = list()
relation_type_list = list()

no_rel_count = 0
rel_count = 0
# A set gives O(1) membership; the original list was scanned for every pair.
exist_rec_list = set()

all_relations_dict = {"treatment":['TrIP','TrWP','TrCP','TrAP','TrNAP','NoREL'], "test":['TeRP','TeCP','NoREL'], "problem":['PIP','NoREL']}

# Placeholder written over the second entity, keyed by the type recorded for it.
entity_placeholders = {"problem":"@related_problem", "test":"@test", "treatment":"@treatment"}


def _mask_mention(sentence, mention, placeholder):
    """Return `sentence` with every occurrence of `mention` replaced by `placeholder`.

    Matching is case-insensitive because the annotation text shown in
    concept_df is lower-cased; an empty mention leaves the sentence unchanged.
    NOTE(review): this is substring matching, so a short mention can also hit
    inside a longer word - same limitation as the str.replace it supersedes.
    """
    if not mention:
        return sentence
    return re.sub(re.escape(mention), lambda _match: placeholder, sentence, flags=re.IGNORECASE)


all_problems = concept_df[(concept_df['concept_type']=='problem') & (concept_df['file_name'].isin(training_files))]

for index,row in tqdm(all_problems.iterrows(), total=len(all_problems)):
    same_line_other_start = (
        (concept_df['file_name']==row['file_name'])
        & (concept_df['source']==row['source'])
        & (concept_df['line_number']==row['line_number'])
        & (concept_df['begin_word_num']!=row['begin_word_num'])
    )
    all_other_entities = concept_df[same_line_other_start]

    pair_prefix = row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"

    for entity_index,entity_row in all_other_entities.iterrows():

        # Skip a pair whose mirror image (entity first, problem second) was already emitted.
        key_to_check = pair_prefix+str(entity_row["begin_word_num"])+"#"+str(row["begin_word_num"])
        if(key_to_check in exist_rec_list):
            continue
        exist_rec_list.add(pair_prefix+str(row["begin_word_num"])+"#"+str(entity_row["begin_word_num"]))

        # A relation row matches when it lies on the same line and links the
        # two start positions in either direction.
        same_sentence = (
            (relation_df['file_name']==row['file_name'])
            & (relation_df['source']==row['source'])
            & (relation_df['from_line_number']==row['line_number'])
        )
        problem_to_entity = (
            (relation_df['from_begin_word_num']==row['begin_word_num'])
            & (relation_df['to_begin_word_num']==entity_row['begin_word_num'])
        )
        entity_to_problem = (
            (relation_df['from_begin_word_num']==entity_row['begin_word_num'])
            & (relation_df['to_begin_word_num']==row['begin_word_num'])
        )
        relation_df_record = relation_df[same_sentence & (problem_to_entity | entity_to_problem)]

        if(len(relation_df_record)==0):
            relation = 'NoREL'
        else:
            relation = relation_df_record.iloc[0]["relation_type"]

        # NOTE(review): the text lookup filters on file_name only, while the
        # concept/relation lookups also filter on source.
        all_lines = text_df[text_df['file_name']==row['file_name']].iloc[0]['text'].split("\n")

        # line_number is 1-based; neighbouring lines default to "" at the file edges.
        line_number = row['line_number']
        first_sentence = all_lines[line_number-1]
        prev_sentence = all_lines[line_number-2] if line_number>1 else ""
        next_sentence = all_lines[line_number] if line_number<len(all_lines) else ""

        # Anything that is not a problem or a test is treated as a treatment.
        entity_type = entity_row['concept_type']
        if(entity_type not in ('problem','test')):
            entity_type = 'treatment'

        # BUG FIX: str is immutable, so the original bare
        # `first_sentence.replace(...)` calls discarded their result and the
        # mentions were never masked. Keep the returned string.
        first_sentence = _mask_mention(first_sentence, row['text'], "@problem")
        first_sentence = _mask_mention(first_sentence, entity_row['text'], entity_placeholders[entity_type])

        # Prefer sentence plus both neighbours; drop context step by step when
        # the token count would exceed 511 (one slot is kept for the leading 101).
        final_sentence = prev_sentence + " . "+ first_sentence + ". " + next_sentence + " . "
        encodings = bert_tokenizer.encode(final_sentence, add_special_tokens=False)

        if(len(encodings)>511):
            final_sentence = prev_sentence + " . "+ first_sentence
            encodings = bert_tokenizer.encode(final_sentence, add_special_tokens=False)

            if(len(encodings)>511):
                final_sentence = first_sentence
                encodings = bert_tokenizer.encode(final_sentence, add_special_tokens=False)
                # NOTE(review): a single sentence longer than 511 tokens is
                # still passed through untruncated, as in the original.

        # 101 is the id this tokenizer emits first when special tokens are on.
        # NOTE(review): no closing 102 is appended - confirm the consumer adds it.
        final_encoding = [101]
        final_encoding.extend(encodings)

        # Append to all three parallel lists together so they cannot get out of step.
        relation_label_list.append(relation_label_dict[relation])
        relation_type_list.append(entity_type)
        relation_encoding_list.append(final_encoding)
        

15396it [04:28, 57.37it/s]


In [38]:
len(relation_label_list)

23206

In [60]:
len([x for x in ptr_relation_label_list if x != 0])

2828

In [40]:
def _pick_by_entity_type(values, wanted_type):
    """Keep the items of `values` whose parallel relation_type_list entry equals `wanted_type`."""
    return [value for position, value in enumerate(values) if relation_type_list[position]==wanted_type]


# problem-test pairs
pte_relation_label_list = _pick_by_entity_type(relation_label_list, 'test')
pte_relation_encoding_list = _pick_by_entity_type(relation_encoding_list, 'test')

# problem-treatment pairs
ptr_relation_label_list = _pick_by_entity_type(relation_label_list, 'treatment')
ptr_relation_encoding_list = _pick_by_entity_type(relation_encoding_list, 'treatment')

# problem-problem pairs
pp_relation_label_list = _pick_by_entity_type(relation_label_list, 'problem')
pp_relation_encoding_list = _pick_by_entity_type(relation_encoding_list, 'problem')


In [42]:
# Persist the problem-test split. `with` closes (and flushes) each file
# deterministically instead of relying on garbage collection of the handle.
with open("C:/Users/itsma/Documents/CS 6120 Project/pte_relation_labels.pkl","wb") as labels_file:
    pickle.dump(pte_relation_label_list, labels_file)
with open("C:/Users/itsma/Documents/CS 6120 Project/pte_relation_encodings.pkl","wb") as encodings_file:
    pickle.dump(pte_relation_encoding_list, encodings_file)

In [61]:
set(pte_relation_label_list)

{0, 1, 2}

In [43]:
# Persist the problem-treatment split. `with` closes (and flushes) each file
# deterministically instead of relying on garbage collection of the handle.
with open("C:/Users/itsma/Documents/CS 6120 Project/ptr_relation_labels.pkl","wb") as labels_file:
    pickle.dump(ptr_relation_label_list, labels_file)
with open("C:/Users/itsma/Documents/CS 6120 Project/ptr_relation_encodings.pkl","wb") as encodings_file:
    pickle.dump(ptr_relation_encoding_list, encodings_file)

In [44]:
# Persist the problem-problem split. `with` closes (and flushes) each file
# deterministically instead of relying on garbage collection of the handle.
with open("C:/Users/itsma/Documents/CS 6120 Project/pp_relation_labels.pkl","wb") as labels_file:
    pickle.dump(pp_relation_label_list, labels_file)
with open("C:/Users/itsma/Documents/CS 6120 Project/pp_relation_encodings.pkl","wb") as encodings_file:
    pickle.dump(pp_relation_encoding_list, encodings_file)

In [29]:
# Integer class id for each relation type; 'NoREL' (no annotated relation) is 0.
# Ids are reused across entity types (PIP, TeCP and TrAP are all 1; TeRP and
# TrCP are both 2), so they are only unambiguous after the samples have been
# split by entity type into the pte_/ptr_/pp_ lists.
# NOTE(review): this dict is read by the relation-encoding loop that appears
# earlier in the notebook; on a top-to-bottom run that loop raises NameError
# unless this cell is executed (or moved) before it.
relation_label_dict = {'NoREL':0, 'PIP':1, 'TeCP':1, 'TeRP':2, 'TrAP':1, 'TrCP':2, 'TrIP':3, 'TrNAP':4, 'TrWP':5}

In [30]:
set(relation_label_list)

set()

In [62]:
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from BERT_config import bert_config

In [63]:
bert_tokenizer = BertTokenizer.from_pretrained(bert_config['ncbi_base_path'])

In [54]:
temp = [101]

temp.extend(bert_tokenizer.encode("I like it."))
temp

[101, 101, 1045, 2066, 2009, 1012, 102]

In [76]:
from collections import Counter

In [78]:
Counter(rel_label_list).most_common(10)

[('NoREL', 15899),
 ('TeRP', 2352),
 ('TrAP', 1993),
 ('PIP', 1730),
 ('TrCP', 422),
 ('TeCP', 397),
 ('TrIP', 178),
 ('TrNAP', 133),
 ('TrWP', 102)]

In [75]:
# Collect the relation type name (or 'NoREL') for every de-duplicated
# problem/entity pair that shares a line in a training file.
nsp_utility = BERT_utility()

rel_label_list = list()

# A set gives O(1) membership; the original list was scanned for every pair.
exist_rec_list = set()

all_relations_dict = {"treatment":['TrIP','TrWP','TrCP','TrAP','TrNAP','NoREL'], "test":['TeRP','TeCP','NoREL'], "problem":['PIP','NoREL']}

all_problems = concept_df[(concept_df['concept_type']=='problem') & (concept_df['file_name'].isin(training_files))]

for index,row in all_problems.iterrows():
    same_line_other_start = (
        (concept_df['file_name']==row['file_name'])
        & (concept_df['source']==row['source'])
        & (concept_df['line_number']==row['line_number'])
        & (concept_df['begin_word_num']!=row['begin_word_num'])
    )
    all_other_entities = concept_df[same_line_other_start]

    pair_prefix = row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"

    for entity_index,entity_row in all_other_entities.iterrows():

        # Skip a pair whose mirror image (entity first, problem second) was already emitted.
        key_to_check = pair_prefix+str(entity_row["begin_word_num"])+"#"+str(row["begin_word_num"])
        if(key_to_check in exist_rec_list):
            continue
        exist_rec_list.add(pair_prefix+str(row["begin_word_num"])+"#"+str(entity_row["begin_word_num"]))

        # A relation row matches when it lies on the same line and links the
        # two start positions in either direction.
        same_sentence = (
            (relation_df['file_name']==row['file_name'])
            & (relation_df['source']==row['source'])
            & (relation_df['from_line_number']==row['line_number'])
        )
        problem_to_entity = (
            (relation_df['from_begin_word_num']==row['begin_word_num'])
            & (relation_df['to_begin_word_num']==entity_row['begin_word_num'])
        )
        entity_to_problem = (
            (relation_df['from_begin_word_num']==entity_row['begin_word_num'])
            & (relation_df['to_begin_word_num']==row['begin_word_num'])
        )
        relation_df_record = relation_df[same_sentence & (problem_to_entity | entity_to_problem)]

        if(len(relation_df_record)==0):
            relation = 'NoREL'
        else:
            relation = relation_df_record.iloc[0]["relation_type"]

        rel_label_list.append(relation)
        

In [None]:
relation_encoding_list = [entry[0] for entry in relation_encoding_list]

In [73]:
relation_encoding_list = [list(entry.detach().numpy()) for entry in relation_encoding_list]

In [None]:
relation_encoding_list = [list(np.array(entry)) for entry in relation_encoding_list]

In [None]:
relation_encoding_list = list(relation_encoding_list)

In [None]:
len([ entry for entry in relation_label_list if entry==1])

In [None]:
no_rel_count

In [None]:
rel_count

In [None]:
len(relation_df)

In [24]:
# Build token-level concept labels for every training file.
# label_list gets one list of class ids per text line; encoding_list gets the
# per-file encodings from the utility object, appended in the same file order.
# Only the concept-type keys of class_map are looked up below; unlabeled tokens
# get the literal 0 used to initialise `labels`.
class_map = {"blank":0,"problem":1,"test":2,"treatment":3}

encoding_list = list()
label_list = list()
utility = BERT_utility(get_encodings=True, get_embeddings=False)

for file_name in training_files:
    text = text_df[text_df['file_name']==file_name].iloc[0]['text']
    all_lines = text.split("\n")
    # NOTE(review): filtered by file_name only; the relation cells also filter
    # concept_df on `source` - confirm file names are unique across sources.
    file_concepts = concept_df[concept_df['file_name']==file_name]
    # "<line>:<word>" -> concept type for every annotated word of this file
    positions = create_pos_dict_concept(file_concepts)
    
    # not read anywhere in this cell
    prior_sentence_index = -1
    
    # Entries of word_list are read below via the keys 'sentence_index',
    # 'word_index' and 'bert_token_positions'.
    word_list = utility.process_string_finetune(text,0)
    # presumably one token-id list per line of `text`, refreshed by the call
    # above - TODO confirm against BERT_utility
    encoding = utility.encoding_list
    
    for i in range(len(all_lines)):
        # assumes encoding[i] belongs to all_lines[i] - TODO confirm
        labels = [0] * len(encoding[i])
        # sentence_index is compared against i+1, i.e. treated as 1-based
        fil_word_list = [word for word in word_list if word["sentence_index"]==i+1] 
        for entry in fil_word_list:
            key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
            if(key in positions):
                # give every sub-token position of the word the word's class id
                for token_position in entry["bert_token_positions"]:
                    labels[token_position] = class_map[positions[key]]
        label_list.append(labels)
    
    encoding_list.extend(encoding)

In [None]:
# Persist the file split and the token-level training data. `with` closes (and
# flushes) each file deterministically instead of relying on garbage collection.
# NOTE(review): absolute, machine-specific paths.
with open("C:/Users/itsma/Documents/CS 6120 Project/training_files.pkl","wb") as training_files_handle:
    pickle.dump(training_files, training_files_handle)
with open("C:/Users/itsma/Documents/CS 6120 Project/test_files.pkl","wb") as test_files_handle:
    pickle.dump(test_files, test_files_handle)
with open("C:/Users/itsma/Documents/CS 6120 Project/input_ids.pkl","wb") as input_ids_handle:
    pickle.dump(encoding_list, input_ids_handle)
with open("C:/Users/itsma/Documents/CS 6120 Project/label.pkl","wb") as label_handle:
    pickle.dump(label_list, label_handle)

In [None]:
# Persist the parsed DataFrames. `with` closes (and flushes) each file
# deterministically instead of relying on garbage collection of the handle.
# NOTE(review): absolute, machine-specific paths.
with open("C:/Users/itsma/Documents/CS 6120 Project/concept_df.pkl","wb") as concept_handle:
    pickle.dump(concept_df, concept_handle)
with open("C:/Users/itsma/Documents/CS 6120 Project/relation_df.pkl","wb") as relation_handle:
    pickle.dump(relation_df, relation_handle)
with open("C:/Users/itsma/Documents/CS 6120 Project/text_df.pkl","wb") as text_handle:
    pickle.dump(text_df, text_handle)

In [None]:
# Reload the saved token ids. `with` closes the file once loading finishes.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open("C:/Users/itsma/Documents/CS 6120 Project/input_ids.pkl","rb") as input_ids_handle:
    encoding_list = pickle.load(input_ids_handle)

In [None]:
training_files

In [61]:
# Persist the relation encodings and labels. `with` closes (and flushes) each
# file deterministically instead of relying on garbage collection of the handle.
with open("C:/Users/itsma/Documents/CS 6120 Project/relation_encodings.pkl","wb") as encodings_file:
    pickle.dump(relation_encoding_list, encodings_file)
with open("C:/Users/itsma/Documents/CS 6120 Project/relation_labels.pkl","wb") as labels_file:
    pickle.dump(relation_label_list, labels_file)

In [None]:
# Build token-level concept labels directly from the training .txt folders.
# label_list gets one list of class ids per text line; encoding_list gets the
# encodings returned for each file, appended in the same order.
#
# NOTE(review): `class_map` is defined in the training-files labelling cell and
# must exist before this cell runs.
# NOTE(review): this cell unpacks `(word_list, encoding)` from
# process_string_finetune, while the training-files labelling cell reads the
# encodings from `utility.encoding_list` - confirm which form BERT_utility
# currently supports.
encoding_list = list()
label_list = list()
utility = BERT_utility()

# (directory holding the raw .txt notes, value of the `source` column).
# The original had a third loop that repeated the partners loop verbatim and so
# appended every partners file twice; it is dropped here. If the test set was
# intended, add (test_data_texts_path, 'test_data').
text_dirs = [
    (os.path.join(beth_file_path, 'txt'), 'beth'),
    (os.path.join(partners_file_path, 'txt'), 'partners'),
]

for text_dir, source in text_dirs:
    for file in tqdm(os.listdir(text_dir)):
        if(not file.endswith(".txt")):
            continue
        file_path = os.path.join(text_dir, file)

        # Cut the suffix by length: file.strip(".txt") removes any leading or
        # trailing '.', 't' or 'x' characters rather than the extension.
        file_name = file[:-len(".txt")]

        with open(file_path, 'r') as oFile:
            line = oFile.read()
        all_lines = line.split("\n")

        file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']==source)]
        # The original called `create_pos_dict`, which is not defined in this
        # notebook; the concept variant is the helper that returns concept types.
        positions = create_pos_dict_concept(file_concepts)

        word_list, encoding = utility.process_string_finetune(line,0)

        for i in range(len(all_lines)):
            # assumes encoding[i] belongs to all_lines[i] - TODO confirm
            labels = [0] * len(encoding[i])
            # sentence_index is compared against i+1, i.e. treated as 1-based
            fil_word_list = [word for word in word_list if word["sentence_index"]==i+1]
            for entry in fil_word_list:
                key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])
                if(key in positions):
                    # give every sub-token position of the word the word's class id
                    for token_position in entry["bert_token_positions"]:
                        labels[token_position] = class_map[positions[key]]
            label_list.append(labels)

        encoding_list.extend(encoding)

In [65]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

In [34]:
len(all_files)

426

In [33]:
tokenizer.convert_ids_to_tokens(tokenizer.encode("Patient recorded as having known allergies in response to drug."))

['[CLS]',
 'patient',
 'recorded',
 'as',
 'having',
 'known',
 'all',
 '##er',
 '##gies',
 'in',
 'response',
 'to',
 'drug',
 '.',
 '[SEP]']

In [66]:
encodings = tokenizer.encode("Hello, my dog is cute",add_special_tokens = True)

In [None]:
encodings

In [None]:
[0,1,0,2,0,3,1,0]

In [67]:
tokenizer.convert_ids_to_tokens(tokenizer.encode("MRI Scan showed white spots in brain and he was administered Benfotiamine."))

['[CLS]',
 'mri',
 'scan',
 'showed',
 'white',
 'spots',
 'in',
 'brain',
 'and',
 'he',
 'was',
 'administered',
 'ben',
 '##fo',
 '##tia',
 '##mine',
 '.',
 '[SEP]']

In [None]:
input_ids = torch.tensor(encodings).long().unsqueeze(0)
        
outputs = model(input_ids,token_type_ids=None)

In [None]:
len(outputs[0][0][5].data.numpy())

In [None]:
# `imp` is deprecated and no longer available on recent Python versions;
# importlib provides the same `reload` function.
from importlib import reload

In [None]:
import BERT_utility

In [None]:
reload(BERT_utility)

In [None]:
utility = BERT_utility()
#word_list = utility.process_string_finetune(line,0)

In [35]:
# Collect one record per word of every training file and tag it with the
# concept / assertion annotation found at its "sentence_index:word_index" key.
all_words_list = []

utility = BERT_utility(get_embeddings=True, get_encodings=False, use_finetuned_model=True)

for file_name in tqdm(training_files):
    text = text_df[text_df['file_name'] == file_name].iloc[0]['text']
    all_lines = text.split("\n")

    # Position -> label lookups restricted to the current file.
    file_concepts = concept_df[concept_df['file_name'] == file_name]
    file_assertions = assertion_df[assertion_df['file_name'] == file_name]
    concept_positions = create_pos_dict_concept(file_concepts)
    assertion_positions = create_pos_dict_assertion(file_assertions)

    word_list = utility.process_string_finetune(text, 0)

    for entry in word_list:
        key = "{}:{}".format(entry['sentence_index'], entry['word_index'])

        # Words outside any annotated concept are labelled "blank".
        concept = concept_positions[key] if key in concept_positions else "blank"

        # Only 'problem' concepts carry an assertion; everything else is "blank".
        if concept == 'problem' and key in assertion_positions:
            assertion = assertion_positions[key]
        else:
            assertion = "blank"

        entry.update({"concept": concept, "assertion": assertion})

    all_words_list.extend(word_list)

100%|████████████████████████████████████████████████████████████████████████████████| 340/340 [24:29<00:00,  4.32s/it]


In [36]:
# Same word-level tagging as for the training files, applied to the held-out
# test files; the results are accumulated in `all_words_list_test`.
all_words_list_test = []

utility = BERT_utility(get_embeddings=True, get_encodings=False, use_finetuned_model=True)

for file_name in tqdm(test_files):
    text = text_df[text_df['file_name'] == file_name].iloc[0]['text']
    all_lines = text.split("\n")

    # Position -> label lookups restricted to the current file.
    file_concepts = concept_df[concept_df['file_name'] == file_name]
    file_assertions = assertion_df[assertion_df['file_name'] == file_name]
    concept_positions = create_pos_dict_concept(file_concepts)
    assertion_positions = create_pos_dict_assertion(file_assertions)

    word_list = utility.process_string_finetune(text, 0)

    for entry in word_list:
        key = "{}:{}".format(entry['sentence_index'], entry['word_index'])

        # Words outside any annotated concept are labelled "blank".
        concept = concept_positions[key] if key in concept_positions else "blank"

        # Only 'problem' concepts carry an assertion; everything else is "blank".
        if concept == 'problem' and key in assertion_positions:
            assertion = assertion_positions[key]
        else:
            assertion = "blank"

        entry.update({"concept": concept, "assertion": assertion})

    all_words_list_test.extend(word_list)

100%|██████████████████████████████████████████████████████████████████████████████████| 86/86 [07:20<00:00,  5.13s/it]


In [37]:
word_df = pd.DataFrame(all_words_list)

In [38]:
word_df.columns

Index(['word', 'keyword_vector', 'sentence_index', 'word_index', 'concept',
       'assertion'],
      dtype='object')

In [39]:
filt_df = word_df[word_df['concept']=='problem']

In [40]:
X_assertion = np.vstack(list(filt_df["keyword_vector"]))                                
y_assertion =list(filt_df["assertion"]) 

In [41]:
X_concept = np.vstack(list(word_df["keyword_vector"]))                                
y_concept =list(word_df["concept"]) 

In [42]:
clf_concept = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_concept, y_concept)



In [43]:
clf_concept.score(X_concept, y_concept)

0.9954334855504816

In [47]:
clf_assertion = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_assertion, y_assertion)



In [48]:
clf_assertion.score(X_assertion, y_assertion)

0.9529860570077641

In [44]:
word_df_test = pd.DataFrame(all_words_list_test)

In [45]:
X_concept_test = np.vstack(list(word_df_test["keyword_vector"]))                                
y_concept_test =list(word_df_test["concept"]) 

In [46]:
clf_concept.score(X_concept_test, y_concept_test)

0.971349428031664

In [None]:

f1_score(y_concept_test,y_predict_concept,average='macro')

In [55]:
y_predict_concept = clf_concept.predict(X_concept_test)
f1_score(y_concept_test,y_predict_concept,average=None, labels = ['problem','test','treatment'])

array([0.93780271, 0.9370384 , 0.93038412])

In [57]:
from sklearn.metrics import precision_score, recall_score

In [59]:
print(precision_score(y_concept_test,y_predict_concept,average=None, labels = ['problem','test','treatment']))
print(recall_score(y_concept_test,y_predict_concept,average=None, labels = ['problem','test','treatment']))

[0.93354974 0.9461223  0.93713945]
[0.94209461 0.92812729 0.92372549]


In [50]:
filt_df_test = word_df_test[word_df_test['concept']=='problem']

X_assertion_test = np.vstack(list(filt_df_test["keyword_vector"]))                                
y_assertion_test =list(filt_df_test["assertion"])

In [51]:
clf_assertion.score(X_assertion_test, y_assertion_test)

0.9252336448598131

In [52]:
y_predict_assertion = clf_assertion.predict(X_assertion_test)

In [60]:
f1_score(y_assertion_test, y_predict_assertion, average=None,labels=labels_assertion)

array([0.8249497 , 0.26666667, 0.65      , 0.61139896, 0.95643478,
       0.91396794])

In [61]:
print(precision_score(y_assertion_test, y_predict_assertion, average=None,labels=labels_assertion))
print(recall_score(y_assertion_test, y_predict_assertion, average=None,labels=labels_assertion))

[0.89519651 0.61538462 0.79591837 0.70574163 0.93808239 0.93071286]
[0.76492537 0.17021277 0.54929577 0.5393053  0.97551957 0.89781491]


In [54]:
# Fixed label order used to index the rows and columns of the confusion matrix
# (and the per-class scores computed with the same list).
labels_assertion = ['hypothetical',
 'conditional',
 'associated_with_someone_else',
 'possible',
 'present',
 'absent']

# `labels` must be passed by keyword: newer scikit-learn releases reject it as a
# positional argument, while older releases accept the keyword form as well.
confusion_matrix(y_assertion_test,y_predict_assertion,labels=labels_assertion)

array([[ 205,    0,    0,    1,   44,   18],
       [   2,   16,    0,    0,   76,    0],
       [   0,    0,   39,    0,   26,    6],
       [   8,    0,    3,  295,  225,   16],
       [  13,    9,    1,  105, 7651,   64],
       [   1,    1,    6,   17,  134, 1397]], dtype=int64)

In [None]:
# Tag every word of every Partners note with its concept / assertion label and
# append the resulting records to `all_words_list`.
for file in tqdm(os.listdir(partners_file_path+r'\txt')):
    if not file.endswith(".txt"):
        continue
    file_path = os.path.join(partners_file_path+r'\txt', file)
    # str.strip(".txt") removes any leading/trailing '.', 't' or 'x' characters
    # (e.g. "test.txt" -> "es"), so drop the extension explicitly instead.
    file_name = os.path.splitext(file)[0]
    # Context manager closes the handle; the original leaked one per file.
    with open(file_path, 'r') as oFile:
        line = oFile.read()

    all_lines = line.split("\n")
    file_concepts = concept_df[(concept_df['file_name']==file_name)&(concept_df['source']=='partners')]
    concept_positions = create_pos_dict_concept(file_concepts)

    file_assertions = assertion_df[(assertion_df['file_name']==file_name)&(assertion_df['source']=='partners')]
    assertion_positions = create_pos_dict_assertion(file_assertions)

    word_list = utility.process_string_finetune(line,0)

    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])

        # Words outside any annotated concept are labelled "blank".
        concept = concept_positions[key] if key in concept_positions else "blank"

        # Only 'problem' concepts carry an assertion; everything else is "blank".
        if concept == 'problem' and key in assertion_positions:
            assertion = assertion_positions[key]
        else:
            assertion = "blank"

        entry.update({"concept": concept, "assertion": assertion})

    all_words_list.extend(word_list)

In [None]:
len(all_words_list)

In [None]:
words_df = pd.DataFrame(all_words_list)

In [None]:
len(words_df[words_df['concept']=='problem'])

In [None]:
set(list(words_df['concept']))

In [None]:
test_dict = {"a":"1", "b":"2"}

In [None]:
X_concept = np.vstack(list(words_df["keyword_vector"]))                                
y_concept = words_df["concept"]  

In [None]:
X_assertion = np.vstack(list(filt_df["keyword_vector"]))                                
y_assertion =list(filt_df["assertion"]) 

In [None]:
filt_df = words_df[words_df['assertion']!='blank'].copy()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf_concept = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_concept, y_concept)

In [None]:
# Sanity check: how many entries of y_assertion are not one of the six known
# assertion classes.
known_assertions = ['absent', 'associated_with_someone_else', 'conditional', 'hypothetical', 'possible', 'present']
count = sum(1 for i in range(len(y_assertion)) if y_assertion[i] not in known_assertions)
print(count)

In [None]:
y_assertion

In [None]:
clf_concept.score(X_concept,y_concept)

In [None]:
np.any(np.isnan(X_assertion))

In [None]:
clf_assertion = LogisticRegression(random_state=0,solver="lbfgs",max_iter=1000).fit(X_assertion, y_assertion)

In [None]:
clf_assertion.score(X_assertion,y_assertion)

In [None]:
# Parse every concept annotation file of the test reference standard into a
# flat list of concept dicts tagged with source 'test_data'.
test_data_file_path = os.path.dirname(os.getcwd()) + r'\Data\reference_standard_for_test_data' 

list_of_all_test_concepts = list()

for file in os.listdir(test_data_file_path+r'\concepts'):
    file_path = os.path.join(test_data_file_path+r'\concepts', file)
    # str.strip(".con") removes any leading/trailing '.', 'c', 'o' or 'n'
    # characters (e.g. "note.con" -> "te"); cut off only the real suffix.
    file_name = file[:-len(".con")] if file.endswith(".con") else file
    list_of_all_test_concepts.extend(get_concepts_from_file(file_path,file_name,'test_data'))

In [None]:
test_concept_df = pd.DataFrame(list_of_all_test_concepts)

In [None]:
test_concept_df.head()

In [None]:
# Parse every assertion annotation file of the test reference standard into a
# flat list of assertion dicts tagged with source 'test_data'.
test_data_file_path = os.path.dirname(os.getcwd()) + r'\Data\reference_standard_for_test_data'
assertion_dir = test_data_file_path + r'\ast'

list_of_all_test_assertions = []

for file in os.listdir(assertion_dir):
    file_path = os.path.join(assertion_dir, file)
    # The last four characters of the file name are dropped to form the note name.
    parsed_assertions = get_assertions_from_file(file_path, file[:-4], 'test_data')
    list_of_all_test_assertions.extend(parsed_assertions)

In [None]:
test_assertion_df = pd.DataFrame(list_of_all_test_assertions)

In [None]:
test_assertion_df.head()

In [None]:
# Tag every word of every test-data note with its concept / assertion label
# (taken from the test reference standard) and collect the records in
# `all_words_list_test`.
all_words_list_test = list()
test_data_texts_path =  os.path.dirname(os.getcwd()) + r'\Data\test_data'
for file in tqdm(os.listdir(test_data_texts_path)):
    if not file.endswith(".txt"):
        continue
    file_path = os.path.join(test_data_texts_path, file)

    # str.strip(".txt") removes any leading/trailing '.', 't' or 'x' characters
    # (e.g. "test.txt" -> "es"), so drop the extension explicitly instead.
    file_name = os.path.splitext(file)[0]

    # Context manager closes the handle; the original leaked one per file.
    with open(file_path, 'r') as oFile:
        line = oFile.read()

    all_lines = line.split("\n")

    file_concepts = test_concept_df[(test_concept_df['file_name']==file_name)&(test_concept_df['source']=='test_data')]
    concept_positions = create_pos_dict_concept(file_concepts)

    file_assertions = test_assertion_df[(test_assertion_df['file_name']==file_name)&(test_assertion_df['source']=='test_data')]
    assertion_positions = create_pos_dict_assertion(file_assertions)

    word_list = utility.process_string_finetune(line,0)

    for entry in word_list:
        key = str(entry['sentence_index'])+ ":" +str(entry['word_index'])

        # Words outside any annotated concept are labelled "blank".
        concept = concept_positions[key] if key in concept_positions else "blank"

        # Only 'problem' concepts carry an assertion; everything else is "blank".
        if concept == 'problem' and key in assertion_positions:
            assertion = assertion_positions[key]
        else:
            assertion = "blank"

        entry.update({"concept": concept, "assertion": assertion})

    all_words_list_test.extend(word_list)

In [None]:
test_word_df = pd.DataFrame(all_words_list_test)

In [None]:
test_filt_df = test_word_df[test_word_df['assertion']!='blank'].copy()

In [None]:
X_test_concept = np.vstack(list(test_word_df["keyword_vector"]))                                
y_test_concept = test_word_df["concept"]  

In [None]:
X_test_assertion = np.vstack(list(test_filt_df["keyword_vector"]))                                
y_test_assertion = test_filt_df["assertion"]  

In [None]:
clf_assertion.score(X_test_assertion,y_test_assertion)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
y_true = y_test
y_predict = clf.predict(X_test)

In [None]:
y_predict_assertion = clf_assertion.predict(X_test_assertion)

In [None]:
f1_score(y_test_assertion, y_predict_assertion, average='macro')

In [None]:
labels_assertion = ['hypothetical',
 'conditional',
 'associated_with_someone_else',
 'possible',
 'present',
 'absent']

In [None]:
labels = ['blank', 'problem', 'test', 'treatment']

In [None]:
# `labels` must be passed by keyword: newer scikit-learn releases reject it as a
# positional argument, while older releases accept the keyword form as well.
confusion_matrix(y_test_assertion,y_predict_assertion,labels=labels_assertion)

In [None]:
"{1} can be treated {0}".format("Tylenol","headache")

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_true, y_predict, average='micro')

In [None]:
[0]*5

In [None]:
tokenizer.convert_ids_to_tokens([101, 2381, 1997, 2556, 7355, 1024, 102])

In [None]:
label_list[87]

In [None]:
encoding_list[87]

In [None]:
len(encoding_list)

In [None]:
import pickle

In [None]:
# Load the pickled fine-tuned model. The context manager closes the file handle
# (the original left it open).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("C:/Users/itsma/Documents/CS 6120 Project/CS6120/Model/finetuned_model.pkl","rb") as model_file:
    finetuned_model = pickle.load(model_file)

In [None]:
encodings = tokenizer.encode("Hello, my dog is cute",add_special_tokens = True)

In [None]:
finetuned_model.cpu()

In [None]:
input_ids = torch.tensor(encodings).long().unsqueeze(0)
        
outputs = finetuned_model(input_ids,token_type_ids=None)

In [None]:
filt = [entry for entry in label_list if 3 in entry]

In [None]:
# Load the pickled fine-tuned model and move it to the CPU. The context manager
# closes the file handle (the original left it open).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("C:/Users/itsma/Documents/CS 6120 Project/CS6120/Model/finetuned_model.pkl","rb") as model_file:
    bert_model = pickle.load(model_file)
bert_model.cpu()

In [None]:
import pickle

In [None]:
encodings = bert_tokenizer.encode("i love my country",add_special_tokens = True)

In [None]:
from transformers import BertTokenizer, BertConfig

from BERT_config import bert_config
import torch

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained(bert_config['ncbi_base_path'])

In [None]:
input_ids = torch.tensor(encodings).long().unsqueeze(0)

outputs = bert_model(input_ids,token_type_ids=None)

In [None]:
len(outputs[1][12][0])

In [None]:
input_ids

In [3]:
# Load the pickled relation model and move it to the CPU. The context manager
# closes the file handle (the original left it open).
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("C:/Users/itsma/Documents/CS 6120 Project/CS6120/Model/relation_finetuned.pkl","rb") as model_file:
    relation_model = pickle.load(model_file)
relation_model.cpu()

BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [6]:
relation_model = relation_model.to('cuda')
relation_model = relation_model.eval()

In [9]:
nsp_utility = BERT_utility()

In [7]:
from keras.preprocessing.sequence import pad_sequences

In [10]:
temp_ip  = torch.tensor(pad_sequences([nsp_utility.get_embeddings_for_nsp("i like", "you like")],maxlen=512, dtype="long", 
                          value=0, truncating="post", padding="post")).long()

In [11]:
temp_ip = temp_ip.to('cuda')

In [12]:
temp_op = relation_model(temp_ip)

In [15]:
del temp_op

In [16]:
# Snapshot of GPU 0 memory: total device memory, memory held by PyTorch's
# caching allocator, and memory currently occupied by tensors.
t = torch.cuda.get_device_properties(0).total_memory
# torch.cuda.memory_cached is deprecated in favour of memory_reserved; fall back
# to the old name only on PyTorch builds that do not provide the new one.
_memory_reserved = getattr(torch.cuda, "memory_reserved", None) or torch.cuda.memory_cached
c = _memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = c-a  # free inside cache

In [20]:
relation_model_cp = relation_model

In [21]:
del relation_model

In [22]:
temp_op = relation_model_cp(temp_ip)

In [13]:
op_1 = temp_op[0].detach().cpu().numpy()

In [None]:
len(temp_op[1][12][0][0])

In [None]:
# Build next-sentence-prediction style examples for every problem concept in
# the test files. For each (problem, other concept on the same line) pair the
# line text is the first sentence; the templates in `sentence_map` for the
# annotated relation produce positive second sentences (label 1) and the
# templates of the remaining relations produce negative ones (label 0).
nsp_utility = BERT_utility()

relation_encoding_list = list()
relation_embedding_list = list()
relation_label_list = list()

no_rel_count = 0
rel_count = 0
# Keys of the pairs already emitted. A set gives constant-time lookups; the
# original list made every membership test scan all previously seen pairs.
exist_rec_list = set()

all_relations_dict = {"treatment":['TrIP','TrWP','TrCP','TrAP',"TrNRP"], "test":['TeRP','TeCP','TeNRP'], "problem":['PIP','PNP']}
no_relations_dict = {"treatment":"TrNRP", "test":"TeNRP", "problem":"PNP"}

all_problems = concept_df[(concept_df['concept_type']=='problem') & (concept_df['file_name'].isin(test_files))]

# iterrows() has no length, so pass the total for a meaningful progress bar.
for index,row in tqdm(all_problems.iterrows(), total=len(all_problems)):
    # Every other annotated concept on the same line of the same file/source.
    all_other_entities = concept_df[
        (concept_df['file_name']==row['file_name'])
        &(concept_df['source']==row['source'])
        &(concept_df['line_number']==row['line_number'])
        &(concept_df['begin_word_num']!=row['begin_word_num'])
    ]

    for entity_index,entity_row in all_other_entities.iterrows():
        pair_prefix = row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"

        # Skip the pair if it was already processed in the opposite direction.
        key_to_check = pair_prefix+str(entity_row["begin_word_num"])+"#"+str(row["begin_word_num"])
        if key_to_check in exist_rec_list:
            continue
        exist_rec_list.add(pair_prefix+str(row["begin_word_num"])+"#"+str(entity_row["begin_word_num"]))

        # Annotated relation between the two concepts, in either direction.
        same_line = (
            (relation_df['file_name']==row['file_name'])
            &(relation_df['source']==row['source'])
            &(relation_df['from_line_number']==row['line_number'])
        )
        forward = (relation_df['from_begin_word_num']==row['begin_word_num']) & (relation_df['to_begin_word_num']==entity_row['begin_word_num'])
        backward = (relation_df['from_begin_word_num']==entity_row['begin_word_num']) & (relation_df['to_begin_word_num']==row['begin_word_num'])
        relation_df_record = relation_df[same_line & (forward | backward)]

        if len(relation_df_record)==0:
            # No annotation: use the "no relation" class of the entity's concept type.
            relation = no_relations_dict[entity_row["concept_type"]]
        else:
            relation = relation_df_record.iloc[0]["relation_type"]

        remaining_relations = [entry for entry in all_relations_dict[entity_row['concept_type']] if entry != relation]

        # line_number is used as a 1-based index into the file's lines.
        first_sentence = text_df[text_df['file_name']==row['file_name']].iloc[0]['text'].split("\n")[row['line_number']-1]

        for entry in sentence_map[relation]:
            second_sentence = entry.format(entity_row['text'],row['text'])

            relation_encoding_list.append(nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence))
            relation_label_list.append(1)

        for no_relation in remaining_relations:
            for entry in sentence_map[no_relation]:
                second_sentence = entry.format(entity_row['text'],row['text'])

                relation_encoding_list.append(nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence))
                relation_label_list.append(0)

In [None]:
len(concept_df[(concept_df['concept_type']=='problem') & (concept_df['file_name'].isin(training_files))])

In [None]:
relation_model.get_output_embeddings()

In [None]:
relation_model = relation_model.to('cuda')

In [None]:
device = torch.device("cuda")

In [None]:
# Work-in-progress cell: for every problem concept in the training files, pair
# it with each other concept on the same line, look up the annotated relation
# and (in the active code at the bottom) run the pair through `relation_model`
# to collect an encoding, an embedding and a label.
# NOTE(review): this cell cannot run as written - see the notes at `if()` below.
nsp_utility = BERT_utility()

relation_encoding_list = list()
relation_embedding_list = list()
relation_label_list = list()

# NOTE(review): no_rel_count / rel_count are never updated in this cell.
no_rel_count = 0
rel_count = 0
# Keys of the pairs that were already processed (see key_to_check below).
exist_rec_list = list()

all_relations_dict = {"treatment":['TrIP','TrWP','TrCP','TrAP',"TrNRP"], "test":['TeRP','TeCP','TeNRP'], "problem":['PIP','PNP']}
no_relations_dict = {"treatment":"TrNRP", "test":"TeNRP", "problem":"PNP"}

all_problems = concept_df[(concept_df['concept_type']=='problem') & (concept_df['file_name'].isin(training_files))]

for index,row in tqdm(all_problems.iterrows()):
    # Every other annotated concept on the same line of the same file/source.
    all_other_entities = concept_df[(concept_df['file_name']==row['file_name'])&(concept_df['source']==row['source'])&(concept_df['line_number']==row['line_number'])&(concept_df['begin_word_num']!=row['begin_word_num'])]
    
    for entity_index,entity_row in all_other_entities.iterrows():
        
        # The lookup key has the two word positions swapped relative to the key
        # that is stored, so a pair is skipped once it was seen in the other order.
        key_to_check = row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"+str(entity_row["begin_word_num"])+"#"+str(row["begin_word_num"])
        
        if(key_to_check in exist_rec_list):
            continue
        else:
            exist_rec_list.append(row["source"]+"#"+row["file_name"]+"#"+str(row["line_number"])+"#"+str(row["begin_word_num"])+"#"+str(entity_row["begin_word_num"]))
        
        # Annotated relation between the two concepts, in either direction.
        relation_df_record = relation_df[(relation_df['file_name']==row['file_name'])&(relation_df['source']==row['source'])&(relation_df['from_line_number']==row['line_number'])&(((relation_df['from_begin_word_num']==row['begin_word_num']) & (relation_df['to_begin_word_num']==entity_row['begin_word_num'])) |((relation_df['from_begin_word_num']==entity_row['begin_word_num']) & (relation_df['to_begin_word_num']==row['begin_word_num'])))]
        
        if(len(relation_df_record)==0):
            relation = "NoREL"
        else:
            relation = relation_df_record.iloc[0]["relation_type"]
        
        
        #remaining_relations = [entry for entry in all_relations_dict[entity_row['concept_type']] if entry != relation]
        
        # line_number is used as a 1-based index into the file's lines.
        first_sentence = text_df[text_df['file_name']==row['file_name']].iloc[0]['text'].split("\n")[row['line_number']-1]
        
        # The triple-quoted string below is an earlier version of this step that
        # was disabled by turning it into a string literal; it is never executed.
        '''
        for entry in sentence_map[relation]:
            second_sentence = entry.format(entity_row['text'],row['text'])
            
            encoding = nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence)
            
            input_ids = pad_sequences([encoding],maxlen=512, dtype="long", 
                          value=0, truncating="post", padding="post")
            
            input_tensor = torch.tensor(input_ids).long()
            
            input_tensor = input_tensor.to('cuda')

            with torch.no_grad():
                embedding = relation_model(input_tensor)
            
            input_tensor = input_tensor.to('cpu')
            
            relation_encoding_list.append(encoding)
            relation_embedding_list.append(embedding[1][12][0][0])
            relation_label_list.append(1)
        
        for no_relation in remaining_relations:
            for entry in sentence_map[no_relation]:
                second_sentence = entry.format(entity_row['text'],row['text'])
                
                encoding = nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence)
            
                input_ids = pad_sequences([encoding],maxlen=512, dtype="long", 
                              value=0, truncating="post", padding="post")

                input_tensor = torch.tensor(input_ids).long()

                input_tensor = input_tensor.to('cuda')
                
                with torch.no_grad():
                    embedding = relation_model(input_tensor, token_type_ids=None)
                    
                input_tensor = input_tensor.to('cpu')
                
                relation_encoding_list.append(encoding)
                relation_embedding_list.append(embedding[1][12][0][0])
                relation_label_list.append(0)
        '''
        
        # NOTE(review): `if()` has no condition, colon or body, so the whole cell
        # is a SyntaxError. The intended condition is not recoverable from here.
        # NOTE(review): `entry` is not assigned anywhere in the active code of
        # this cell (only inside the disabled string above) - TODO confirm which
        # sentence template is meant to be used.
        if()
        second_sentence = entry.format(entity_row['text'],row['text'])
                
        encoding = nsp_utility.get_embeddings_for_nsp(first_sentence,second_sentence)

        # Pad / truncate the single encoding to 512 ids before building the tensor.
        input_ids = pad_sequences([encoding],maxlen=512, dtype="long", 
                      value=0, truncating="post", padding="post")

        input_tensor = torch.tensor(input_ids).long()

        input_tensor = input_tensor.to('cuda')

        # Forward pass without gradient tracking.
        with torch.no_grad():
            embedding = relation_model(input_tensor, token_type_ids=None)

        input_tensor = input_tensor.to('cpu')

        relation_encoding_list.append(encoding)
        # presumably the first-token vector of the last hidden layer - TODO confirm
        # against the model's output layout.
        relation_embedding_list.append(embedding[1][12][0][0])
        # NOTE(review): the label is always 0 here; `relation` computed above is unused.
        relation_label_list.append(0)

In [None]:
len(relation_label_list)

In [None]:
relation_encoding_list_test = torch.tensor(pad_sequences(relation_encoding_list[0:100],maxlen=512, dtype="long", 
                          value=0, truncating="post", padding="post")).long()

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
import torch

In [None]:
relation_model.cpu()
outputs = relation_model(relation_encoding_list_test)

In [None]:
np_op = outputs[0].detach().numpy()[0]

In [None]:
list(np_op).index(max(np_op))

In [None]:
# Predict the class of the first 100 examples referenced by `false_index`, one
# padded sequence at a time, and collect the arg-max class per example.
predicted = list()
for i in tqdm(false_index[0:100]):
    relation_encoding_test = torch.tensor(pad_sequences([relation_encoding_list[i]],maxlen=512, dtype="long", 
                          value=0, truncating="post", padding="post")).long()

    # Inference only: skip autograd bookkeeping instead of building a graph per call.
    with torch.no_grad():
        outputs = relation_model(relation_encoding_test)
    np_op = outputs[0].detach().numpy()[0]
    # Index of the highest score (first one on ties, like list.index(max(...))).
    predicted.append(int(np.argmax(np_op)))

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(true_labels, predicted)

In [None]:
# Pick the device used for model inference: the GPU when one is available,
# otherwise the CPU, so that `device` is always defined after this cell.
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

else:
    # The original left `device` undefined on machines without a GPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

In [None]:
# Positions of the positive (label 1) and negative (label 0) relation examples.
true_index = [position for position, label in enumerate(relation_label_list) if label == 1]
false_index = [position for position, label in enumerate(relation_label_list) if label == 0]

In [None]:
len(true_index)

In [None]:
true_labels = [relation_label_list[i] for i in false_index[0:100]]

In [None]:
predicted

In [23]:
import pickle

In [30]:
# Load the pickled relation data. pd.read_pickle opens and closes the file
# itself and loads any pickled object.
# NOTE(review): the recorded run of the raw pickle.load version failed with an
# AttributeError; read_pickle applies pandas' unpickling compatibility handling,
# which may resolve it - confirm against the pandas version that wrote the file.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
all_rel_data = pd.read_pickle("C:/Users/itsma/Documents/CS 6120 Project/rel_data_df.pkl")

AttributeError: 'NoneType' object has no attribute 'axes'

In [26]:
from sklearn.linear_model import LogisticRegression

In [None]:
X_rel_vec = np.vstack(list(test_word_df["keyword_vector"]))                                
y_test_concept = test_word_df["concept"]