*package_leaflets_dataset_final.ipynb* from EZ --- creating json files in bioleaflets/dataset/test|train|valid   




## JSON FILE

            

    leaflet_content = {      
            'ID': leaflet.id,    
            'URL': leaflet.url,    
            'Product_Name': leaflet.product_name,    
            'Full_Content': leaflet.content,    

            'Section_1': {
                'Title': leaflet.section1.title,
                'Section_Content': leaflet.section1.section_content,
                'Entity_Recognition': leaflet.section1.entity_recognition
            },

            'Section_2': {
                'Title': leaflet.section2.title,
                'Section_Content': leaflet.section2.section_content,
                'Entity_Recognition': leaflet.section2.entity_recognition
            },

            'Section_3': {
                'Title': leaflet.section3.title,
                'Section_Content': leaflet.section3.section_content,
                'Entity_Recognition': leaflet.section3.entity_recognition
            },

            'Section_4': {
                'Title': leaflet.section4.title,
                'Section_Content': leaflet.section4.section_content,
                'Entity_Recognition': leaflet.section4.entity_recognition
            },

            'Section_5': {
                'Title': leaflet.section5.title,
                'Section_Content': leaflet.section5.section_content,
                'Entity_Recognition': leaflet.section5.entity_recognition
            },

            'Section_6': {
                'Title': leaflet.section6.title,
                'Section_Content': leaflet.section6.section_content,
                'Entity_Recognition': leaflet.section6.entity_recognition
            }
        }


In [1]:
import json
import os
import numpy as np

In [2]:
with open('bioleaflets/dataset/test/0A1B8116D0DEC288D3F1CEBA70918447.json') as json_file:
    data = json.load(json_file)

In [3]:
data.keys()

dict_keys(['ID', 'URL', 'Product_Name', 'Full_Content', 'Section_1', 'Section_2', 'Section_3', 'Section_4', 'Section_5', 'Section_6'])

In [4]:
data['ID']

'0A1B8116D0DEC288D3F1CEBA70918447'

In [5]:
data['Product_Name']

'Erleada'

In [6]:
data['Section_1'].keys()

dict_keys(['Title', 'Section_Content', 'Entity_Recognition'])

In [7]:
data['Section_1']['Title']

'1. what erleada is and what it is used for'

In [8]:
data['Section_1']['Section_Content']

'erleada is a cancer medicine that contains the active substance apalutamide. it is used to treat adult men with prostate cancer that: has metastasised to other parts of the body and still responds to medical or surgical treatments that lower testosterone (also called hormone-sensitive prostate cancer). has not metastasised to other parts of the body and no longer responds to medical or surgical treatment that lowers testosterone (also called castration-resistant prostate cancer). erleada works by blocking the activity of hormones called androgens (such as testosterone). androgens can cause the cancer to grow. by blocking the effect of androgens, apalutamide stops prostate cancer cells from growing and dividing.'

---------------------------------------------

### Read all json files

In [2]:

def read_dataset(path='bioleaflets/dataset/test/'):
    
    dataset_array = []
    
    for filename in os.listdir(path):

        # path to the particular file
        path2file = path + filename
        
        # read file
        with open(path2file) as json_file:
            test_file = json.load(json_file)
        
        # use less RAM by setting full_content = 0
        test_file['Full_Content'] = None
        
        # save to array
        dataset_array.append(test_file)
    
    return dataset_array

In [3]:
test_dataset = read_dataset('bioleaflets/dataset/test/')
val_dataset = read_dataset('bioleaflets/dataset/valid/')
train_dataset = read_dataset('bioleaflets/dataset/train/')

In [4]:
assert len(test_dataset) == 134
assert len(val_dataset) == 134
assert len(train_dataset) == 1068


## Check quiality

In [5]:
whole_dataset = test_dataset + val_dataset + train_dataset

len(whole_dataset)

1336

### Count None values in each section type

In [6]:
def count_sections_None(dataset):
    """
    check for NONE in section_content and/or NER_output
    """
    
    section_count_None = {'section_1_none': 0,
                     'section_2_none': 0,
                     'section_3_none': 0,
                     'section_4_none': 0,
                     'section_5_none': 0,
                     'section_6_none': 0
                    }
    
    NER_count_None = {'section_1_NER_none': 0,
                     'section_2_NER_none': 0,
                     'section_3_NER_none': 0,
                     'section_4_NER_none': 0,
                     'section_5_NER_none': 0,
                     'section_6_NER_none': 0
                    }
    
    for file in dataset:
        
        ### section contents
        if file['Section_1']['Section_Content'] is None:
            section_count_None['section_1_none'] += 1
        
        if file['Section_2']['Section_Content'] is None:
            section_count_None['section_2_none'] += 1
        
        if file['Section_3']['Section_Content'] is None:
            section_count_None['section_3_none'] += 1
        
        if file['Section_4']['Section_Content'] is None:
            section_count_None['section_4_none'] += 1
        
        if file['Section_5']['Section_Content'] is None:
            section_count_None['section_5_none'] += 1
            
        if file['Section_6']['Section_Content'] is None:
            section_count_None['section_6_none'] += 1
            
        ### NER outputs
        if file['Section_1']['Entity_Recognition'] is None:
            NER_count_None['section_1_NER_none'] += 1
        
        if file['Section_2']['Entity_Recognition'] is None:
            NER_count_None['section_2_NER_none'] += 1
        
        if file['Section_3']['Entity_Recognition'] is None:
            NER_count_None['section_3_NER_none'] += 1
        
        if file['Section_4']['Entity_Recognition'] is None:
            NER_count_None['section_4_NER_none'] += 1
        
        if file['Section_5']['Entity_Recognition'] is None:
            NER_count_None['section_5_NER_none'] += 1
            
        if file['Section_6']['Entity_Recognition'] is None:
            NER_count_None['section_6_NER_none'] += 1
    
    return section_count_None, NER_count_None

In [7]:
count_sections_None(whole_dataset)

({'section_1_none': 22,
  'section_2_none': 27,
  'section_3_none': 23,
  'section_4_none': 41,
  'section_5_none': 164,
  'section_6_none': 25},
 {'section_1_NER_none': 15,
  'section_2_NER_none': 15,
  'section_3_NER_none': 11,
  'section_4_NER_none': 31,
  'section_5_NER_none': 472,
  'section_6_NER_none': 22})

### Count Empty values in each section type

In [8]:
def count_sections_empty(dataset):
    """
    check for empty in section_content and/or NER_output
    """
    
    section_count_empty = {'section_1_empty': 0,
                     'section_2_empty': 0,
                     'section_3_empty': 0,
                     'section_4_empty': 0,
                     'section_5_empty': 0,
                     'section_6_empty': 0
                    }
    
    NER_count_empty = {'section_1_NER_empty': 0,
                     'section_2_NER_empty': 0,
                     'section_3_NER_empty': 0,
                     'section_4_NER_empty': 0,
                     'section_5_NER_empty': 0,
                     'section_6_NER_empty': 0
                    }
    
    for file in dataset:
        
        ### section contents
        if file['Section_1']['Section_Content'] is not None:
            if len(file['Section_1']['Section_Content']) < 1:
                section_count_empty['section_1_empty'] += 1
        
        if file['Section_2']['Section_Content'] is not None:
            if len(file['Section_2']['Section_Content']) < 1:
                section_count_empty['section_2_empty'] += 1
        
        if file['Section_3']['Section_Content'] is not None:
            if len(file['Section_3']['Section_Content']) < 1:
                section_count_empty['section_3_empty'] += 1
        
        if file['Section_4']['Section_Content'] is not None:
            if len(file['Section_4']['Section_Content']) < 1:
                section_count_empty['section_4_empty'] += 1
        
        if file['Section_5']['Section_Content'] is not None:
            if len(file['Section_5']['Section_Content']) < 1:
                section_count_empty['section_5_empty'] += 1
            
        if file['Section_6']['Section_Content'] is not None:
            if len(file['Section_6']['Section_Content']) < 1:
                section_count_empty['section_6_empty'] += 1
            
        ### NER outputs
        if file['Section_1']['Entity_Recognition'] is not None:
            if len(file['Section_1']['Entity_Recognition']) < 1:
                NER_count_empty['section_1_NER_empty'] += 1
        
        if file['Section_2']['Entity_Recognition'] is not None:
            if len(file['Section_2']['Entity_Recognition']) < 1:
                NER_count_empty['section_2_NER_empty'] += 1
        
        if file['Section_3']['Entity_Recognition'] is not None:
            if len(file['Section_3']['Entity_Recognition']) < 1:
                NER_count_empty['section_3_NER_empty'] += 1
        
        if file['Section_4']['Entity_Recognition'] is not None:
            if len(file['Section_4']['Entity_Recognition']) < 1:
                NER_count_empty['section_4_NER_empty'] += 1
        
        if file['Section_5']['Entity_Recognition'] is not None:
            if len(file['Section_5']['Entity_Recognition']) < 1:
                NER_count_empty['section_5_NER_empty'] += 1
            
        if file['Section_6']['Entity_Recognition'] is not None:
            if len(file['Section_6']['Entity_Recognition']) < 1:
                NER_count_empty['section_6_NER_empty'] += 1
    
    return section_count_empty, NER_count_empty

In [9]:
count_sections_empty(whole_dataset)

({'section_1_empty': 0,
  'section_2_empty': 0,
  'section_3_empty': 0,
  'section_4_empty': 0,
  'section_5_empty': 0,
  'section_6_empty': 0},
 {'section_1_NER_empty': 0,
  'section_2_NER_empty': 0,
  'section_3_NER_empty': 0,
  'section_4_NER_empty': 0,
  'section_5_NER_empty': 0,
  'section_6_NER_empty': 0})

### Check for duplicates string-wise

In [10]:
def check_duplicates_section(dataset):
    """
    Test dataset for duplicates in section_content and NER_output for corresponding section_content
    """

    # keep track of unique NER outputs observed so far
    unique_NER_outputs = set()

    # keep track of unique section contents observed so far
    unique_sections = set()
    
    COUNT_DUPLICATE_NER_OUTPUTS = 0
    COUNT_DUPLICATE_SECTIONS = 0
    
    for leaflet in dataset:
        
        leaflet_sections = [leaflet['Section_1']['Section_Content'],
                            leaflet['Section_2']['Section_Content'],
                            leaflet['Section_3']['Section_Content'],
                            leaflet['Section_4']['Section_Content'],
                            leaflet['Section_5']['Section_Content'],
                            leaflet['Section_6']['Section_Content']]

        for section_index, section_content in enumerate(leaflet_sections):

            if section_content is None:
                continue
            
            # if section_content is already in set - unique_sections, then it is a duplicate
            if section_content in unique_sections and len(section_content) > 1:
                COUNT_DUPLICATE_SECTIONS += 1
            # add section_content to a set
            else:
                unique_sections.add(section_content)
        
        ### check for dupicate NERs
        
        leaflet_NERs = [leaflet['Section_1']['Entity_Recognition'],
                        leaflet['Section_2']['Entity_Recognition'],
                        leaflet['Section_3']['Entity_Recognition'],
                        leaflet['Section_4']['Entity_Recognition'],
                        leaflet['Section_5']['Entity_Recognition'],
                        leaflet['Section_6']['Entity_Recognition']]
        
        for section_index, NER_output in enumerate(leaflet_NERs):

            if NER_output is None:
                continue
                        
            # convert list of entitis to string of entities['Text']
            NER_output_str = ""
            for entity in NER_output:
                NER_output_str += entity['Text'] + " "
            
            # if section_content is already in set - unique_sections, then it is a duplicate
            if NER_output_str in unique_NER_outputs and len(NER_output) > 1:
                COUNT_DUPLICATE_NER_OUTPUTS += 1
            # add section_content to a set
            else:
                unique_NER_outputs.add(NER_output_str)
        
            
    print('Num. of detected duplicate NER outputs:', COUNT_DUPLICATE_NER_OUTPUTS)
    print('Num. of detected duplicate section contents:', COUNT_DUPLICATE_SECTIONS)
    
    print()
    
    print('Num. of unique NER outputs:', len(unique_NER_outputs))
    print('Num. of unique section contents:', len(unique_sections))


In [11]:
check_duplicates_section(whole_dataset)

Num. of detected duplicate NER outputs: 0
Num. of detected duplicate section contents: 0

Num. of unique NER outputs: 7450
Num. of unique section contents: 7714


Comment: 1336 * 6 = 8016, the rest are Nones

### TO-DO: check for dupicates in a smart way!

## Cals Statistics

### Calc number samples per section

---> if section has a section1 and section_content is not None and is not Empty - count +1 to section1

In [12]:
def count_sections(dataset):
    
    section_count = {'section_1_num': 0,
                     'section_2_num': 0,
                     'section_3_num': 0,
                     'section_4_num': 0,
                     'section_5_num': 0,
                     'section_6_num': 0
                    }
    
    for file in dataset:

        if file['Section_1']['Section_Content'] is not None:
            if len(file['Section_1']['Section_Content']) > 0:
                section_count['section_1_num'] += 1
        
        if file['Section_2']['Section_Content'] is not None:
            if len(file['Section_2']['Section_Content']) > 0:
                section_count['section_2_num'] += 1
        
        if file['Section_3']['Section_Content'] is not None:
            if len(file['Section_3']['Section_Content']) > 0:
                section_count['section_3_num'] += 1
        
        if file['Section_4']['Section_Content'] is not None:
            if len(file['Section_4']['Section_Content']) > 0:
                section_count['section_4_num'] += 1
        
        if file['Section_5']['Section_Content'] is not None:
            if len(file['Section_5']['Section_Content']) > 0:
                section_count['section_5_num'] += 1
            
        if file['Section_6']['Section_Content'] is not None:
            if len(file['Section_6']['Section_Content']) > 0:
                section_count['section_6_num'] += 1
        
    return section_count

In [13]:
test_section_count = count_sections(test_dataset)
val_section_count = count_sections(val_dataset)
train_section_count = count_sections(train_dataset)

In [14]:
print(test_section_count)
print(val_section_count)
print(train_section_count)
print("\n")


for sec in ['section_1_num', 'section_2_num', 'section_3_num', 'section_4_num', 'section_5_num', 'section_6_num']:
    print(sec, " : ", test_section_count[sec] + val_section_count[sec] + train_section_count[sec])

{'section_1_num': 133, 'section_2_num': 133, 'section_3_num': 133, 'section_4_num': 129, 'section_5_num': 115, 'section_6_num': 133}
{'section_1_num': 130, 'section_2_num': 130, 'section_3_num': 134, 'section_4_num': 130, 'section_5_num': 124, 'section_6_num': 130}
{'section_1_num': 1051, 'section_2_num': 1046, 'section_3_num': 1046, 'section_4_num': 1036, 'section_5_num': 933, 'section_6_num': 1048}


section_1_num  :  1314
section_2_num  :  1309
section_3_num  :  1313
section_4_num  :  1295
section_5_num  :  1172
section_6_num  :  1311


### Calc average length char-wise of each section 

In [15]:
def calc_section_len(dataset):
    """
    Calculate the average length of each section(1-6)
    :param package_leaflets: list, collection of package leaflets
    :return: dict, (key: str, section_num; value: list of lengths of corresponding sections)
    """

    # calculate length of each section

    section_content_len = {
        '1': [],
        '2': [],
        '3': [],
        '4': [],
        '5': [],
        '6': []
    }

    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Section_Content'] is not None:
            section_content_len['1'].append(len(leaflet['Section_1']['Section_Content']))
        
        if leaflet['Section_2']['Section_Content'] is not None:
            section_content_len['2'].append(len(leaflet['Section_2']['Section_Content']))
        
        if leaflet['Section_3']['Section_Content'] is not None:
            section_content_len['3'].append(len(leaflet['Section_3']['Section_Content']))
        
        if leaflet['Section_4']['Section_Content'] is not None:
            section_content_len['4'].append(len(leaflet['Section_4']['Section_Content']))
        
        if leaflet['Section_5']['Section_Content'] is not None:
            section_content_len['5'].append(len(leaflet['Section_5']['Section_Content']))
        
        if leaflet['Section_6']['Section_Content'] is not None:
            section_content_len['6'].append(len(leaflet['Section_6']['Section_Content']))
            
    print('Section 1: ', np.mean(section_content_len['1']))
    print('Section 2: ', np.mean(section_content_len['2']))
    print('Section 3: ', np.mean(section_content_len['3']))
    print('Section 4: ', np.mean(section_content_len['4']))
    print('Section 5: ', np.mean(section_content_len['5']))
    print('Section 6: ', np.mean(section_content_len['6']))
    
    print()
    
    print('Section 1 Num. Samples: ', len(section_content_len['1']))
    print('Section 2 Num. Samples: ', len(section_content_len['2']))
    print('Section 3 Num. Samples: ', len(section_content_len['3']))
    print('Section 4 Num. Samples: ', len(section_content_len['4']))
    print('Section 5 Num. Samples: ', len(section_content_len['5']))
    print('Section 6 Num. Samples: ', len(section_content_len['6']))

    # return section_content_len

In [16]:
calc_section_len(whole_dataset)

Section 1:  962.7945205479452
Section 2:  4559.597402597403
Section 3:  2300.4912414318355
Section 4:  3452.67722007722
Section 5:  630.4846416382253
Section 6:  981.7040427154843

Section 1 Num. Samples:  1314
Section 2 Num. Samples:  1309
Section 3 Num. Samples:  1313
Section 4 Num. Samples:  1295
Section 5 Num. Samples:  1172
Section 6 Num. Samples:  1311


*Numbers calculated before:    *

Section 1:  962.7945205479452    
Section 2:  4559.597402597403    
Section 3:  2300.4912414318355   
Section 4:  3452.67722007722   
Section 5:  630.4846416382253   
Section 6:  981.7040427154843   

section 1 ”Therapeu-tic Indication”, which is 1 019 characters lon gon average.    
section 4 ”Possible side effects” (3 488 characters longon average) generation is poor    

**Explanation** - before I was calculating length on tokenized text just for test_dataset.

### Calc Average Len in tokens - split()

In [50]:
def calc_tokens_split(dataset):

    # calculate length of each section in tokens

    section_content_split = {
        '1': [],
        '2': [],
        '3': [],
        '4': [],
        '5': [],
        '6': []
    }

    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Section_Content'] is not None:
            # array of tokens
            section_content_array = leaflet['Section_1']['Section_Content'].split()
            section_content_split['1'].append(len(section_content_array))
        
        if leaflet['Section_2']['Section_Content'] is not None:
            section_content_array = leaflet['Section_2']['Section_Content'].split()
            section_content_split['2'].append(len(section_content_array))
        
        if leaflet['Section_3']['Section_Content'] is not None:
            section_content_array = leaflet['Section_3']['Section_Content'].split()
            section_content_split['3'].append(len(section_content_array))
        
        if leaflet['Section_4']['Section_Content'] is not None:
            section_content_array = leaflet['Section_4']['Section_Content'].split()
            section_content_split['4'].append(len(section_content_array))
        
        if leaflet['Section_5']['Section_Content'] is not None:
            section_content_array = leaflet['Section_5']['Section_Content'].split()
            section_content_split['5'].append(len(section_content_array))
        
        if leaflet['Section_6']['Section_Content'] is not None:
            section_content_array = leaflet['Section_6']['Section_Content'].split()
            section_content_split['6'].append(len(section_content_array))
            
    print('Section 1: ', np.mean(section_content_split['1']))
    print('Section 2: ', np.mean(section_content_split['2']))
    print('Section 3: ', np.mean(section_content_split['3']))
    print('Section 4: ', np.mean(section_content_split['4']))
    print('Section 5: ', np.mean(section_content_split['5']))
    print('Section 6: ', np.mean(section_content_split['6']))
    
    # assert lengh of arrays == already calculated values
    assert len(section_content_split['1']) == 1314
    assert len(section_content_split['2']) == 1309
    assert len(section_content_split['3']) == 1313
    assert len(section_content_split['4']) == 1295
    assert len(section_content_split['5']) == 1172
    assert len(section_content_split['6']) == 1311
    
    # return section_content_len

In [51]:
calc_tokens_split(whole_dataset)

Section 1:  152.68645357686452
Section 2:  736.1229946524064
Section 3:  407.8926123381569
Section 4:  546.4277992277993
Section 5:  111.63054607508532
Section 6:  152.7398932112891


### Calc Average Len in tokens - wordpunct_tokenize()

In [52]:
from nltk.tokenize import wordpunct_tokenize

def calc_tokens_wordpunct_tokenize(dataset):

    # calculate length of each section in tokens

    section_content_wordpunct_token = {
        '1': [],
        '2': [],
        '3': [],
        '4': [],
        '5': [],
        '6': []
    }

    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Section_Content'] is not None:
            # array of tokens
            section_content_array = wordpunct_tokenize(leaflet['Section_1']['Section_Content'])
            section_content_wordpunct_token['1'].append(len(section_content_array))
            
        if leaflet['Section_2']['Section_Content'] is not None:
            section_content_array = wordpunct_tokenize(leaflet['Section_2']['Section_Content'])
            section_content_wordpunct_token['2'].append(len(section_content_array))
        
        if leaflet['Section_3']['Section_Content'] is not None:
            section_content_array = wordpunct_tokenize(leaflet['Section_3']['Section_Content'])
            section_content_wordpunct_token['3'].append(len(section_content_array))
        
        if leaflet['Section_4']['Section_Content'] is not None:
            section_content_array = wordpunct_tokenize(leaflet['Section_4']['Section_Content'])
            section_content_wordpunct_token['4'].append(len(section_content_array))
        
        if leaflet['Section_5']['Section_Content'] is not None:
            section_content_array = wordpunct_tokenize(leaflet['Section_5']['Section_Content'])
            section_content_wordpunct_token['5'].append(len(section_content_array))
        
        if leaflet['Section_6']['Section_Content'] is not None:
            section_content_array = wordpunct_tokenize(leaflet['Section_6']['Section_Content'])
            section_content_wordpunct_token['6'].append(len(section_content_array))
            
    print('Section 1: ', np.mean(section_content_wordpunct_token['1']))
    print('Section 2: ', np.mean(section_content_wordpunct_token['2']))
    print('Section 3: ', np.mean(section_content_wordpunct_token['3']))
    print('Section 4: ', np.mean(section_content_wordpunct_token['4']))
    print('Section 5: ', np.mean(section_content_wordpunct_token['5']))
    print('Section 6: ', np.mean(section_content_wordpunct_token['6']))
    
    # assert lengh of arrays == already calculated values
    assert len(section_content_wordpunct_token['1']) == 1314
    assert len(section_content_wordpunct_token['2']) == 1309
    assert len(section_content_wordpunct_token['3']) == 1313
    assert len(section_content_wordpunct_token['4']) == 1295
    assert len(section_content_wordpunct_token['5']) == 1172
    assert len(section_content_wordpunct_token['6']) == 1311
        
    # return section_content_len

In [53]:
calc_tokens_wordpunct_tokenize(whole_dataset)

Section 1:  174.6986301369863
Section 2:  849.7158135981665
Section 3:  458.44325971058646
Section 4:  651.03861003861
Section 5:  123.88737201365188
Section 6:  196.86803966437833


### Calc Average number of entities per section  


For the dataset table, I think it would be very useful to include the entity types (e.g. diseases etc) and a break down on how often they appear. (This would help the readers to understand which types of entities are included, and how many.)

In [57]:
test_dataset[33]['Section_1']['Entity_Recognition']

[{'Text': 'incivo', 'Type': 'PRODUCT_NAME', 'BeginOffset': 0, 'EndOffset': 0},
 {'Text': 'the virus', 'Type': 'PROBLEM', 'BeginOffset': 20, 'EndOffset': 29},
 {'Id': 5,
  'BeginOffset': 42,
  'EndOffset': 63,
  'Score': 0.9415066838264465,
  'Text': 'hepatitis c infection',
  'Category': 'MEDICAL_CONDITION',
  'Type': 'DX_NAME',
  'Traits': [{'Name': 'DIAGNOSIS', 'Score': 0.9677639603614807}]},
 {'Text': 'chronic hepatitis c infection',
  'Type': 'PROBLEM',
  'BeginOffset': 85,
  'EndOffset': 114},
 {'Id': 13,
  'BeginOffset': 139,
  'EndOffset': 143,
  'Score': 0.21955788135528564,
  'Text': '1865',
  'Category': 'PROTECTED_HEALTH_INFORMATION',
  'Type': 'AGE',
  'Traits': []},
 {'Id': 0,
  'BeginOffset': 171,
  'EndOffset': 189,
  'Score': 0.92701256275177,
  'Text': 'peginterferon alfa',
  'Category': 'MEDICATION',
  'Type': 'GENERIC_NAME',
  'Traits': []},
 {'Id': 1,
  'BeginOffset': 194,
  'EndOffset': 203,
  'Score': 0.9990363121032715,
  'Text': 'ribavirin',
  'Category': 'MEDIC

In [17]:
def calc_entities_type_section(dataset):
    
    # calculate entity_type - num occurences per section
    
    # num entities per section
    section_entities_num = {
        '1': 0,
        '2': 0,
        '3': 0,
        '4': 0,
        '5': 0,
        '6': 0
    }
    
    # 'entity type': num
    section_entities_type = {
        '1': {},
        '2': {},
        '3': {},
        '4': {},
        '5': {},
        '6': {}
    }
    
    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Entity_Recognition'] is not None:
            
            for entity in leaflet['Section_1']['Entity_Recognition']:
                entity_type = entity['Type']
                
                # number of entities per section
                section_entities_num['1'] += 1
                
                # keep track of entity_type and num. of occurences
                if entity_type in section_entities_type['1']:
                    section_entities_type['1'][entity_type] += 1
                else:
                    section_entities_type['1'][entity_type] = 1
        
            
        if leaflet['Section_2']['Entity_Recognition'] is not None:
            
            for entity in leaflet['Section_2']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['2'] += 1
                
                if entity_type in section_entities_type['2']:
                    section_entities_type['2'][entity_type] += 1
                else:
                    section_entities_type['2'][entity_type] = 1
        
        if leaflet['Section_3']['Entity_Recognition'] is not None:
            
            for entity in leaflet['Section_3']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['3'] += 1
                
                if entity_type in section_entities_type['3']:
                    section_entities_type['3'][entity_type] += 1
                else:
                    section_entities_type['3'][entity_type] = 1
        
        if leaflet['Section_4']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_4']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['4'] += 1
                
                if entity_type in section_entities_type['4']:
                    section_entities_type['4'][entity_type] += 1
                else:
                    section_entities_type['4'][entity_type] = 1
        
        if leaflet['Section_5']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_5']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['5'] += 1
                
                if entity_type in section_entities_type['5']:
                    section_entities_type['5'][entity_type] += 1
                else:
                    section_entities_type['5'][entity_type] = 1
        
        if leaflet['Section_6']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_6']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['6'] += 1
                
                if entity_type in section_entities_type['6']:
                    section_entities_type['6'][entity_type] += 1
                else:
                    section_entities_type['6'][entity_type] = 1
    
    print("Entities count per section")
    print(section_entities_num)
    
    print("\n\n Entities type per section")
    print(section_entities_type)
    
    
    # check
    assert(section_entities_num['1']) == sum(list(section_entities_type['1'].values()))
    assert(section_entities_num['2']) == sum(list(section_entities_type['2'].values()))
    assert(section_entities_num['3']) == sum(list(section_entities_type['3'].values()))
    assert(section_entities_num['4']) == sum(list(section_entities_type['4'].values()))
    assert(section_entities_num['5']) == sum(list(section_entities_type['5'].values()))
    assert(section_entities_num['6']) == sum(list(section_entities_type['6'].values()))

In [18]:
calc_entities_type_section(whole_dataset)

Entities count per section
{'1': 38542, '2': 167271, '3': 66304, '4': 175021, '5': 7341, '6': 50297}


 Entities type per section
{'1': {'PRODUCT_NAME': 1321, 'PROBLEM': 8850, 'SYSTEM_ORGAN_SITE': 2588, 'DX_NAME': 8363, 'TEST': 1215, 'TREATMENT_NAME': 1249, 'GENERIC_NAME': 3895, 'TREATMENT': 7624, 'BRAND_NAME': 1131, 'NAME': 104, 'NUMBER': 1132, 'TEST_NAME': 370, 'PROCEDURE_NAME': 249, 'TIME_TO_DX_NAME': 92, 'DATE': 12, 'AGE': 235, 'TIME_TO_TREATMENT_NAME': 30, 'TIME_TO_MEDICATION_NAME': 31, 'ADDRESS': 20, 'TIME_TO_PROCEDURE_NAME': 14, 'TIME_TO_TEST_NAME': 3, 'ID': 4, 'PHONE_OR_FAX': 8, 'PROFESSION': 1, 'URL': 1}, '2': {'PRODUCT_NAME': 1321, 'TREATMENT': 38334, 'BRAND_NAME': 6240, 'PROBLEM': 25147, 'DX_NAME': 44819, 'GENERIC_NAME': 25509, 'NUMBER': 6073, 'TREATMENT_NAME': 5286, 'SYSTEM_ORGAN_SITE': 5191, 'PROCEDURE_NAME': 722, 'TIME_TO_PROCEDURE_NAME': 97, 'TEST': 3875, 'AGE': 530, 'TIME_TO_TREATMENT_NAME': 395, 'TIME_TO_MEDICATION_NAME': 562, 'TIME_TO_DX_NAME': 1374, 'TEST_NAME': 1585

### Number unique entities per section

In [79]:
def calc_unique_entities_section(dataset):
    
    # calculate unique entities per section
    
    # unique entities
    section_unique_entities = {
        '1': set(),
        '2': set(),
        '3': set(),
        '4': set(),
        '5': set(),
        '6': set()
    }
    
    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_1']['Entity_Recognition']:
                entity_text = entity['Text']
                section_unique_entities['1'].add(entity_text)
            
        if leaflet['Section_2']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_2']['Entity_Recognition']:
                entity_text = entity['Text']
                section_unique_entities['2'].add(entity_text)
        
        if leaflet['Section_3']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_3']['Entity_Recognition']:
                entity_text = entity['Text']
                section_unique_entities['3'].add(entity_text)
        
        if leaflet['Section_4']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_4']['Entity_Recognition']:
                entity_text = entity['Text']
                section_unique_entities['4'].add(entity_text)
        
        if leaflet['Section_5']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_5']['Entity_Recognition']:
                entity_text = entity['Text']
                section_unique_entities['5'].add(entity_text)
        
        if leaflet['Section_6']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_6']['Entity_Recognition']:
                entity_text = entity['Text']
                section_unique_entities['6'].add(entity_text)
    
    for section_type in section_unique_entities:
        print(section_type, " = ", len(section_unique_entities[section_type]))
    
    
    # check ? 
    return section_unique_entities

In [80]:
bla = calc_unique_entities_section(whole_dataset)

1  =  9641
2  =  23278
3  =  11640
4  =  27945
5  =  2041
6  =  9932


1  =  9641
2  =  23278
3  =  11640
4  =  27945
5  =  2041
6  =  9932

In [88]:
def calc_unique_entities_section(dataset):
    
    ### main difference - now I can actualy see the most popular entities per each section
    
    # calculate unique entities per section
    
    # entity - num. it appears
    section_unique_entities = {
        '1': dict(),
        '2': dict(),
        '3': dict(),
        '4': dict(),
        '5': dict(),
        '6': dict()
    }
    
    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_1']['Entity_Recognition']:
                entity_text = entity['Text']
                
                if entity_text in section_unique_entities['1']:
                    section_unique_entities['1'][entity_text] += 1
                else:
                    section_unique_entities['1'][entity_text] = 1
            
        if leaflet['Section_2']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_2']['Entity_Recognition']:
                entity_text = entity['Text']
                if entity_text in section_unique_entities['2']:
                    section_unique_entities['2'][entity_text] += 1
                else:
                    section_unique_entities['2'][entity_text] = 1
        
        if leaflet['Section_3']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_3']['Entity_Recognition']:
                entity_text = entity['Text']
                if entity_text in section_unique_entities['3']:
                    section_unique_entities['3'][entity_text] += 1
                else:
                    section_unique_entities['3'][entity_text] = 1
        
        if leaflet['Section_4']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_4']['Entity_Recognition']:
                entity_text = entity['Text']
                if entity_text in section_unique_entities['4']:
                    section_unique_entities['4'][entity_text] += 1
                else:
                    section_unique_entities['4'][entity_text] = 1
        
        if leaflet['Section_5']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_5']['Entity_Recognition']:
                entity_text = entity['Text']
                if entity_text in section_unique_entities['5']:
                    section_unique_entities['5'][entity_text] += 1
                else:
                    section_unique_entities['5'][entity_text] = 1
        
        if leaflet['Section_6']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_6']['Entity_Recognition']:
                entity_text = entity['Text']
                if entity_text in section_unique_entities['6']:
                    section_unique_entities['6'][entity_text] += 1
                else:
                    section_unique_entities['6'][entity_text] = 1
    
    for section_type in section_unique_entities:
        
        total_num_entities = 0
        for entity_text in section_unique_entities[section_type]:
            total_num_entities += section_unique_entities[section_type][entity_text]
        
        print(section_type, " ~ Num. unique entities: ", len(section_unique_entities[section_type]))
        print(section_type, " ~ Total entities: ", total_num_entities)
        print()
        
    # but now I can see the most popular entities per each section
    # return section_unique_entities

In [89]:
calc_unique_entities_section(whole_dataset)

1  ~ Num. unique entities:  9641
1  ~ Total entities:  38542

2  ~ Num. unique entities:  23278
2  ~ Total entities:  167271

3  ~ Num. unique entities:  11640
3  ~ Total entities:  66304

4  ~ Num. unique entities:  27945
4  ~ Total entities:  175021

5  ~ Num. unique entities:  2041
5  ~ Total entities:  7341

6  ~ Num. unique entities:  9932
6  ~ Total entities:  50297



-------------------------
### TYPO in section 6 - What I was calculating before:

----------------------------

1019.984962406015   
3488.5193798449613    

section 1: <PRODUCT_NAME> incivo </PRODUCT_NAME> <PROBLEM> the_virus </PROBLEM> <DX_NAME> hepatitis_c_infection </DX_NAME> <PROBLEM> chronic_hepatitis_c_infection </PROBLEM> <AGE> 1865 </AGE> <GENERIC_NAME> peginterferon_alfa </GENERIC_NAME> <GENERIC_NAME> ribavirin </GENERIC_NAME> <GENERIC_NAME> telaprevir </GENERIC_NAME> <TREATMENT> medicines </TREATMENT> <TREATMENT_NAME> ns3-4a_protease_inhibitors </TREATMENT_NAME> <TREATMENT> the_ns3-4a_protease_inhibitor </TREATMENT> <DX_NAME> hepatitis_c_virus </DX_NAME> <GENERIC_NAME> peginterferon_alfa </GENERIC_NAME> <GENERIC_NAME> ribavirin </GENERIC_NAME> <TREATMENT> incivo </TREATMENT> <PROBLEM> chronic_hepatitis_c_infection </PROBLEM> <PROBLEM> chronic_hepatitis_c_infection </PROBLEM>     <TIME_TO_TREATMENT_NAME> previously </TIME_TO_TREATMENT_NAME> <TREATMENT> an_interferon-based_regimen </TREATMENT>
incivo acts against the virus that causes hepatitis c infection and is used to treat chronic hepatitis c infection in adult patients ( aged 1865 years ) in combination with peginterferon alfa and ribavirin . incivo contains a substance called telaprevir and belongs to a group of medicines called ' ns3 - 4a protease inhibitors '. the ns3 - 4a protease inhibitor reduces the amount of hepatitis c virus in your body . incivo must not be taken alone and must be taken in combination with peginterferon alfa and ribavirin to be sure your treatment works . incivo can be used for patients with chronic hepatitis c infection who have never been treated before or can be used in patients with chronic hepatitis c infection who have been treated previously with an interferon - based regimen .   

1019.984962406015    
3488.5193798449613   

In [None]:
############

PATH='/content/drive/MyDrive/bayer-intern/content_planner_generations/section1_tgt_test.txt'

# read T5 reference sections
with open(PATH) as f:
    section1_ref = [line.strip() for line in f]

length_1 = []
for i in section1_ref:
  length_1.append(len(i))

print(np.mean(length_1))

PATH='/content/drive/MyDrive/bayer-intern/content_planner_generations/section4_tgt_test.txt'

# read T5 reference sections
with open(PATH) as f:
    section4_ref = [line.strip() for line in f]

length_4 = []
for i in section4_ref:
  length_4.append(len(i))

print(np.mean(length_4))

## ----------------------------
PATH = '/content/drive/MyDrive/bayer-intern/T5_condition_input/input_data/test.source'
# read T5 reference sections
with open(PATH) as f:
    input = [line.strip() for line in f]

PATH = '/content/drive/MyDrive/bayer-intern/T5_condition_input/input_data/test.target'
# read T5 reference sections
with open(PATH) as f:
    output = [line.strip() for line in f]

print(input[0])
print(output[0])

bla_1 = []
bla_4 = []
for i in range(len(input)):
  if input[i][:9] == 'section 1':
    bla_1.append(len(output[i]))
  elif input[i][:9] == 'section 4':
    bla_4.append(len(output[i]))

print(np.mean(bla_1))
print(np.mean(bla_4))
###########

Reason: -- because here it is tokenized text for test dataset ??     


incivo acts against the virus that causes hepatitis c infection and is used to treat chronic hepatitis c infection in adult patients ( aged 1865 years ) in combination with peginterferon alfa and ribavirin . incivo contains a substance called telaprevir and belongs to a group of medicines called ' ns3 - 4a protease inhibitors '. the ns3 - 4a protease inhibitor reduces the amount of hepatitis c virus in your body . incivo must not be taken alone and must be taken in combination with peginterferon alfa and ribavirin to be sure your treatment works . incivo can be used for patients with chronic hepatitis c infection who have never been treated before or can be used in patients with chronic hepatitis c infection who have been treated previously with an interferon - based regimen .

In [41]:
from nltk.tokenize import wordpunct_tokenize

def calc_section_len_reproduce_error(dataset):
    """
    Calculate the average length of each section(1-6)
    :param package_leaflets: list, collection of package leaflets
    :return: dict, (key: str, section_num; value: list of lengths of corresponding sections)
    """

    # calculate length of each section

    section_content_len = {
        '1': [],
        '2': [],
        '3': [],
        '4': [],
        '5': [],
        '6': []
    }

    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Section_Content'] is not None:
            section_content = wordpunct_tokenize(leaflet['Section_1']['Section_Content'])
            section_content = " ".join(section_content)
            # section_content = section_content + "\n"
            section_content_len['1'].append(len(section_content))
        
        if leaflet['Section_2']['Section_Content'] is not None:
            section_content = wordpunct_tokenize(leaflet['Section_2']['Section_Content'])
            section_content = " ".join(section_content)
            # section_content = section_content + "\n"
            section_content_len['2'].append(len(section_content))
        
        if leaflet['Section_3']['Section_Content'] is not None:
            section_content = wordpunct_tokenize(leaflet['Section_3']['Section_Content'])
            section_content = " ".join(section_content)
            # section_content = section_content + "\n"
            section_content_len['3'].append(len(section_content))
        
        if leaflet['Section_4']['Section_Content'] is not None:
            section_content = wordpunct_tokenize(leaflet['Section_4']['Section_Content'])
            section_content = " ".join(section_content)
            # section_content = section_content + "\n"
            section_content_len['4'].append(len(section_content))
        
        if leaflet['Section_5']['Section_Content'] is not None:
            section_content = wordpunct_tokenize(leaflet['Section_5']['Section_Content'])
            section_content = " ".join(section_content)
            # section_content = section_content + "\n"
            section_content_len['5'].append(len(section_content))
        
        if leaflet['Section_6']['Section_Content'] is not None:
            section_content = wordpunct_tokenize(leaflet['Section_6']['Section_Content'])
            section_content = " ".join(section_content)
            # section_content = section_content + "\n"
            section_content_len['6'].append(len(section_content))
            
    print('Section 1: ', np.mean(section_content_len['1']))
    print('Section 2: ', np.mean(section_content_len['2']))
    print('Section 3: ', np.mean(section_content_len['3']))
    print('Section 4: ', np.mean(section_content_len['4']))
    print('Section 5: ', np.mean(section_content_len['5']))
    print('Section 6: ', np.mean(section_content_len['6']))

    # return section_content_len

In [42]:
calc_section_len_reproduce_error(test_dataset)

Section 1:  1019.984962406015
Section 2:  4465.187969924812
Section 3:  2265.2631578947367
Section 4:  3488.5193798449613
Section 5:  642.7217391304348
Section 6:  1022.6315789473684


1019.984962406015    
3488.5193798449613    

In [34]:
for i in test_dataset:
    if i['Product_Name'] == 'Incivo':
        print(i['Section_1']['Section_Content'])
        print()
        print(test_dataset.index(i))
        print()
        print(wordpunct_tokenize(i['Section_1']['Section_Content']))

incivo acts against the virus that causes hepatitis c infection and is used to treat chronic hepatitis c infection in adult patients (aged 1865 years) in combination with peginterferon alfa and ribavirin. incivo contains a substance called telaprevir and belongs to a group of medicines called 'ns3-4a protease inhibitors'. the ns3-4a protease inhibitor reduces the amount of hepatitis c virus in your body. incivo must not be taken alone and must be taken in combination with peginterferon alfa and ribavirin to be sure your treatment works. incivo can be used for patients with chronic hepatitis c infection who have never been treated before or can be used in patients with chronic hepatitis c infection who have been treated previously with an interferon-based regimen.

33

['incivo', 'acts', 'against', 'the', 'virus', 'that', 'causes', 'hepatitis', 'c', 'infection', 'and', 'is', 'used', 'to', 'treat', 'chronic', 'hepatitis', 'c', 'infection', 'in', 'adult', 'patients', '(', 'aged', '1865', 

----------------------------------------------------------------------

In [19]:
def calc_entities_type_section(dataset):
    
    # calculate entity_type - num occurences per section
    
    # num entities per section
    section_entities_num = {
        '1': 0,
        '2': 0,
        '3': 0,
        '4': 0,
        '5': 0,
        '6': 0
    }
    
    # 'entity type': num
    section_entities_type = {
        '1': {},
        '2': {},
        '3': {},
        '4': {},
        '5': {},
        '6': {}
    }
    
    # calc the length of section content and add to list
    for leaflet in dataset:
        
        if leaflet['Section_1']['Entity_Recognition'] is not None:
            
            for entity in leaflet['Section_1']['Entity_Recognition']:
                entity_type = entity['Type']
                
                # number of entities per section
                section_entities_num['1'] += 1
                
                # keep track of entity_type and num. of occurences
                if entity_type in section_entities_type['1']:
                    section_entities_type['1'][entity_type] += 1
                else:
                    section_entities_type['1'][entity_type] = 1
        
            
        if leaflet['Section_2']['Entity_Recognition'] is not None:
            
            for entity in leaflet['Section_2']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['2'] += 1
                
                if entity_type in section_entities_type['2']:
                    section_entities_type['2'][entity_type] += 1
                else:
                    section_entities_type['2'][entity_type] = 1
        
        if leaflet['Section_3']['Entity_Recognition'] is not None:
            
            for entity in leaflet['Section_3']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['3'] += 1
                
                if entity_type in section_entities_type['3']:
                    section_entities_type['3'][entity_type] += 1
                else:
                    section_entities_type['3'][entity_type] = 1
        
        if leaflet['Section_4']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_4']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['4'] += 1
                
                if entity_type in section_entities_type['4']:
                    section_entities_type['4'][entity_type] += 1
                else:
                    section_entities_type['4'][entity_type] = 1
        
        if leaflet['Section_5']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_5']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['5'] += 1
                
                if entity_type in section_entities_type['5']:
                    section_entities_type['5'][entity_type] += 1
                else:
                    section_entities_type['5'][entity_type] = 1
        
        if leaflet['Section_6']['Entity_Recognition'] is not None:
            for entity in leaflet['Section_6']['Entity_Recognition']:
                entity_type = entity['Type']
                
                section_entities_num['6'] += 1
                
                if entity_type in section_entities_type['6']:
                    section_entities_type['6'][entity_type] += 1
                else:
                    section_entities_type['6'][entity_type] = 1
    
    print("Entities count per section")
    print(section_entities_num)
    
    print("\n\n Entities type per section")
    print(section_entities_type)
    
    
    # check
    assert(section_entities_num['1']) == sum(list(section_entities_type['1'].values()))
    assert(section_entities_num['2']) == sum(list(section_entities_type['2'].values()))
    assert(section_entities_num['3']) == sum(list(section_entities_type['3'].values()))
    assert(section_entities_num['4']) == sum(list(section_entities_type['4'].values()))
    assert(section_entities_num['5']) == sum(list(section_entities_type['5'].values()))
    assert(section_entities_num['6']) == sum(list(section_entities_type['6'].values()))
    
    
    return section_entities_type

In [21]:
result = calc_entities_type_section(whole_dataset)

Entities count per section
{'1': 38542, '2': 167271, '3': 66304, '4': 175021, '5': 7341, '6': 50297}


 Entities type per section
{'1': {'PRODUCT_NAME': 1321, 'PROBLEM': 8850, 'SYSTEM_ORGAN_SITE': 2588, 'DX_NAME': 8363, 'TEST': 1215, 'TREATMENT_NAME': 1249, 'GENERIC_NAME': 3895, 'TREATMENT': 7624, 'BRAND_NAME': 1131, 'NAME': 104, 'NUMBER': 1132, 'TEST_NAME': 370, 'PROCEDURE_NAME': 249, 'TIME_TO_DX_NAME': 92, 'DATE': 12, 'AGE': 235, 'TIME_TO_TREATMENT_NAME': 30, 'TIME_TO_MEDICATION_NAME': 31, 'ADDRESS': 20, 'TIME_TO_PROCEDURE_NAME': 14, 'TIME_TO_TEST_NAME': 3, 'ID': 4, 'PHONE_OR_FAX': 8, 'PROFESSION': 1, 'URL': 1}, '2': {'PRODUCT_NAME': 1321, 'TREATMENT': 38334, 'BRAND_NAME': 6240, 'PROBLEM': 25147, 'DX_NAME': 44819, 'GENERIC_NAME': 25509, 'NUMBER': 6073, 'TREATMENT_NAME': 5286, 'SYSTEM_ORGAN_SITE': 5191, 'PROCEDURE_NAME': 722, 'TIME_TO_PROCEDURE_NAME': 97, 'TEST': 3875, 'AGE': 530, 'TIME_TO_TREATMENT_NAME': 395, 'TIME_TO_MEDICATION_NAME': 562, 'TIME_TO_DX_NAME': 1374, 'TEST_NAME': 1585

In [24]:
num_types = set()

for sec_type in result:
    entities_section = result[sec_type]
    
    for entity_type in entities_section:
        num_types.add(entity_type)

In [32]:
len(num_types)

26

In [60]:
unique_display = set()
for i in [0, 100, 500, 300, 333, 777, 1234, 321, 12, -1, 228, 555, 7]:
    for entity in whole_dataset[i]['Section_1']['Entity_Recognition']:
        if entity['Type'] == 'TEST':
            unique_display.add(entity['Text'])
            
    for entity in whole_dataset[i]['Section_2']['Entity_Recognition']:
        if entity['Type'] == 'TEST':
            unique_display.add(entity['Text'])
            
    for entity in whole_dataset[i]['Section_4']['Entity_Recognition']:
        if entity['Type'] == 'TEST':
            unique_display.add(entity['Text'])

print(unique_display)

{'red blood cells', 'the heart', 'gamma gt', 'your blood cells', 'the enzyme', 'your blood or urine', 'a dental examination', 'the vegf protein', 'a monoclonal antibody', 'laboratory tests', 'potassium level', 'your blood levels', 'flushing tests', 'your kidney function', 'your platelet count', 'potassium in your blood', 'regular blood tests', 'the cd4 cell count', 'your blood pressure', 'your blood', 'the blood', 'an enzyme', 'sodium', 'blood tests', 'your urine', 'your last blood test', 'a blood test', 'red blood cell count', 'platelets', 'globotriaosylceramide (gl', 'a pregnancy test', 'your plasma levels', 'a dental check', 'your blood fluid', 'a spinal puncture', 'clinical studies', 'a faulty enzyme', 'fatty acids', 'your cholesterol levels', 'one study'}


In [63]:
unique_display = set()
for i in [0, 100, 500, 300, 333, 777, 1234, 321, 12, -1, 228, 555, 7, 123, 5, 22, 25, 789, 46, 1246]:
    for entity in whole_dataset[i]['Section_1']['Entity_Recognition']:
        if entity['Type'] == 'PROCEDURE_NAME':
            unique_display.add(entity['Text'])
            
    for entity in whole_dataset[i]['Section_2']['Entity_Recognition']:
        if entity['Type'] == 'PROCEDURE_NAME':
            unique_display.add(entity['Text'])
            
    for entity in whole_dataset[i]['Section_4']['Entity_Recognition']:
        if entity['Type'] == 'PROCEDURE_NAME':
            unique_display.add(entity['Text'])

print(unique_display)

{'stent in the blocked', 'injections', 'operations', 'spinal or epidural anaesthesia', 'major surgery', 'lumbar puncture', 'anaesthetics', 'injection', 'surgical intervention', 'tooth extractions', 'surgery', 'operations and anesthetics', 'bone marrow or stem cell transplant', 'operation', 'spinal surgery', 'spinal or epidural anesthesia', 'epidural or spinal anaesthetic', 'dental surgery', 'liver transplant'}
