In [326]:
# Load in libraries
import pandas as pd
import re
from fuzzywuzzy import fuzz
import numpy as np

# Units and Parameter names

In [327]:
params_json = pd.read_json('X1.json')
params_json

Unnamed: 0,Abbreviation,Synonyms
0,11DEOXY,"[11 deoxycortisol, Cortodoxin, Deoxycorticoste..."
1,17OHP,"[17 hydroxyprogesterone, 17-Hydroxyprogesteron..."
2,ACACT,"[acetoacetate, AcAc, Acetoacetic Acid, Diaceti..."
3,ALT,"[alanine transaminase, SGPT, Serum Glutamate P..."
4,ALB,"[albumin, Serum Albumin, Human Albumin, Plasma..."
...,...,...
254,PDW,"[Platelet Distribution Width, Platelet Width, ..."
255,TFT-R,"[Thyroid Function Test - Reflex, Thyroid Funct..."
256,TFT,"[Thyroid Function Test, Thyroid Function Panel..."
257,TPOAb,"[Thyroid Peroxidase Antibodies, Anti-Thyroid P..."


In [328]:
params_json['Abbreviation'].value_counts()

Abbreviation
CA2729    2
CO2       2
CRPLN     2
CEA       2
CCTN      2
         ..
GLOB      1
GLCGN     1
GLUC      1
G6PD      1
TgAb      1
Name: count, Length: 249, dtype: int64

In [329]:
# Check the duplicate abbreviations
params_json['Synonyms'][params_json['Abbreviation'] == 'CO2'][46]

['carbon dioxide',
 'CO2 Gas',
 'Carbonic Acid Gas',
 'Carbon Dioxide Molecule',
 'CO2 Gas',
 'Carbonic Acid Anhydride',
 'Dry Ice',
 'CO2',
 'Carbonic Anhydride',
 'Carbon(IV) Oxide']

In [330]:
params_json['Synonyms'][params_json['Abbreviation'] == 'CO2'][56]

['carbon dioxide',
 'CO2 Gas',
 'Carbonic Acid Gas',
 'Respired Carbon Dioxide',
 'Bicarbonate']

In [331]:
# Group the synonyms of the parameters that appear more than once.
params_json = params_json.groupby('Abbreviation')['Synonyms'].agg(lambda x: sum(x, [])).reset_index()

In [332]:
# Add the abbreviation to the synonyms row
params_json['Synonyms'] = params_json.apply(lambda row: [row['Abbreviation']] + row['Synonyms'], axis =1)

In [333]:
params_json

Unnamed: 0,Abbreviation,Synonyms
0,11DEOXY,"[11DEOXY, 11 deoxycortisol, Cortodoxin, Deoxyc..."
1,17OHP,"[17OHP, 17 hydroxyprogesterone, 17-Hydroxyprog..."
2,A/G_RATIO,"[A/G_RATIO, Albumin/Globulin Ratio, A/G Ratio,..."
3,A1GLO,"[A1GLO, alpha-1 globulin, Alpha-1 Globulin, α1..."
4,A2GLO,"[A2GLO, alpha-2 globulin, Alpha-2 Globulin, α2..."
...,...,...
244,vitB9,"[vitB9, Vitamin B9, Folate, Folic Acid, Folaci..."
245,vitC,"[vitC, Vitamin C, Ascorbic Acid, Ascorbate, L-..."
246,vitD,"[vitD, Vitamin D, Calciferol, Cholecalciferol,..."
247,vitE,"[vitE, Vitamin E, Tocopherol, Tocotrienol, Alp..."


# Analyse samples

In [334]:
# Create list of samples given
sample_list = ['0ab9800e-bc9a-4388-aaa2-d4fc05e7d111.txt',
               '0b8706dc-c9af-4c6b-887d-2f85b5a511e7.txt',
               '0c59298d-4205-4f6a-8bc9-c078123da03a.txt',
               '0c848136-de54-49eb-a3c4-b04dda11ef42.txt',
               '0ca602c0-d93a-4ae2-b91f-532db7e174ae.txt',
               '0deee5f5-f9d3-4712-8d2e-c5f7cd9895dc.txt',
               '0ea75106-2a9d-4adc-9dad-33256e7d9aad.txt',
               '0ecb52fd-138f-4566-bdef-06fabdd7019d.txt',
               '02b832e1-66cc-4f35-8b52-abf41cd821b2.txt']

In [335]:
dates = ['10/12/2018',
         '13/02/2024',
         '18/01/2024',
         'missing',
         '18/01/2024',
         '02/04/2019',
         '26/94/2019',
         '02/02/2024',
         '03/03/2022'
         ]

In [336]:
samples = pd.DataFrame()

In [337]:
# Save each sample to a different variable
raw_text_list = []
for sample in sample_list:
        with open(sample, 'r', encoding='latin1') as f:
            raw_text_list.append(f.read())

In [338]:
samples['raw_text'] = raw_text_list

In [339]:
samples['date'] = dates

In [340]:
samples

Unnamed: 0,raw_text,date
0,"Patient Name;, \nPatient Address: \n\nD.O.B: 2...",10/12/2018
1,Patient Health Summary\n\nremoved\n\nremoved\n...,13/02/2024
2,iilaverty -â- Pathology Report resuits @RCPA...,18/01/2024
3,\n\nHAEMATOLOGY\n\n \n15\nBLOOD - EDTA Result...,missing
4,ilaverty Pathology Report resus\n\npathology E...,18/01/2024
5,AUSTRALIAN LABORATORY 3427-18703 Referred: 02/...,02/04/2019
6,\n\n \n\n \n\n \n \n \n \n \n \n \n\n29 APR\...,26/94/2019
7,Peghy Â«Ao Â¢4 AG +48 | 38 â> CLINICAL LABOR...,02/02/2024
8,Patient Name ee Barcode ; [eennnennninittte\n\...,03/03/2022


# Building Function

In [374]:
sample_1_list = samples['raw_text'][1].split('\n')
sample_1_list[:50]

['Patient Health Summary',
 '',
 'removed',
 '',
 'removed',
 '',
 '',
 'D.O.B.: ',
 '',
 'Record No.:',
 '',
 'Home Phone:',
 '',
 'Work Phone:',
 '',
 'removed',
 '',
 'Printed on 27th February 2024',
 '',
 ' ',
 '',
 'Investigations:',
 '',
 'removed',
 '',
 'removed',
 '',
 'removed',
 '',
 'remoed',
 '',
 'Laboratory: dhm',
 '',
 'removed',
 '',
 'Name of Test: MSU',
 'Requested: 30/01/2024 Collected: 30/01/2024 Reported: 13/02/2024 17:17',
 '',
 'Clinical notes: 2? UTI',
 '',
 'Clinical Notes : ? UTI',
 '',
 'Urine',
 '',
 'pH 7 Protein Nil Glucose Nil',
 '',
 'Blood Nil Ketones Nil',
 '',
 'Specific gravity 1.012 (1.005 - 1.030)',
 '']

In [342]:
def contains_number(string):
    # Check if the string contains a number
    return bool(re.search(r'\d', string))

def contains_date(string):
    # Check if the string contains a date
    return bool(re.search(r'\b\d{2}/\d{2}/\d{4}\b', string))

def remove_parentheses(line):
    # Define a regular expression pattern to match parentheses and their contents
    pattern = r'\([^()]*\)|\[[^\[\]]*\]'

    # Apply the pattern to each line and remove the matched substrings
    cleaned_line = re.sub(pattern, '', line)

    return cleaned_line

In [375]:
# Filter the list to keep only strings that contain a number
filtered_list = list(map(lambda x: remove_parentheses(x), sample_1_list))

filtered_list[:50]

['Patient Health Summary',
 '',
 'removed',
 '',
 'removed',
 '',
 '',
 'D.O.B.: ',
 '',
 'Record No.:',
 '',
 'Home Phone:',
 '',
 'Work Phone:',
 '',
 'removed',
 '',
 'Printed on 27th February 2024',
 '',
 ' ',
 '',
 'Investigations:',
 '',
 'removed',
 '',
 'removed',
 '',
 'removed',
 '',
 'remoed',
 '',
 'Laboratory: dhm',
 '',
 'removed',
 '',
 'Name of Test: MSU',
 'Requested: 30/01/2024 Collected: 30/01/2024 Reported: 13/02/2024 17:17',
 '',
 'Clinical notes: 2? UTI',
 '',
 'Clinical Notes : ? UTI',
 '',
 'Urine',
 '',
 'pH 7 Protein Nil Glucose Nil',
 '',
 'Blood Nil Ketones Nil',
 '',
 'Specific gravity 1.012 ',
 '']

In [376]:
# Filter the list to keep only strings that contain a number
filtered_list = [s for s in filtered_list if contains_number(s)]

filtered_list[:50]

['Printed on 27th February 2024',
 'Requested: 30/01/2024 Collected: 30/01/2024 Reported: 13/02/2024 17:17',
 'Clinical notes: 2? UTI',
 'pH 7 Protein Nil Glucose Nil',
 'Specific gravity 1.012 ',
 'Leucocytes 5 x10*6/L ',
 'Erythrocytes H 14 x10*6/L ',
 'Epithelial cells 5 x10*6/L',
 'Comment on Lab ID 887475034',
 'NATA Accreditation No 2178',
 'Requested: 30/01/2024 Collected: 31/01/2024 Reported: 13/02/2024 17:17',
 'Rubella IgG  11 IU/mL',
 'Comment on Lab ID 838828312',
 '<5 IU/mL: Not immune',
 '5-9 IU/mL: Equivocal',
 '>/=10 IU/mL: Immune',
 'NATA Accreditation No 2178',
 'Requested: 30/01/2024 Collected: 31/01/2024 Reported: 13/02/2024 17:17',
 'Date 29/01/19 28/07/23 14/08/23 31/01/24',
 'Time F-Fast 1640 0839 F 1248 0727 F',
 'Lab ID 295348768 885515368 887902883 838828312 Units Reference',
 'Sodium 140 L 134 139 140 mmol/L ',
 'Potassium 4.9 5.5 4.8 4.5 mmol/L ',
 'Chloride 106 106 107 106 mmol/L ',
 'Bicarbonate 23 21 21 25 mmol/L ',
 'Urea 4.0 4.8 5.3 5.1 mmol/L ',
 'Crea

In [377]:
# Filter the list to keep only strings that contain a number
filtered_list = [s for s in filtered_list if not contains_date(s)]

filtered_list[:50]

['Printed on 27th February 2024',
 'Clinical notes: 2? UTI',
 'pH 7 Protein Nil Glucose Nil',
 'Specific gravity 1.012 ',
 'Leucocytes 5 x10*6/L ',
 'Erythrocytes H 14 x10*6/L ',
 'Epithelial cells 5 x10*6/L',
 'Comment on Lab ID 887475034',
 'NATA Accreditation No 2178',
 'Rubella IgG  11 IU/mL',
 'Comment on Lab ID 838828312',
 '<5 IU/mL: Not immune',
 '5-9 IU/mL: Equivocal',
 '>/=10 IU/mL: Immune',
 'NATA Accreditation No 2178',
 'Date 29/01/19 28/07/23 14/08/23 31/01/24',
 'Time F-Fast 1640 0839 F 1248 0727 F',
 'Lab ID 295348768 885515368 887902883 838828312 Units Reference',
 'Sodium 140 L 134 139 140 mmol/L ',
 'Potassium 4.9 5.5 4.8 4.5 mmol/L ',
 'Chloride 106 106 107 106 mmol/L ',
 'Bicarbonate 23 21 21 25 mmol/L ',
 'Urea 4.0 4.8 5.3 5.1 mmol/L ',
 'Creatinine 70 70 65 65 umol/L ',
 'eGFR >90 >90 >90 >90 mL/min/1.73m2 ',
 'Calcium 2.40 mmol/L ',
 'Corr Calcium 2.34 mmol/L ',
 'Magnesium. 0.84 mmol/L ',
 'Phosphate. 1.08 mmol/L ',
 'Bili.Total 11 H 20 H 18 umol/L ',
 'ALP 59 

In [348]:
def param_lister(sample_list, params_df):
    
    # make sample and synonyms lowercase
    sample_list = [s.lower() for s in sample_list]
    params_df['Synonyms'] = [[w.lower() for w in s] for s in params_df['Synonyms']]
    
    # initialise lists to be used for resulting output
    params = []
    matching_synonyms = []
    units = []
    values = []
    line_numbers = []
    
    for i, line in enumerate(sample_list):
        detected = []
        abbrev_no = []
        for j, synonyms in enumerate(params_df['Synonyms']):
            for synonym in synonyms:
                
                pattern = r'\b' + synonym + r'\b'
                # Use only strict matching for shorter length synonyms
                if len(synonym) < 5:
                    # Check if the pattern matches the line
                    if re.search(pattern, line):
                        detected.append(synonym)
                        abbrev_no.append(j)
                
                # Use fuzzy matching for longer length synonyms
                elif re.search(pattern, line) or fuzz.partial_ratio(synonym, line) >= 85:
                    detected.append(synonym)
                    abbrev_no.append(j)
       
        if detected:            
            # Select the parameter detected from the line as the abbreviation for the longest detected synonym
            param = params_df['Abbreviation'][abbrev_no[np.argmax([len(s) for s in detected])]]
            matching_synonym = max(detected, key = len)
            
            # Split line up into elements
            elements = re.split(r'\s+', line)
            
            # Find the unit by finding element that contains / or %
            # Make sure it is the last matching element
            unit = None
            for element in reversed(elements):
                if '/' in element or '%' in element:
                    unit = element
                    elements.remove(element)
                    break
            
            # Find the unit by finding element that only contains numbers
            # Make sure it is the last matching element
            value = None
            for element in reversed(elements):
                if any(char.isdigit() for char in element):
                    value = element
                    break
            
            params.append(param)
            matching_synonyms.append(matching_synonym)
            units.append(unit)
            values.append(value)
            line_numbers.append(i)
            text = [sample_list[n] for n in line_numbers]
            df = pd.DataFrame({'param': params, 'unit': units,'value': values, 'line_number': line_numbers,'matching_synonym' : matching_synonyms, 'text': text})
            df.dropna(inplace=True)
            df.drop_duplicates(subset=['param'], keep='first', inplace=True)
            df.reset_index(drop=True, inplace=True)
    return df

In [349]:
%%time
test = param_lister(filtered_list, params_json)
test

CPU times: total: 1min 22s
Wall time: 1min 22s


Unnamed: 0,param,unit,value,line_number,matching_synonym,text
0,RBC,x10*6/l,14,5,erythrocytes,erythrocytes h 14 x10*6/l
1,GGLO,iu/ml,11,9,igg,rubella igg 11 iu/ml
2,Na+,mmol/l,140,18,sodium,sodium 140 l 134 139 140 mmol/l
3,K+,mmol/l,4.5,19,potassium,potassium 4.9 5.5 4.8 4.5 mmol/l
4,Cl-,mmol/l,106,20,chloride,chloride 106 106 107 106 mmol/l
5,CO2,mmol/l,25,21,bicarbonate,bicarbonate 23 21 21 25 mmol/l
6,UREA,mmol/l,5.1,22,urea,urea 4.0 4.8 5.3 5.1 mmol/l
7,CREA,umol/l,65,23,creatinine,creatinine 70 70 65 65 umol/l
8,EGFR,ml/min/1.73m2,>90,24,egfr,egfr >90 >90 >90 >90 ml/min/1.73m2
9,Ca2+,mmol/l,2.40,25,calcium,calcium 2.40 mmol/l


issues spotted

- Row  1: No "Rubella" in the parameter json so "Rubella IgG" has just been identified as "IgG"
- Row 20: fasting glucose test reading picked up from likely recommendation
- Row 23: "vldl cholesterol" from "hdl cholesterol"
- Row 39: "galt deficiency" from "moderate deficiency"
- Row 40: "phe level" from "winter - the level may need to be 10 - 20 nmol..."

# Putting all together to test

In [360]:
# Create raw_text cleaner function
def sample_cleaner(raw_sample):
    
    # Split raw sample into list of its lines
    sample_list = raw_sample.split('\n')
    
    # Remove parenthesis and contents
    sample_list = [re.sub(r'[\(\[][^\)\]]*[\)\]]', '', line) for line in sample_list]
    
    # Remove dates from lines
    sample_list = [re.sub(r'\b\d{2}/\d{2}/\d{2,4}\b', '', line) for line in sample_list]
    
    # Only keep lines with numbers in them
    sample_list = [line for line in sample_list if re.search(r'\d', line)]
    
    
    return sample_list

In [361]:
# apply cleaner function to all samples
samples['cleaned_text'] = samples['raw_text'].apply(sample_cleaner)
samples

Unnamed: 0,raw_text,date,cleaned_text,params_df
0,"Patient Name;, \nPatient Address: \n\nD.O.B: 2...",10/12/2018,"[TSH miIU/L 0.73 0.82 0.64 0.46, FT4 pmol/L ...",param unit value line_numbe...
1,Patient Health Summary\n\nremoved\n\nremoved\n...,13/02/2024,"[Printed on 27th February 2024, Requested: Co...",param unit value line_number ...
2,iilaverty -â- Pathology Report resuits @RCPA...,18/01/2024,"[Reported 05:13 PM, INSULIN-LIKE GROWTH FACTO...",param unit value line_number matching...
3,\n\nHAEMATOLOGY\n\n \n15\nBLOOD - EDTA Result...,missing,"[15, ESR 2 0-20 mm/h @, HAEMOGLOBIN 122 110-15...",param unit value ...
4,ilaverty Pathology Report resus\n\npathology E...,18/01/2024,"[2019, DOB Age 50 years Sex, Reported 18/01/20...",param unit value line_number matching...
5,AUSTRALIAN LABORATORY 3427-18703 Referred: 02/...,02/04/2019,"[AUSTRALIAN LABORATORY 3427-18703 Referred: , ...",param unit value line_number ...
6,\n\n \n\n \n\n \n \n \n \n \n \n \n\n29 APR\...,26/94/2019,"[29 APR, 64+ Yrs DOB: Sex: Male, Time: 15:40 ...",param unit value line_number ...
7,Peghy Â«Ao Â¢4 AG +48 | 38 â> CLINICAL LABOR...,02/02/2024,[Peghy Â«Ao Â¢4 AG +48 | 38 â> CLINICAL LABO...,param unit value line_number...
8,Patient Name ee Barcode ; [eennnennninittte\n\...,03/03/2022,[Age: Gender : 64/Female Sample Collected On :...,param unit value line_number matching_s...


In [362]:
%%time
# Apply created param function to all samples
samples['params_df'] = samples['cleaned_text'].apply(lambda x: param_lister(x, params_json))

CPU times: total: 3min 54s
Wall time: 3min 54s


In [363]:
# View full dataframe
samples

Unnamed: 0,raw_text,date,cleaned_text,params_df
0,"Patient Name;, \nPatient Address: \n\nD.O.B: 2...",10/12/2018,"[TSH miIU/L 0.73 0.82 0.64 0.46, FT4 pmol/L ...",param unit value line_numbe...
1,Patient Health Summary\n\nremoved\n\nremoved\n...,13/02/2024,"[Printed on 27th February 2024, Requested: Co...",param unit value line_number ...
2,iilaverty -â- Pathology Report resuits @RCPA...,18/01/2024,"[Reported 05:13 PM, INSULIN-LIKE GROWTH FACTO...",param unit value line_number matching...
3,\n\nHAEMATOLOGY\n\n \n15\nBLOOD - EDTA Result...,missing,"[15, ESR 2 0-20 mm/h @, HAEMOGLOBIN 122 110-15...",param unit value ...
4,ilaverty Pathology Report resus\n\npathology E...,18/01/2024,"[2019, DOB Age 50 years Sex, Reported 18/01/20...",param unit value line_number matching...
5,AUSTRALIAN LABORATORY 3427-18703 Referred: 02/...,02/04/2019,"[AUSTRALIAN LABORATORY 3427-18703 Referred: , ...",param unit value line_number ...
6,\n\n \n\n \n\n \n \n \n \n \n \n \n\n29 APR\...,26/94/2019,"[29 APR, 64+ Yrs DOB: Sex: Male, Time: 15:40 ...",param unit value line_number m...
7,Peghy Â«Ao Â¢4 AG +48 | 38 â> CLINICAL LABOR...,02/02/2024,[Peghy Â«Ao Â¢4 AG +48 | 38 â> CLINICAL LABO...,param unit value line_number...
8,Patient Name ee Barcode ; [eennnennninittte\n\...,03/03/2022,[Age: Gender : 64/Female Sample Collected On :...,param unit value line_number matching_s...


In [364]:
# Check through how each sample performed
samples['params_df'][6]

Unnamed: 0,param,unit,value,line_number,matching_synonym,text
0,Na+,mmol/l,142.0,3,sodium,sodium 142 mmol/l
1,K+,mmol/l,4.0,4,potassium,potassium 4.0 mmol/l
2,Cl-,mmol/l,106.0,5,chloride,chloride 106 mmol/l
3,CO2,mmol/l,28.0,6,bicarbonate,bicarbonate 28 mmol/l
4,UREA,mmol/l,4.1,7,urea,urea 4.1 mmol/l
5,CREA,umol/l,89.0,8,creatinine,creatinine 89 umol/l
6,EGFR,ml/mn/1.73m2,79.0,9,egfr,egfr 79 ml/mn/1.73m2
7,URIC,mmol/l,0.44,10,uric acid,uric acid 0.44 mmol/l
8,Ca2+,mmol/l,2.4,11,calcium,calcium 2.40 mmol/l
9,P5+,mmol/l,1.12,14,inorganic phosphate,inorganic phosphate 1.12 mmol/l


# Final model

In [365]:
# Create final function that returns the disered output of list of dictionaries
def parameter_dictionary(raw_text, params_df):
    """
    Clean raw medical results
    Extracts parameters, values, unit into a list of dictionarys
    
    Parameters:
        raw_text(string): The raw medical data
        params_df: Dataframe of parameter abbreviations and synonyms from json file
        
    Returns:
        list: List of dictionaries ("parameter", "value", "unit")
    
    """
    
    # Convert all string values in params df to lowercase, including strings within lists
    params_df = pd.DataFrame({col: [item.lower() if isinstance(item, str) else [subitem.lower() for subitem in item]
                                    for item in params_json[col]] for col in params_json.columns})
    # Convert raw text to lower case
    lower_text = raw_text.lower()
    
    # Format the params df
    # Group the synonyms of the parameters that appear more than once.
    params_df = params_df.groupby('Abbreviation')['Synonyms'].agg(lambda x: sum(x, [])).reset_index()
    # Add the abbreviation to the synonyms row
    params_df['Synonyms'] = params_df.apply(lambda row: [row['Abbreviation']] + row['Synonyms'], axis =1)
    
    # Clean raw text
    # Split raw sample into list of its lines
    sample_list = lower_text.split('\n')
    # Remove parenthesis and contents
    sample_list = [re.sub(r'\([^()]*\)|\[[^\[\]]*\]', '', line) for line in sample_list]
    # Remove dates from lines
    sample_list = [re.sub(r'\b\d{2}/\d{2}/\d{2,4}\b', '', line) for line in sample_list]
    # Only keep lines with numbers in them
    sample_list = [line for line in sample_list if re.search(r'\d', line)] 
    
    # initialise lists to be used for resulting output
    output_list = []
    seen_params = set()
    
    for line in sample_list:
        # Initialise variables to store detected parameters in line
        detected = []
        abbrev_no = []
        #iterate through each synonym in the parameter df to find any matching in the line
        for j, synonyms in enumerate(params_df['Synonyms']):
            for synonym in synonyms:
                
                # Use regex pattern to maken sure matching synonym is a complete word for strict matching
                pattern = r'\b' + synonym + r'\b'
                # Use only strict matching for shorter length synonyms
                if len(synonym) < 5:
                    # Check if the pattern matches the line
                    if re.search(pattern, line):
                        detected.append(synonym)
                        abbrev_no.append(j)
                
                # Use fuzzy matching as well for longer length synonyms to account for slight recording errors
                elif re.search(pattern, line) or fuzz.partial_ratio(synonym, line) >= 85:
                    detected.append(synonym)
                    abbrev_no.append(j)
       
        # look for values if a synonym has ben detected
        if detected:            
            # Select the parameter detected from the line as the abbreviation for the longest detected synonym
            param = params_df['Abbreviation'][abbrev_no[np.argmax([len(s) for s in detected])]]
            matching_synonym = max(detected, key = len)
            
            # Split line up on spaces into elements
            elements = re.split(r'\s+', line)
            
            # Find the unit by finding element that contains / or %
            # Make sure it is the last matching element
            unit = None
            for element in reversed(elements):
                if '/' in element or '%' in element:
                    unit = element
                    elements.remove(element)
                    break
            
            # Find the unit by finding element that only contains numbers
            # Make sure it is the last matching element
            value = None
            for element in reversed(elements):
                if any(char.isdigit() for char in element):
                    value = element
                    break
            
            # Only add recording dictionary if all keys have values
            if all([param, value, unit]):
                # Only add the recording if the parameter hasn't been recorded already
                if param not in seen_params:
                    output_list.append({
                        "parameter": param,
                        "value": value,
                        "unit": unit
                    })
                    seen_params.add(param)
    
    return output_list

In [366]:
# Load raw files
with open('0b8706dc-c9af-4c6b-887d-2f85b5a511e7.txt', 'r', encoding='latin1') as f:
    raw_text = f.read()
    
params_json = pd.read_json('X1.json')

In [367]:
%%time
# Use function on files and analyse results
results = parameter_dictionary(raw_text, params_json)
results

CPU times: total: 1min 25s
Wall time: 1min 25s


[{'parameter': 'rbc', 'value': '14', 'unit': 'x10*6/l'},
 {'parameter': 'gglo', 'value': '11', 'unit': 'iu/ml'},
 {'parameter': 'na+', 'value': '140', 'unit': 'mmol/l'},
 {'parameter': 'k+', 'value': '4.5', 'unit': 'mmol/l'},
 {'parameter': 'cl-', 'value': '106', 'unit': 'mmol/l'},
 {'parameter': 'co2', 'value': '25', 'unit': 'mmol/l'},
 {'parameter': 'urea', 'value': '5.1', 'unit': 'mmol/l'},
 {'parameter': 'crea', 'value': '65', 'unit': 'umol/l'},
 {'parameter': 'egfr', 'value': '>90', 'unit': 'ml/min/1.73m2'},
 {'parameter': 'ca2+', 'value': '2.40', 'unit': 'mmol/l'},
 {'parameter': 'p5+', 'value': '1.08', 'unit': 'mmol/l'},
 {'parameter': 'biliin', 'value': '18', 'unit': 'umol/l'},
 {'parameter': 'alp', 'value': '70', 'unit': 'u/l'},
 {'parameter': 'ggt', 'value': '15', 'unit': 'u/l'},
 {'parameter': 'ast', 'value': '23', 'unit': 'u/l'},
 {'parameter': 'alt', 'value': '22', 'unit': 'u/l'},
 {'parameter': 'prot', 'value': '72', 'unit': 'g/l'},
 {'parameter': 'alb', 'value': '46', 'u

In [369]:
# Check code works the same as earlier testing function
print(len(results))
print(len(test))

test

43
43


Unnamed: 0,param,unit,value,line_number,matching_synonym,text
0,RBC,x10*6/l,14,5,erythrocytes,erythrocytes h 14 x10*6/l
1,GGLO,iu/ml,11,9,igg,rubella igg 11 iu/ml
2,Na+,mmol/l,140,18,sodium,sodium 140 l 134 139 140 mmol/l
3,K+,mmol/l,4.5,19,potassium,potassium 4.9 5.5 4.8 4.5 mmol/l
4,Cl-,mmol/l,106,20,chloride,chloride 106 106 107 106 mmol/l
5,CO2,mmol/l,25,21,bicarbonate,bicarbonate 23 21 21 25 mmol/l
6,UREA,mmol/l,5.1,22,urea,urea 4.0 4.8 5.3 5.1 mmol/l
7,CREA,umol/l,65,23,creatinine,creatinine 70 70 65 65 umol/l
8,EGFR,ml/min/1.73m2,>90,24,egfr,egfr >90 >90 >90 >90 ml/min/1.73m2
9,Ca2+,mmol/l,2.40,25,calcium,calcium 2.40 mmol/l


- Both C02 and HCO3 have the synonym bicarbonate in X1 json (likely shouldn't be a sysnonym for CO2)