# Medical Record Extractor

## This function reads raw medical files and extracts the recordings as a list of dictionaries

In [3]:
# imports
import pandas as pd
import re
from fuzzywuzzy import fuzz
import numpy as np

In [1]:
# Function definition
def parameter_dictionary(raw_text, params_df, fuzzy_threshold=85, fuzzy_min_length=5):
    """
    Clean raw medical results and extract the recordings they contain.

    Each line of the text is searched for a known parameter (by abbreviation
    or synonym). When a parameter is found, the last token containing ``/`` or
    ``%`` is taken as the unit and the last remaining token containing a digit
    is taken as the value.

    Parameters:
        raw_text (str): The raw medical data, one recording per line.
        params_df (pandas.DataFrame): Parameter table with an 'Abbreviation'
            column (str) and a 'Synonyms' column (list of str). Rows sharing
            an abbreviation have their synonyms merged. Rows whose
            abbreviation is not a string are ignored; a missing synonyms
            entry is treated as an empty list.
        fuzzy_threshold (int): Minimum ``fuzz.partial_ratio`` score for a
            fuzzy match to count. Defaults to 85.
        fuzzy_min_length (int): Synonyms shorter than this are matched
            strictly (whole word only); longer ones may also match fuzzily to
            tolerate slight recording errors. Defaults to 5.

    Returns:
        list: List of dictionaries with keys "parameter", "value", "unit"
            (all lower-cased strings), in order of first appearance. Only the
            first recording of each parameter is kept.

    Notes:
        * Synonyms are matched as literal text, not as regular expressions,
          so abbreviations such as ``na+`` or ``cl-`` only match those exact
          characters. Add the bare form (e.g. ``na``) as a synonym if it
          should match too.
        * Fuzzy matching relies on ``fuzz`` (fuzzywuzzy) being imported at
          module level.
    """

    # --- Build the synonym catalogue from the argument (not a notebook global) ---
    # Everything is lower-cased so matching is case-insensitive.
    synonyms_by_abbreviation = {}
    for abbreviation, synonyms in zip(params_df['Abbreviation'], params_df['Synonyms']):
        if not isinstance(abbreviation, str):
            continue
        if isinstance(synonyms, str):
            synonyms = [synonyms]
        elif not isinstance(synonyms, (list, tuple)):
            # e.g. NaN produced by a missing JSON field
            synonyms = []
        merged = synonyms_by_abbreviation.setdefault(abbreviation.lower(), [])
        merged.extend(s.lower() for s in synonyms if isinstance(s, str))

    # --- Pre-compile one matcher per synonym (loop-invariant work) ---
    # The abbreviation itself counts as a synonym. Abbreviations are visited
    # in sorted order so ties between equally long synonyms resolve
    # deterministically.
    matchers = []
    for abbreviation in sorted(synonyms_by_abbreviation):
        for synonym in [abbreviation] + synonyms_by_abbreviation[abbreviation]:
            if not synonym:
                # An empty synonym would match every line.
                continue
            # re.escape keeps characters such as '+', '-', '(' literal.
            # Lookarounds are used instead of \b because \b cannot anchor a
            # synonym that starts or ends with a non-word character.
            pattern = re.compile(r'(?<!\w)' + re.escape(synonym) + r'(?!\w)')
            matchers.append((abbreviation, synonym, pattern, len(synonym) >= fuzzy_min_length))
    # Longest synonym first (stable sort), so the first hit on a line is the
    # longest detected synonym and the search can stop there.
    matchers.sort(key=lambda matcher: -len(matcher[1]))

    # --- Clean raw text ---
    # Split the lower-cased sample into its lines
    sample_list = raw_text.lower().split('\n')
    # Remove parentheses / square brackets and their contents
    sample_list = [re.sub(r'\([^()]*\)|\[[^\[\]]*\]', '', line) for line in sample_list]
    # Remove dates from lines
    sample_list = [re.sub(r'\b\d{2}/\d{2}/\d{2,4}\b', '', line) for line in sample_list]
    # Only keep lines with numbers in them
    sample_list = [line for line in sample_list if re.search(r'\d', line)]

    output_list = []
    seen_params = set()

    for line in sample_list:
        # The parameter is the abbreviation of the longest synonym detected in the line
        param = None
        for abbreviation, synonym, pattern, allow_fuzzy in matchers:
            if pattern.search(line) or (
                allow_fuzzy and fuzz.partial_ratio(synonym, line) >= fuzzy_threshold
            ):
                param = abbreviation
                break

        # Skip lines without a parameter, and parameters already recorded
        if not param or param in seen_params:
            continue

        # Split the line on whitespace into elements
        elements = line.split()

        # The unit is the last element that contains '/' or '%'
        unit_index = None
        for index in range(len(elements) - 1, -1, -1):
            if '/' in elements[index] or '%' in elements[index]:
                unit_index = index
                break
        if unit_index is None:
            continue
        # Pop by index so exactly the matched token is removed, even when the
        # same unit text also appears earlier in the line.
        unit = elements.pop(unit_index)

        # The value is the last remaining element that contains a digit
        value = next(
            (element for element in reversed(elements)
             if any(char.isdigit() for char in element)),
            None,
        )
        if value is None:
            continue

        output_list.append({
            "parameter": param,
            "value": value,
            "unit": unit
        })
        seen_params.add(param)

    return output_list

In [2]:
%%time
# Usage example: extract the recordings from a single raw report

# Read the raw report text (the file is latin-1 encoded)
with open('0b8706dc-c9af-4c6b-887d-2f85b5a511e7.txt', encoding='latin1') as report_file:
    raw_text = report_file.read()

# Load the parameter abbreviations and synonyms
params_json = pd.read_json('X1.json')

# Extract the recordings; the bare last expression displays them as the cell output
results = parameter_dictionary(raw_text, params_json)
results

CPU times: total: 1min 27s
Wall time: 1min 27s


[{'parameter': 'rbc', 'value': '14', 'unit': 'x10*6/l'},
 {'parameter': 'gglo', 'value': '11', 'unit': 'iu/ml'},
 {'parameter': 'na+', 'value': '140', 'unit': 'mmol/l'},
 {'parameter': 'k+', 'value': '4.5', 'unit': 'mmol/l'},
 {'parameter': 'cl-', 'value': '106', 'unit': 'mmol/l'},
 {'parameter': 'co2', 'value': '25', 'unit': 'mmol/l'},
 {'parameter': 'urea', 'value': '5.1', 'unit': 'mmol/l'},
 {'parameter': 'crea', 'value': '65', 'unit': 'umol/l'},
 {'parameter': 'egfr', 'value': '>90', 'unit': 'ml/min/1.73m2'},
 {'parameter': 'ca2+', 'value': '2.40', 'unit': 'mmol/l'},
 {'parameter': 'p5+', 'value': '1.08', 'unit': 'mmol/l'},
 {'parameter': 'biliin', 'value': '18', 'unit': 'umol/l'},
 {'parameter': 'alp', 'value': '70', 'unit': 'u/l'},
 {'parameter': 'ggt', 'value': '15', 'unit': 'u/l'},
 {'parameter': 'ast', 'value': '23', 'unit': 'u/l'},
 {'parameter': 'alt', 'value': '22', 'unit': 'u/l'},
 {'parameter': 'prot', 'value': '72', 'unit': 'g/l'},
 {'parameter': 'alb', 'value': '46', 'u