In [1]:
# So we can use the *thesislib* package
import sys
import os

# Resolve the parent of the current working directory; per the comment above,
# this is what makes the thesislib package importable from this notebook.
module_path = os.path.abspath("..")

# Guarded so that re-running this cell does not append the same entry twice.
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from thesislib.utils import stringutils
from thesislib.utils import pathutils

In [3]:
from openpyxl import load_workbook

In [4]:
# Location of the source workbook, resolved through the project's pathutils helper
# (presumably relative to the project's data directory — verify in pathutils).
ai_med_file = pathutils.get_data_file("05_27_nlice/AI DataMed1.xlsx")

In [5]:
# Load the whole workbook; the individual sheets are looked up by name on ai_wb.
# NOTE(review): load_workbook is called with its defaults, so formula cells (if
# the workbook has any) would yield the formula text rather than a computed
# value — confirm the sheets only hold literal values.
ai_wb = load_workbook(ai_med_file)

In [6]:
# Worksheet named 'Gastrointestinal conditions'.
gastor_ws = ai_wb['Gastrointestinal conditions']

In [7]:
# Materialise rows 1-43, columns 2-11 of the sheet as a list of cell tuples.
# NOTE(review): the bounds are hard-coded for the current workbook — confirm
# they still cover the data if the spreadsheet is edited.
gastor_it = list(gastor_ws.iter_rows(min_row=1, min_col=2, max_row=43, max_col=11))

In [8]:
# Each condition should have the following:
#   - prevalence by age
#   - prevalence by gender
#   - overall incidence
#   - a list of symptoms with their symptom probability
# For each symptom, information about NLICE (nature, localisation, intensity,
# chronology and excitation).
gastor_conditions = {}

In [9]:
def row_is_empty(row):
    """Return True when no cell in ``row`` carries a meaningful value.

    A cell counts as empty when its ``value`` is ``None`` or a string that is
    blank once surrounding whitespace is stripped. Any other value (a number,
    a date, a boolean, non-blank text) makes the row non-empty.

    :param row: iterable of cell-like objects exposing a ``value`` attribute
    :return: True if every cell is empty (or the row has no cells), else False
    """
    for cell in row:
        val = cell.value
        if val is None:
            continue
        # Only strings can be stripped; numeric cell values have no .strip()
        # and are by definition real content.
        if not isinstance(val, str) or val.strip():
            return False
    return True

In [10]:
def extract_condition(iterator, column_map):
    """Group worksheet rows into per-condition records.

    Rows are consumed in order. A row with a value in the condition column
    starts a new record; that row is then finished, so prevalence or symptom
    values sitting on the same row are not read. The rows that follow attach
    prevalence figures and symptoms to the most recently started record.

    :param iterator: iterable of rows; each row is an indexable sequence of
        cell-like objects exposing a ``value`` attribute
    :param column_map: dict giving the position within a row for each of the
        keys 'condition', 'prevalence', 'prevalence_val', 'symptom',
        'symptom_prob', 'nature', 'localisation', 'intensity', 'duration'
        and 'excitation'
    :return: dict keyed by a running integer (0, 1, ...) in order of
        appearance. Each value holds 'condition_name', 'age_prevalence',
        'gender_prevalence', 'overal_incidence' (spelling kept because
        clean_condition reads this key) and 'symptoms', a dict keyed by
        symptom slug.
    :raises ValueError: if prevalence or symptom data appears before the
        first condition row
    """
    def _as_text(cell_value):
        # Cell values can be numbers as well as strings; normalise to text
        # before stripping so numeric cells do not raise AttributeError.
        return str(cell_value).strip()

    def _text_or_na(cell_value):
        return _as_text(cell_value) if cell_value is not None else "n/a"

    orphan_msg = "Found prevalence/symptom data before any condition name"

    conditions = {}
    current_index = None

    _cnd_col = column_map['condition']
    _prevalence_col = column_map['prevalence']
    _prevalence_val_col = column_map['prevalence_val']
    _symptom_col = column_map['symptom']
    _prob_col = column_map['symptom_prob']
    _nature_col = column_map['nature']
    _loc_col = column_map['localisation']
    _intensity_col = column_map['intensity']
    _duration_col = column_map['duration']
    _excitation_col = column_map['excitation']

    for row in iterator:
        if row_is_empty(row):
            continue

        condition_value = row[_cnd_col].value
        if condition_value is not None:
            # Records are numbered 0, 1, 2, ... in order of appearance.
            current_index = len(conditions)
            conditions[current_index] = {
                "condition_name": _as_text(condition_value),
                "age_prevalence": None,
                "gender_prevalence": None,
                "overal_incidence": None,
                "symptoms": {}
            }
            continue

        prevalence_label = row[_prevalence_col].value
        if prevalence_label is not None:
            prevalence_label = _as_text(prevalence_label).lower()
            prevalence_value = row[_prevalence_val_col].value
            # NOTE(review): these are substring checks, so a label such as
            # "percentage" would also match "age" — confirm the sheet labels.
            if "age" in prevalence_label:
                target_key = "age_prevalence"
            elif "gender" in prevalence_label:
                target_key = "gender_prevalence"
            elif "overall" in prevalence_label:
                target_key = "overal_incidence"
            else:
                target_key = None
            if target_key is not None:
                if current_index is None:
                    raise ValueError(orphan_msg)
                conditions[current_index][target_key] = prevalence_value

        symptom_label = row[_symptom_col].value
        if symptom_label is not None:
            if current_index is None:
                raise ValueError(orphan_msg)
            symptom_slug = stringutils.slugify(_as_text(symptom_label).lower())
            conditions[current_index]["symptoms"][symptom_slug] = {
                "slug": symptom_slug,
                # A missing probability cell is stored as the text "None".
                "probability": _as_text(row[_prob_col].value),
                "n": _text_or_na(row[_nature_col].value),
                "l": _text_or_na(row[_loc_col].value),
                "i": _text_or_na(row[_intensity_col].value),
                "c": _text_or_na(row[_duration_col].value),
                "e": _text_or_na(row[_excitation_col].value)
            }
    return conditions

In [11]:
# Positions of each field within a row tuple, as consumed by extract_condition.
# NOTE(review): 'intensity' (6) and 'localisation' (7) are swapped here compared
# with the maps used for the other three sheets — confirm this matches the
# column order of the gastrointestinal sheet and is not a copy/paste slip.
col_map = {
    'condition': 0,
    'prevalence': 1,
    'prevalence_val': 2,
    'symptom': 3,
    'symptom_prob': 4,
    'nature': 5,
    'intensity': 6,
    'localisation': 7,
    'duration': 8,
    'excitation': 9
}
gastor_conditions = extract_condition(gastor_it, col_map)

In [12]:
# Positions of each field within a row tuple for the pulmonary sheet.
col_map = {
    'condition': 0,
    'prevalence': 1,
    'prevalence_val': 2,
    'symptom': 3,
    'symptom_prob': 4,
    'nature': 5,
    'localisation': 6,
    'intensity': 7,
    'duration': 8,
    'excitation': 9
}
pulmonary_ws = ai_wb['Pulmonary infections']
# Rows 1-42, columns 2-11; the bounds are hard-coded for the current workbook.
pulmonary_it = list(pulmonary_ws.iter_rows(min_row=1, min_col=2, max_row=42, max_col=11))
pulmonary_conditions = extract_condition(pulmonary_it, col_map)

In [13]:
# Positions of each field within a row tuple for the neurological sheet.
col_map = {
    'condition': 0,
    'prevalence': 1,
    'prevalence_val': 2,
    'symptom': 3,
    'symptom_prob': 4,
    'nature': 5,
    'localisation': 6,
    'intensity': 7,
    'duration': 8,
    'excitation': 9
}
neurological_ws = ai_wb['Neurological conditions']
# Rows 1-49, columns 2-11; the bounds are hard-coded for the current workbook.
neurological_it = list(neurological_ws.iter_rows(min_row=1, min_col=2, max_row=49, max_col=11))
neurological_conditions = extract_condition(neurological_it, col_map)

In [14]:
# Positions of each field within a row tuple for the orthopedic sheet.
col_map = {
    'condition': 0,
    'prevalence': 1,
    'prevalence_val': 2,
    'symptom': 3,
    'symptom_prob': 4,
    'nature': 5,
    'localisation': 6,
    'intensity': 7,
    'duration': 8,
    'excitation': 9
}
# The sheet name contains two consecutive spaces; it must match the workbook exactly.
orthopedic_ws = ai_wb['Orthopedic  Neurological']
# Rows 1-47, columns 2-11; the bounds are hard-coded for the current workbook.
orthopedic_it = list(orthopedic_ws.iter_rows(min_row=1, min_col=2, max_row=47, max_col=11))
orthopedic_conditions = extract_condition(orthopedic_it, col_map)

The next step is to clean up the data extracted from the sheets and convert it into a usable format.

In [15]:
import re

In [16]:
def clean_up_prob(range1, range0):
    """Combine the two captured pieces of a number into a float.

    :param range1: the digits that must be present for a value to exist; the
        fractional digits when ``range0`` is given, otherwise the whole number
    :param range0: optional leading part, i.e. the integer digits followed by
        a decimal separator (for example ``"85,"``); may be empty or None
    :return: the parsed float, or None when ``range1`` is empty or None
    """
    digits = range1.strip() if range1 else range1
    leading = range0.strip() if range0 else range0

    # Without the mandatory digits there is no number at all.
    if not digits:
        return None

    if leading:
        # Drop the trailing separator and rebuild the number with a dot.
        return float("%s.%s" % (leading.strip(',.'), digits))

    return float(digits)

In [17]:
def extract_percentages(value):
    """Parse a percentage, or a percentage range, into a single number.

    Whitespace is removed first. The text must then start with a number such
    as ``42%``, ``8.6%`` or ``85,7``, optionally followed by ``-`` and a
    second number (``10-20%``). For a range, the larger bound is returned.
    Anything after the matched prefix is ignored.

    :param value: percentage text, or the literal ``"n/a"``
    :return: 0 for ``"n/a"``, otherwise the parsed value as a float
    :raises ValueError: if the text does not start with a number
    """
    if value == "n/a":
        return 0

    # Raw string literals: "\-" and "\s" are not valid Python string escapes,
    # and newer interpreters warn about them in ordinary string literals.
    # NOTE(review): inside a character class "|" is a literal, so "[.|,]" also
    # accepts a pipe as separator — probably unintended.
    pct_pattern = re.compile(r"([0-9]*[.|,])?([0-9]+)%?\-?([0-9]*[.|,])?([0-9]+)?%?")
    value = re.sub(r"\s+", "", value)
    match = pct_pattern.match(value)
    if match is None:
        raise ValueError("Could not extract percentage for %s" % value)

    # Groups 1/3 hold "<integer digits><separator>" when a decimal separator is
    # present; groups 2/4 hold the remaining digits (the whole number if not).
    low_leading, low_digits, high_leading, high_digits = match.groups()

    prob_min = clean_up_prob(low_digits, low_leading)
    prob_max = clean_up_prob(high_digits, high_leading)

    if prob_max is None:
        return prob_min

    return max(prob_min, prob_max)
    

In [18]:
def extract_nature(nature_str):
    """Parse ``<name> (<pct>%)`` entries into a ``{slug: percentage}`` dict.

    Every occurrence of a run of word/whitespace characters followed by a
    parenthesised percentage is collected; the name is slugified and used as
    the key.

    :param nature_str: text such as ``"sharp (60%), dull (40%)"``
    :return: dict mapping each slugified name to its percentage as a float
    :raises ValueError: if no ``name (pct%)`` entry is found
    """
    # Raw string literal: "\w", "\s", "\(" and "\)" are not valid Python string
    # escapes and trigger warnings in ordinary string literals.
    nature_regex = r"([\w\s]+)\(([0-9]*[.|,])?([0-9]+)%\)"
    parts = re.findall(nature_regex, nature_str)
    if not parts:
        raise ValueError("Could not extract nature for: %s" % nature_str)

    natures = {}
    # findall yields one (name, leading part, digits) tuple per entry, with ""
    # standing in for an optional group that did not take part in the match.
    for raw_name, pct_leading, pct_digits in parts:
        name = stringutils.slugify(raw_name.strip())
        nature_pct = clean_up_prob(pct_digits, pct_leading)
        if nature_pct is not None:
            natures[name] = nature_pct

    return natures

In [19]:
def extract_localisation(location_str):
    """Parse a localisation description into a ``{slug: percentage}`` dict.

    Without any ``%`` sign the text is treated as a single location and is
    assigned 100. Otherwise every ``<name>(<pct>%)`` entry is collected after
    whitespace, ``:`` and ``;`` have been removed from the text.

    :param location_str: text such as ``"abdomen"`` or
        ``"left (30%), right (70%)"``
    :return: dict mapping each slugified location name to its percentage;
        empty when the text contains ``%`` but no entry could be parsed
    :raises ValueError: if a single-location text does not start with a word
        or whitespace character
    """
    if "%" not in location_str:
        # we have only one location, assume 100%
        single_match = re.compile(r"([\w\s]+)").match(location_str)
        if not single_match:
            raise ValueError("Could not parse location for %s" % location_str)
        location = stringutils.slugify(single_match.group(1).strip())
        return {
            location: 100
        }

    # Raw string literal: the backslash sequences here are regex syntax, not
    # valid Python string escapes.
    loc_regex = r"([\w\s]+)\(([0-9]*[.|,])?([0-9]+)%?\-?([0-9]*[.|,])?([0-9]+)?%?\)"
    # One pass removes whitespace, ":" and ";" (same result as three passes).
    # NOTE(review): because whitespace is removed before matching, multi-word
    # names are glued together here ("lower abdomen" -> "lowerabdomen"), unlike
    # the single-location branch above — confirm this is intended.
    location_str = re.sub(r"[\s:;]+", "", location_str)
    parts = re.findall(loc_regex, location_str)

    locations = {}
    # NOTE(review): for a range such as "(10-20%)" only the first number is
    # used; the second bound captured by the pattern is ignored.
    for raw_name, pct_leading, pct_digits, _upper_leading, _upper_digits in parts:
        name = stringutils.slugify(raw_name.strip())
        location_pct = clean_up_prob(pct_digits, pct_leading)
        if location_pct is not None:
            locations[name] = location_pct
    return locations

In [20]:
def extract_gender(gender_str):
    """Parse a gender-prevalence description into ``{"male": pct, "female": pct}``.

    Both orderings are understood, with ``.`` or ``,`` as decimal separator
    and an optional plural ``s``:

    * percentage first: ``"35% male, 65% female"``, ``"(16% male, 84% female)"``
    * gender first: ``"male (42%), female (42%)"``, ``"males: 8.6%, females 17.5%"``

    :param gender_str: the description to parse, or None when nothing was recorded
    :return: None when ``gender_str`` is None; otherwise a dict with the keys
        ``"male"`` and/or ``"female"`` mapped to float percentages
    :raises ValueError: if no gender/percentage pair can be found in the text
    """
    if gender_str is None:
        return None

    pattern = re.compile(
        # "<pct>% <gender>"
        r"(?P<pct_first>\d+(?:[.,]\d+)?)\s*%\s*(?P<gender_after>females?|males?)\b"
        r"|"
        # "<gender>[:] [(]<pct>%"
        r"\b(?P<gender_first>females?|males?)\b\s*:?\s*\(?\s*(?P<pct_after>\d+(?:[.,]\d+)?)\s*%",
        re.IGNORECASE,
    )

    genders = {}
    for match in pattern.finditer(gender_str):
        label = match.group("gender_after") or match.group("gender_first")
        pct = match.group("pct_first") or match.group("pct_after")
        key = "female" if label.lower().startswith("female") else "male"
        # Accept a decimal comma ("85,7") as well as a decimal point.
        genders[key] = float(pct.replace(",", "."))

    if not genders:
        raise ValueError("Could not extract gender for: %s" % gender_str)
    return genders

In [21]:
def clean_condition(condition):
    """Convert one raw record from extract_condition into its cleaned form.

    The condition name is cut down to its leading run of letters, digits,
    hyphens and whitespace and then slugified. Each symptom gets a numeric
    probability and an ``nlice`` dict holding whichever of 'nature',
    'location' and 'duration' were provided (i.e. not ``"n/a"``).

    :param condition: dict with 'condition_name', 'age_prevalence',
        'gender_prevalence', 'overal_incidence' and 'symptoms'
    :return: dict with 'condition_name', 'condition_slug', 'age', 'gender',
        'incidence' and 'symptoms' (keyed by symptom slug)
    :raises ValueError: if the condition name, a probability or a nature
        entry cannot be parsed
    """
    # Raw string literal: "\-" and "\s" are regex syntax, not valid Python
    # string escapes.
    condition_regex = re.compile(r"([A-Za-z0-9\-\s]+)")
    condition_name = condition['condition_name']
    match = condition_regex.match(condition_name)
    if match is None:
        raise ValueError("Could not parse condition name for %s" % condition_name)
    condition_name = stringutils.slugify(match.group(1).strip())

    condition_symptoms = {}
    for _symptom in condition.get("symptoms").values():
        # Parsed first so that a bad probability is the first error reported.
        probability = extract_percentages(_symptom.get("probability"))

        nlice = {}
        nature_str = _symptom.get("n").strip().lower()
        if nature_str != "n/a":
            nlice["nature"] = extract_nature(nature_str)

        location_str = _symptom.get("l").strip().lower()
        if location_str != "n/a":
            nlice["location"] = extract_localisation(location_str)

        chronology_str = _symptom.get("c").strip().lower()
        if chronology_str != "n/a":
            nlice['duration'] = chronology_str

        # NOTE(review): intensity ("i") and excitation ("e") are collected by
        # extract_condition but are not carried over here.
        condition_symptoms[_symptom.get('slug')] = {
            "slug": _symptom.get("slug"),
            "probability": probability,
            "nlice": nlice
        }

    # NOTE(review): 'condition_name' and 'condition_slug' both hold the slug;
    # the human-readable name is not kept — confirm this is intended.
    return {
        "condition_name": condition_name,
        "condition_slug": condition_name,
        "age": condition.get("age_prevalence"),
        "gender": condition.get("gender_prevalence"),
        "incidence": condition.get("overal_incidence"),
        "symptoms": condition_symptoms
    }
        

In [22]:
# The raw per-sheet extraction results, in the order they will be cleaned.
_all = [gastor_conditions, pulmonary_conditions, neurological_conditions, orthopedic_conditions]

In [23]:
# Accumulator for the cleaned conditions, keyed by the cleaned condition name.
_conditions = {}

In [24]:
# Clean every raw condition from every sheet. A condition that fails to parse
# is reported and left out; conditions sharing a cleaned name overwrite each
# other in _conditions.
for item in _all:
    for value in item.values():
        try:
            cnd = clean_condition(value)
        except ValueError as e:
            cnd_name = value['condition_name']
            print("Error %s on %s" % (e, cnd_name))
        else:
            cnd_name = cnd.get("condition_name")
            _conditions[cnd_name] = cnd

In [25]:
import json
# Write the cleaned conditions as indented JSON; the target path uses the same
# "05_27_nlice" folder prefix as the source workbook.
ai_med_2 = pathutils.get_data_file("05_27_nlice/ai_med_2.json")
with open(ai_med_2, "w") as fp:
    json.dump(_conditions, fp, indent=4)

In [26]:
# Sample gender-prevalence strings in the different formats a gender parser
# has to handle (presumably taken from the workbook — verify).
sample1 = "(16% male, 84% female)"
sample2 = "male (42%), female (42%)"
sample3 = "male (85,7%), female (14,3%)"
sample4 = "males: 8.6%, females 17.5%"
sample5 = "35% male, 65% female"

In [27]:
# For reference, the percentage-range pattern used for localisation entries:
# "([\w\s]+)\(([0-9]*[.|,])?([0-9]+)%?\-?([0-9]*[.|,])?([0-9]+)?%?\)"
# Draft pattern for gender strings: a "male" part followed by a "female" part,
# each with optional percentages before and after the word.
gender_regex = "([0-9]*[.|,])?([0-9]+)?%?\s+?(male[s]?)([0-9]*[.|,])?([0-9]+)?%?[:,]?([0-9]*[.|,])?([0-9]+)?%?\s+?(female[s]?)([0-9]*[.|,])?([0-9]+)?%?[:,]?"

In [28]:
# Try the draft gender pattern against the first sample and show what it captured.
gender_regex = re.compile(gender_regex)
m = gender_regex.search(sample1)
print(m.groups() if m else "m is none")

m is none
