In [27]:
## FUNCTIONS ##
"""
Returns the current UTC time as a string formatted hh-mm-ss
"""
from time import gmtime, strftime
def getTime():
    """Return the current UTC time as an ``HH-MM-SS`` string."""
    utc_now = gmtime()
    return strftime("%H-%M-%S", utc_now)

"""
Splits the input string on the delimiters ", ", "; " and ". "
Returns the list of resulting substrings
"""
import re
def split(string):
    """Split a diagnosis string into its individual statements.

    The separators are a comma, semicolon or period, each followed by a
    single space (", ", "; ", ". "); the separators themselves are dropped.

    Args:
        string: The text to split.

    Returns:
        A list of the pieces between separators. A string without any
        separator (including the empty string) yields a one-element list.
    """
    # Raw string: "\." in an ordinary literal is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.split(r', |; |\. ', string)

"""
Adds a new column to the sheet, immediately after its current last column
Labels the first row of the new column with column_header
"""
def add_column(sheet, column_header):
    """Append a new column to ``sheet`` and write its header.

    The header is written to row 1 of the column immediately after the
    sheet's current ``max_column``.

    Args:
        sheet: Worksheet-like object exposing ``max_column`` and
            ``cell(row=..., column=...)``.
        column_header: Value to store in the first row of the new column.
    """
    # Take the width from the sheet that was passed in (not from a
    # module-level worksheet) so the function works for any sheet.
    next_col = sheet.max_column + 1
    sheet.cell(row=1, column=next_col).value = column_header

"""
Returns True if statement contains any of the words in keywords
Otherwise returns False
"""
def contains(statement, keywords):
    """Report whether ``statement`` includes at least one of ``keywords``.

    Args:
        statement: The text (or other container) to search in.
        keywords: Iterable of items to look for with the ``in`` operator.

    Returns:
        True as soon as one keyword is found in ``statement``; False when
        none is found or ``keywords`` is empty.
    """
    return any(keyword in statement for keyword in keywords)

"""
Shortens diagnostic_type
admission => addx
discharge => dcdx
"""
def get_shortened_diagnostic_type(diagnostic_type):
    """Map a diagnostic type to its short column prefix.

    Args:
        diagnostic_type: "admission" or "discharge".

    Returns:
        "addx" for "admission", "dcdx" for "discharge". Any other value
        prints a warning and yields an empty string.
    """
    abbreviations = (("admission", "addx"), ("discharge", "dcdx"))
    for long_name, short_name in abbreviations:
        if diagnostic_type == long_name:
            return short_name

    # Unknown type: warn but let the caller carry on with an empty prefix.
    print("WARNING: BAD DIAGNOSTIC TYPE: {}".format(diagnostic_type))
    return ""

"""
For each illness found in the diagnosis, label the corresponding column with a 1
"""
def one_hot_encode_diagnoses(sheet, diagnostic_type, ColNames):
    """Flag every illness mentioned in each row's diagnosis text with a 1.

    For each data row (row 2 onwards) whose ``<diagnostic_type>_diagnosis``
    cell holds a string, the text is lower-cased and split into statements.
    Statements containing any ``IGNORE`` term are skipped. For every illness
    in ``KEYWORDS_DICT`` whose keywords occur in a statement, the row's
    ``<short type>_<illness>`` cell is set to 1. Cells are never reset to 0
    here, except for the ASD cell when FASD matches (see below).

    Args:
        sheet: Worksheet-like object providing ``iter_rows`` and ``max_row``.
        diagnostic_type: "admission" or "discharge".
        ColNames: Mapping from column header to the position of that
            column's cell within a row.
    """
    short_type = get_shortened_diagnostic_type(diagnostic_type)
    diagnosis_header = '{}_diagnosis'.format(diagnostic_type)
    asd_header = '{}_ASD'.format(short_type)

    for row_cells in sheet.iter_rows(min_row=2, max_row=sheet.max_row):
        raw_diagnosis = row_cells[ColNames[diagnosis_header]].value
        # Empty or non-text cells carry no diagnosis to encode.
        if not isinstance(raw_diagnosis, str):
            continue

        for statement in split(raw_diagnosis.lower()):
            if contains(statement, IGNORE):
                continue

            matched = [
                illness for illness in KEYWORDS_DICT
                if contains(statement, KEYWORDS_DICT[illness])
            ]
            for illness in matched:
                row_cells[ColNames['{}_{}'.format(short_type, illness)]].value = 1

            # A FASD match clears the ASD flag (presumably because ASD
            # keywords can also match inside FASD wording -- confirm against
            # KEYWORDS_DICT). This runs after all flags for the statement
            # are set so the outcome does not depend on the iteration order
            # of KEYWORDS_DICT.
            if "FASD" in matched:
                asd_cell = row_cells[ColNames[asd_header]]
                if asd_cell.value == 1:
                    asd_cell.value = 0
                                
"""
Find the illness matched by the first statement of the diagnosis, then record it as the main diagnosis
"""                           
def set_main_diagnosis(sheet, diagnostic_type, ColNames):
    """Record the illness matched by the first statement of each diagnosis.

    For each data row (row 2 onwards) whose ``<diagnostic_type>_diagnosis``
    cell holds a string, the text is lower-cased and split; only the first
    statement is examined. The row's ``main_<short type>`` cell receives the
    name of the matching illness from ``KEYWORDS_DICT``. When several
    illnesses match, the last one in ``KEYWORDS_DICT`` iteration order is
    kept. When none match, or the statement contains an ``IGNORE`` term,
    the cell is set to an empty string.

    Args:
        sheet: Worksheet-like object providing ``iter_rows`` and ``max_row``.
        diagnostic_type: "admission" or "discharge".
        ColNames: Mapping from column header to the position of that
            column's cell within a row; must include ``record_id``.
    """
    short_type = get_shortened_diagnostic_type(diagnostic_type)
    diagnosis_header = '{}_diagnosis'.format(diagnostic_type)
    main_header = "main_{}".format(short_type)

    for row_cells in sheet.iter_rows(min_row=2, max_row=sheet.max_row):
        raw_diagnosis = row_cells[ColNames[diagnosis_header]].value
        # Empty or non-text cells carry no diagnosis to label.
        if not isinstance(raw_diagnosis, str):
            continue

        diagnoses = split(raw_diagnosis.lower())
        # Defensive guard: a regex split normally yields at least one piece.
        if not diagnoses:
            print("Length of 0 diagnosis found for {}".format(row_cells[ColNames['record_id']].value))
            continue

        main_diagnosis = diagnoses[0]
        main_illness = ""
        # The IGNORE test does not depend on the illness, so run it once
        # instead of once per KEYWORDS_DICT entry.
        if not contains(main_diagnosis, IGNORE):
            for illness in KEYWORDS_DICT:
                if contains(main_diagnosis, KEYWORDS_DICT[illness]):
                    main_illness = illness

        row_cells[ColNames[main_header]].value = main_illness
                
"""
Set every per-illness binary column (<shortened type>_<illness>) to 0 in all data rows
"""
def set_binaries_to_zero(sheet, diagnostic_type, ColNames):
    """Reset every per-illness flag cell to 0 on all data rows.

    The affected columns are ``<short type>_<illness>`` for each entry of
    ``ILLNESSES``; rows are processed from row 2 to ``sheet.max_row``.

    Args:
        sheet: Worksheet-like object providing ``iter_rows`` and ``max_row``.
        diagnostic_type: "admission" or "discharge".
        ColNames: Mapping from column header to the position of that
            column's cell within a row.
    """
    prefix = get_shortened_diagnostic_type(diagnostic_type)
    flag_headers = ["{}_{}".format(prefix, name) for name in ILLNESSES]

    data_rows = sheet.iter_rows(min_row=2, max_row=sheet.max_row)
    for cells in data_rows:
        for header in flag_headers:
            cells[ColNames[header]].value = 0
            
"""
Append the main-diagnosis header and one header per illness after the last existing column
"""
def set_headers(sheet, diagnostic_type):
    """Add the diagnosis output column headers to ``sheet``.

    Adds, via ``add_column``, a ``main_<short type>`` header followed by one
    ``<short type>_<illness>`` header for each entry of ``ILLNESSES``.

    Args:
        sheet: Worksheet to receive the new header cells.
        diagnostic_type: "admission" or "discharge".
    """
    short_type = get_shortened_diagnostic_type(diagnostic_type)

    add_column(sheet, "main_{}".format(short_type))
    for illness in ILLNESSES:
        # Target the sheet passed in, the same one that received the
        # main_<short type> header, rather than a module-level worksheet.
        add_column(sheet, "{}_{}".format(short_type, illness))