# emr_analysis

Code for predicting treatment periods with a rule-based algorithm and treatment effects with a double-LSTM model.

In [1]:
import datetime
import sys

import dateutil
import pandas as pd
from tabulate import tabulate

from emr_analysis.config import load_config
from emr_analysis.data import Data, Effect
from emr_analysis.effect.detector import EffectDetector
from emr_analysis.evaluator import Evaluator, OutOfEvaluationError
from emr_analysis.treatment_period import TreatmentPeriod

Load the EHR data, the structured data, and the word embedding model.

In [2]:
# Load the project configuration; the path is relative to the working directory.
config = load_config("resources/config.yml")

# Create the data container with preprocessing enabled, then load its contents.
# Per the cell output, load_data() reports loading EMR data, structured data,
# and a word embedding model.
data = Data(data_dir="resources/data/", preprocess=True)
data.load_data()

Loading EMR data...
Loaded EMR data
Loading structured data...
Loaded structured data
Loading word embedding model...
Loaded word embedding model


## Treatment periods

Rule-based prediction of treatment periods

In [3]:
# Objects shared by the cells below: `treatment_period` supplies the predicted
# periods per patient, and `evaluator` looks up the corresponding answer record.
treatment_period = TreatmentPeriod(data, config)
evaluator = Evaluator(data, config)

def treatment_period_results():
    """Yield one comparison row per predicted treatment period.

    Iterates over every patient in ``data.patient_ids`` and over each period
    returned by ``treatment_period.treatment_periods``.

    Yields:
        tuple: ``(patient_id, predicted start date, predicted end date,
        predicted medication name, answer start date, answer end date,
        answer treatment detail)``. The three answer fields are ``None`` when
        the evaluator returns no matching record or raises
        ``OutOfEvaluationError``.
    """
    for pid in data.patient_ids:
        predicted = treatment_period.treatment_periods(pid)
        spans = zip(predicted["DateTime"],
                    predicted["LastDateTime"],
                    predicted["MedicationName"])

        for start, end, drug in spans:
            # Default answer columns; replaced only when a record is found.
            answer = (None, None, None)
            try:
                matches = evaluator.corresponding_record_by_period(
                    pid, start, end, medication=drug)
                if not matches.empty:
                    # Only the first matching record is reported.
                    first = matches.iloc[0]
                    answer = (first["治療開始日"].date(),
                              first["治療終了日"].date(),
                              first["治療詳細"])
            except OutOfEvaluationError:
                # Period is outside the evaluated range: keep the empty answer.
                pass

            yield (pid, start.date(), end.date(), drug) + answer

def format_date(s, fmt="%Y-%m-%d"):
    """Format a date-like value as a string for display.

    Args:
        s: Value to format. May be a null value (``None``, ``NaN``, ``NaT``),
            a date string, a ``datetime.date``/``datetime.datetime`` (or
            subclass such as ``pandas.Timestamp``), or anything else.
        fmt: ``strftime`` format applied to recognised dates.

    Returns:
        ``""`` for null values, the formatted string for dates and parseable
        date strings, and ``s`` unchanged for unparseable strings and for any
        other type.
    """
    # Import the submodule explicitly: a bare `import dateutil` does not
    # guarantee that `dateutil.parser` is available as an attribute.
    import dateutil.parser

    if pd.isnull(s):
        return ""

    if isinstance(s, str):
        try:
            parsed = dateutil.parser.parse(s)
        except (ValueError, OverflowError):
            # ParserError subclasses ValueError; OverflowError is raised for
            # out-of-range values. Either way, show the raw text unchanged.
            return s
        return parsed.strftime(fmt)

    # datetime.datetime is a subclass of datetime.date, so one check covers both.
    if isinstance(s, datetime.date):
        return s.strftime(fmt)

    return s

# Build the display rows for the treatment-period comparison table.
# Columns 1, 2, 4 and 5 hold dates (predicted start/end, answer start/end).
table = []
patient_id = None
for record in treatment_period_results():
    row = list(record)
    for date_col in (1, 2, 4, 5):
        row[date_col] = format_date(row[date_col])

    # Show the patient ID only on the first row of each patient's group.
    if row[0] != patient_id:
        patient_id = row[0]
    else:
        row[0] = ""
    table.append(row)

# Last expression of the cell, so the HTML table is rendered as its output.
tabulate(
    table,
    headers=[
        "Patient ID", "Start date (pred)", "End date (pred)",
        "Medication (pred)", "Start date (ans)", "End date (ans)",
        "Medication (ans)",
    ],
    tablefmt="html",
)

Patient ID,Start date (pred),End date (pred),Medication (pred),Start date (ans),End date (ans),Medication (ans)
1.0,2015-12-04,2016-01-05,Carboplatin,2015-11-24,2016-01-06,CBDCA
,2016-08-19,2018-04-29,Erlotinib,2016-08-22,2018-05-03,Erlotinib
2.0,2011-01-12,2011-04-12,CDDP+VNR,2011-01-12,2011-04-19,CDDP+VNR
,2012-06-14,2014-08-15,Gefitinib,2012-06-15,2014-08-14,Gefitinib
,2014-08-14,2014-11-06,PEM,2014-08-14,2014-12-06,PEM


## Tumor response evaluation

Prediction of the tumor response evaluation and the PD (progressive disease) date using machine learning.

In [4]:
# Path of the saved model file handed to EffectDetector (relative to the
# working directory).
effect_detector_model = "resources/model/model.pt"
effect_detector = EffectDetector(data, config, effect_detector_model)

def effect_results():
    """Yield predicted vs. answer effect rows for each treatment period.

    For every patient in ``data.patient_ids`` and every period returned by
    ``treatment_period.treatment_periods``, the effect detector is run over
    the window from the period start to one day after the next period's start
    (or, for the last period, one day after its end plus 365 days).

    Periods are skipped when no effect is predicted, when the evaluator raises
    ``OutOfEvaluationError``, or when it returns no matching answer record.

    Yields:
        tuple: ``(patient_id, start date, end date, answer treatment detail,
        predicted effect, answer effect, predicted PD date, answer PD date,
        number of reports)``.
    """
    one_day = datetime.timedelta(days=1)

    for patient_id in data.patient_ids:
        periods = treatment_period.treatment_periods(patient_id)
        n_periods = len(periods)

        for i in range(n_periods):
            period = periods.iloc[i]
            start_dt = period["DateTime"]
            end_dt = period["LastDateTime"]
            if i < n_periods - 1:
                next_start_dt = periods.iloc[i + 1]["DateTime"]
            else:
                next_start_dt = end_dt + datetime.timedelta(days=365)  # for demo

            estimated_effect, estimated_pd_date, nreports = \
                effect_detector.detect_effect(patient_id, start_dt, next_start_dt + one_day)

            # Nothing to compare when no effect was predicted, so skip the
            # answer lookup entirely.
            if estimated_effect is None:
                continue

            try:
                cperiod_df = \
                    evaluator.corresponding_record_by_period(patient_id, start_dt, end_dt,
                                                             medication=period["MedicationName"])
            except OutOfEvaluationError:
                # Same handling as treatment_period_results(): a period outside
                # the evaluated range has no answer, so it is not reported.
                continue

            if cperiod_df.empty:
                continue

            if estimated_pd_date:
                estimated_pd_date = estimated_pd_date.date()

            # Only the first matching answer record is reported.
            ans = cperiod_df.iloc[0]
            yield (patient_id, start_dt.date(), end_dt.date(), ans["治療詳細"],
                   estimated_effect, ans["効果"],
                   estimated_pd_date, ans["PD確定日"],
                   nreports)

# Build the display rows for the effect comparison table.
table = []
patient_id = None
interval = None
for result in effect_results():
    tr = list(result)
    tr[1] = format_date(tr[1])
    tr[2] = format_date(tr[2])
    # Effects are displayed by their `.name`. The answer effect (column 5)
    # comes from the records and may be missing, so it gets the same null
    # guard as the predicted effect (column 4) instead of raising.
    for effect_col in (4, 5):
        if not pd.isnull(tr[effect_col]):
            tr[effect_col] = tr[effect_col].name
    tr[6] = format_date(tr[6])
    tr[7] = format_date(tr[7])

    if tr[0] == patient_id:
        # Same patient as the previous row: hide the repeated ID, and hide the
        # dates too when the period is the same as the previous row's.
        tr[0] = ""
        if [tr[1], tr[2]] == interval:
            tr[1] = tr[2] = ""
        else:
            interval = [tr[1], tr[2]]
    else:
        patient_id = tr[0]
        interval = [tr[1], tr[2]]
    table.append(tr)

# Last expression of the cell, so the HTML table is rendered as its output.
tabulate(table,
         headers=["Patient ID", "Start date", "End date", "Medication",
                  "Effect (pred)", "Effect (ans)", "PD date (pred)", "PD date (ans)", "#reports"],
         tablefmt="html")

2024-07-31 09:25:15 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-ja/resolve/v1.4.1/models/tokenize/gsd.pt:   0%|         …

2024-07-31 09:25:16 INFO: Loading these models for language: ja (Japanese):
| Processor | Package |
-----------------------
| tokenize  | gsd     |

2024-07-31 09:25:16 INFO: Use device: cpu
2024-07-31 09:25:16 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-07-31 09:25:17 INFO: Done loading processors!


Patient ID,Start date,End date,Medication,Effect (pred),Effect (ans),PD date (pred),PD date (ans),#reports
1.0,2015-12-04,2016-01-05,CBDCA,PR,PR,,2016-01-11,1
,2016-08-19,2018-04-29,Erlotinib,PR,PR,2018-08-25,2018-08-25,7
2.0,2011-01-12,2011-04-12,CDDP+VNR,NE,NE,2012-05-17,2012-06-07,4
,2012-06-14,2014-08-15,Gefitinib,NE,NE,2014-03-13,2014-03-13,5
,2014-08-14,2014-11-06,PEM,PR,SD,,,1
