## Imports

In [2]:
import math
import numpy as np
import pandas as pd

#from lifelines import KaplanMeierFitter
#from lifelines import CoxPHFitter
#from lifelines.statistics import logrank_test

from datetime import datetime, timedelta
import re

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Show up to 50 columns and 100 rows when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size and tick-label font size for every plot in this notebook.
plt.rcParams['figure.figsize'] = [10, 5]
plt.rc('xtick', labelsize=10) 
plt.rc('ytick', labelsize=10)

## Input

In [68]:
# Absolute path of the project's Python code folder; used as a prefix when reading input CSVs.
python_code_path = 'P:/ORD_Singh_202001030D/Moore Measures 2&3 Code/Python Code/'

### MM2

In [69]:
# Load the merged MM2 CRC table.
mm2_crc_df = pd.read_csv(python_code_path + 'Data/mm2_crc_merged.csv', engine='python')
# The ID header arrives with a stray 'ï»¿' prefix (looks like a mis-decoded
# byte-order mark): copy it into a clean 'StudyID' column, appended last, and
# drop the badly named original.
mm2_crc_df = (
    mm2_crc_df
    .assign(StudyID=mm2_crc_df['ï»¿StudyID'])
    .drop(columns='ï»¿StudyID')
)

In [70]:
# Load the merged MM2 LCa table. The path is built from python_code_path, like
# the MM2 CRC read, so the cell does not depend on the kernel's working directory.
mm2_lca_df = pd.read_csv(python_code_path + 'Data/mm2_lca_merged.csv', engine='python')
# The ID header arrives with a stray 'ï»¿' prefix (looks like a mis-decoded
# byte-order mark): copy it into a clean 'StudyID' column and drop the original.
mm2_lca_df['StudyID'] = mm2_lca_df['ï»¿StudyID']
mm2_lca_df = mm2_lca_df.drop('ï»¿StudyID', axis=1)

### MM3

In [71]:
# Load the merged MM3 CRC table. The path is built from python_code_path, like
# the MM2 CRC read, so the cell does not depend on the kernel's working directory.
mm3_crc_df = pd.read_csv(python_code_path + 'Data/mm3_crc_merged.csv', engine='python')
# The ID header arrives with a stray 'ï»¿' prefix (looks like a mis-decoded
# byte-order mark): copy it into a clean 'StudyID' column and drop the original.
mm3_crc_df['StudyID'] = mm3_crc_df['ï»¿StudyID']
mm3_crc_df = mm3_crc_df.drop('ï»¿StudyID', axis=1)

In [72]:
# Load the merged MM3 LCa table. The path is built from python_code_path, like
# the MM2 CRC read, so the cell does not depend on the kernel's working directory.
mm3_lca_df = pd.read_csv(python_code_path + 'Data/mm3_lca_merged.csv', engine='python')
# The ID header arrives with a stray 'ï»¿' prefix (looks like a mis-decoded
# byte-order mark): copy it into a clean 'StudyID' column and drop the original.
mm3_lca_df['StudyID'] = mm3_lca_df['ï»¿StudyID']
mm3_lca_df = mm3_lca_df.drop('ï»¿StudyID', axis=1)

## Data Prep

### MM2

In [73]:
# A "true" EP needs all three: the reviewer marked it unplanned, it really is a
# first-time cancer dx, and a cancer signal was recorded.
def is_true_EP(row):
    """Return 1 if the row is a validated EP, otherwise 0.

    The row qualifies only when row['UnplanVal'] and row['DxVal'] both equal
    'Yes' and row['RelSig'] is not null. Checks run in that order and stop at
    the first failure.
    """
    if not row['UnplanVal'] == 'Yes':
        return 0
    if not row['DxVal'] == 'Yes':
        return 0
    return 0 if pd.isnull(row['RelSig']) else 1

In [74]:
def response_num_to_text(row):
    """Translate the numeric code in row['RelRespNumerical'] into its text label.

    Codes 1 through 6 map to the labels listed below. Any other value
    (including a missing one) matches nothing and the function returns None;
    an explicit 'ERROR' fallback is intentionally not used.
    """
    code_labels = (
        (1, 'No signal requiring f/u'),
        (2, 'Signal not recognized'),
        (3, 'Signal recognized, f/u not initiated'),
        (4, 'Signal recognized, OP f/u'),
        (5, 'Signal recognized, IP f/u'),
        (6, 'Other'),
    )
    for code, label in code_labels:
        if row['RelRespNumerical'] == code:
            return label
    return None

In [75]:
def typeep_num_to_text(row):
    """Translate the numeric code in row['TypeOfEP'] into its text label.

    Codes 1 through 5 map to the labels listed below. Any other value
    (including a missing one) matches nothing and the function returns None;
    an explicit 'ERROR' fallback is intentionally not used.
    """
    code_labels = (
        (1, 'Dx within emergency event'),
        (2, 'Cancer-related emergency -> Related subsequent encounter'),
        (3, 'Non-cancer-related emergency -> Related subsequent encounter'),
        (4, 'Non-cancer-related emergency -> Unrelated subsequent encounter'),
        (5, 'Other'),
    )
    for code, label in code_labels:
        if row['TypeOfEP'] == code:
            return label
    return None

In [76]:
def parse_datetime1(row, col):
    """Parse row[col] as a date written in '%m/%d/%Y' form (e.g. '03/15/2019').

    Parameters:
        row: mapping-like record (a DataFrame row or a dict).
        col: key of the field holding the date string.

    Returns:
        A datetime on success, or None when the value is not a string
        (for example a missing value) or does not match the format.

    Raises:
        KeyError: if `col` is not present in `row`. The lookup is done outside
        the try block so that a misspelled column name is reported instead of
        silently producing a column of None.
    """
    value = row[col]
    try:
        return datetime.strptime(value, '%m/%d/%Y')
    except (TypeError, ValueError):
        # TypeError: value is not a string; ValueError: wrong format.
        return None

In [77]:
def parse_datetime2(row, col):
    """Parse str(row[col]) as a date written in '%Y-%m-%d' form (e.g. '2019-03-15').

    Parameters:
        row: mapping-like record (a DataFrame row or a dict).
        col: key of the field holding the date value.

    Returns:
        A datetime on success, or None when the stringified value does not
        match the format (a missing value stringifies to text such as 'nan').

    Raises:
        KeyError: if `col` is not present in `row`. The lookup is done outside
        the try block so that a misspelled column name is reported instead of
        silently producing a column of None.
    """
    value = row[col]
    try:
        return datetime.strptime(str(value), '%Y-%m-%d')
    except (TypeError, ValueError):
        return None

In [78]:
def is_timedelta_within_threshold(row, col_later, col_earlier, threshold):
    """Tell whether row[col_later] is at most `threshold` days after row[col_earlier].

    Parameters:
        row: mapping-like record (a DataFrame row or a dict).
        col_later, col_earlier: keys of the two date fields.
        threshold: maximum allowed difference, in whole days.

    Returns:
        1 if the difference in days is <= threshold, 0 if it is larger, and
        -1 when the difference cannot be computed (for example one value is
        None, or the two values do not support subtraction).

    Raises:
        KeyError: if either key is absent from `row`. The lookups are done
        outside the try block so that a misspelled column name is reported
        rather than turned into -1.
    """
    later = row[col_later]
    earlier = row[col_earlier]
    try:
        delta = (later - earlier).days
        # NOTE(review): pandas NaT arithmetic appears not to raise here
        # (NaT - Timestamp is NaT and its .days is NaN), so a NaT input would
        # fail the comparison and give 0 rather than -1 -- confirm that is intended.
        return 1 if delta <= threshold else 0
    except (TypeError, AttributeError):
        # TypeError: operands cannot be subtracted or compared;
        # AttributeError: the difference has no .days attribute.
        return -1

In [79]:
def parse_yesno_1(row, col):
    """Encode row[col] as 1 when it equals 'Yes'; any other value, missing included, becomes 0."""
    return 1 if row[col] == 'Yes' else 0

In [80]:
def parse_01_1(row, col):
    """Encode row[col] as 1 when it equals 1; any other value, missing included, becomes 0."""
    return 1 if row[col] == 1 else 0

In [81]:
def check_not_null(row, col):
    """Return 1 when row[col] holds a value and 0 when pandas considers it null."""
    return 0 if pd.isnull(row[col]) else 1

In [82]:
# make columns for each individual signal for CRC
def _crc_sig_flag(row, needle, excluded=()):
    """Shared matcher behind the get_crc_sig_NN functions.

    str(row['RelSig']) is split on ';'. The result is 1 if some piece contains
    `needle` and contains none of the `excluded` substrings, otherwise 0.
    A value with no ';' is one piece, so single codes and lists take the same
    path. A missing value stringifies to text without digits and gives 0.

    `excluded` keeps a one-digit code from matching inside a two-digit code
    (e.g. '1' inside '12').
    """
    for piece in str(row['RelSig']).split(';'):
        if needle in piece and not any(other in piece for other in excluded):
            return 1
    return 0


def get_crc_sig_01(row):
    """1 if RelSig lists code 1 (pieces holding 10-14 do not count), else 0."""
    return _crc_sig_flag(row, '1', excluded=('10', '11', '12', '13', '14'))


def get_crc_sig_02(row):
    """1 if RelSig lists code 2 (a piece holding 12 does not count), else 0."""
    return _crc_sig_flag(row, '2', excluded=('12',))


def get_crc_sig_03(row):
    """1 if RelSig lists code 3 (a piece holding 13 does not count), else 0."""
    return _crc_sig_flag(row, '3', excluded=('13',))


def get_crc_sig_04(row):
    """1 if RelSig lists code 4 (a piece holding 14 does not count), else 0."""
    return _crc_sig_flag(row, '4', excluded=('14',))


def get_crc_sig_05(row):
    """1 if the text of RelSig contains '5', else 0."""
    return _crc_sig_flag(row, '5')


def get_crc_sig_06(row):
    """1 if the text of RelSig contains '6', else 0."""
    return _crc_sig_flag(row, '6')


def get_crc_sig_07(row):
    """1 if the text of RelSig contains '7', else 0."""
    return _crc_sig_flag(row, '7')


def get_crc_sig_08(row):
    """1 if the text of RelSig contains '8', else 0."""
    return _crc_sig_flag(row, '8')


def get_crc_sig_09(row):
    """1 if the text of RelSig contains '9', else 0."""
    return _crc_sig_flag(row, '9')


def get_crc_sig_10(row):
    """1 if the text of RelSig contains '10', else 0."""
    return _crc_sig_flag(row, '10')


def get_crc_sig_11(row):
    """1 if the text of RelSig contains '11', else 0."""
    return _crc_sig_flag(row, '11')


def get_crc_sig_12(row):
    """1 if the text of RelSig contains '12', else 0."""
    return _crc_sig_flag(row, '12')


def get_crc_sig_13(row):
    """1 if the text of RelSig contains '13', else 0."""
    return _crc_sig_flag(row, '13')


def get_crc_sig_14(row):
    """1 if the text of RelSig contains '14', else 0."""
    return _crc_sig_flag(row, '14')

In [83]:
# make columns for each individual signal for LCa
def _lca_sig_flag(row, needle, excluded=()):
    """Shared matcher behind the get_lca_sig_NN functions.

    str(row['RelSig']) is split on ';'. The result is 1 if some piece contains
    `needle` and contains none of the `excluded` substrings, otherwise 0.
    A value with no ';' is one piece, so single codes and lists take the same
    path. A missing value stringifies to text without digits and gives 0.
    """
    for piece in str(row['RelSig']).split(';'):
        if needle in piece and not any(other in piece for other in excluded):
            return 1
    return 0


def get_lca_sig_01(row):
    """1 if RelSig lists code 1 (pieces holding 10-14 do not count), else 0."""
    return _lca_sig_flag(row, '1', excluded=('10', '11', '12', '13', '14'))


def get_lca_sig_02(row):
    """1 if the text of RelSig contains '2', else 0."""
    return _lca_sig_flag(row, '2')


def get_lca_sig_03(row):
    """1 if the text of RelSig contains '3', else 0."""
    return _lca_sig_flag(row, '3')


def get_lca_sig_04(row):
    """1 if the text of RelSig contains '4', else 0."""
    return _lca_sig_flag(row, '4')


def get_lca_sig_05(row):
    """1 if the text of RelSig contains '5', else 0."""
    return _lca_sig_flag(row, '5')


def get_lca_sig_06(row):
    """1 if the text of RelSig contains '6', else 0."""
    return _lca_sig_flag(row, '6')


def get_lca_sig_07(row):
    """1 if the text of RelSig contains '7', else 0."""
    return _lca_sig_flag(row, '7')


def get_lca_sig_08(row):
    """1 if the text of RelSig contains '8', else 0."""
    return _lca_sig_flag(row, '8')


def get_lca_sig_09(row):
    """1 if the text of RelSig contains '9', else 0."""
    return _lca_sig_flag(row, '9')


def get_lca_sig_10(row):
    """1 if the text of RelSig contains '10', else 0."""
    return _lca_sig_flag(row, '10')

In [84]:
# Flag validated EP rows (1/0) in both MM2 cohorts.
mm2_crc_df['TrueEP'] = mm2_crc_df.apply(is_true_EP, axis=1)
mm2_lca_df['TrueEP'] = mm2_lca_df.apply(is_true_EP, axis=1)

In [85]:
# Text label for each RelRespNumerical code, then cast to an ordered categorical
# so the labels keep the code order. Unmapped codes end up as missing.
mm2_crc_df['ResponseText'] = mm2_crc_df.apply(response_num_to_text, axis=1)
mm2_crc_df['ResponseText'] = pd.Categorical(
    mm2_crc_df['ResponseText'],
    categories=[
        'No signal requiring f/u',
        'Signal not recognized',
        'Signal recognized, f/u not initiated',
        'Signal recognized, OP f/u',
        'Signal recognized, IP f/u',
        'Other',
    ],
    ordered=True,
)
mm2_lca_df['ResponseText'] = mm2_lca_df.apply(response_num_to_text, axis=1)
mm2_lca_df['ResponseText'] = pd.Categorical(
    mm2_lca_df['ResponseText'],
    categories=[
        'No signal requiring f/u',
        'Signal not recognized',
        'Signal recognized, f/u not initiated',
        'Signal recognized, OP f/u',
        'Signal recognized, IP f/u',
        'Other',
    ],
    ordered=True,
)

In [86]:
# Text label for each TypeOfEP code, then cast to an ordered categorical so the
# labels keep the code order. Unmapped codes end up as missing.
mm2_crc_df['TypeEPText'] = mm2_crc_df.apply(typeep_num_to_text, axis=1)
mm2_crc_df['TypeEPText'] = pd.Categorical(
    mm2_crc_df['TypeEPText'],
    categories=[
        'Dx within emergency event',
        'Cancer-related emergency -> Related subsequent encounter',
        'Non-cancer-related emergency -> Related subsequent encounter',
        'Non-cancer-related emergency -> Unrelated subsequent encounter',
        'Other',
    ],
    ordered=True,
)
mm2_lca_df['TypeEPText'] = mm2_lca_df.apply(typeep_num_to_text, axis=1)
mm2_lca_df['TypeEPText'] = pd.Categorical(
    mm2_lca_df['TypeEPText'],
    categories=[
        'Dx within emergency event',
        'Cancer-related emergency -> Related subsequent encounter',
        'Non-cancer-related emergency -> Related subsequent encounter',
        'Non-cancer-related emergency -> Unrelated subsequent encounter',
        'Other',
    ],
    ordered=True,
)

In [87]:
# Parse the date columns. The CRC DiagnosticTestDate is read as MM/DD/YYYY
# (parse_datetime1); every other date here is read as YYYY-MM-DD (parse_datetime2).
mm2_crc_df['ScreeningTestDateTime'] = mm2_crc_df.apply(parse_datetime1, axis=1, args=('DiagnosticTestDate',))
mm2_lca_df['ScreeningTestDateTime'] = mm2_lca_df.apply(parse_datetime2, axis=1, args=('DiagnosticTestDate',))
mm2_crc_df['DiagnosisDateTime'] = mm2_crc_df.apply(parse_datetime2, axis=1, args=('DiagnosticEventDateTime',))
mm2_lca_df['DiagnosisDateTime'] = mm2_lca_df.apply(parse_datetime2, axis=1, args=('DiagnosticEventDateTime',))

# 1 when the diagnosis falls no more than 3650 days after the screening test,
# 0 when it is later, -1 when is_timedelta_within_threshold cannot compute the gap.
mm2_crc_df['ScreeningUpToDate'] = mm2_crc_df.apply(
    is_timedelta_within_threshold,
    axis=1,
    args=('DiagnosisDateTime', 'ScreeningTestDateTime', 3650),
)
mm2_lca_df['ScreeningUpToDate'] = mm2_lca_df.apply(
    is_timedelta_within_threshold,
    axis=1,
    args=('DiagnosisDateTime', 'ScreeningTestDateTime', 3650),
)

In [88]:
# RFPriorToDx is 1 when RFYesNo is affirmative and 0 for anything else.
# CRC stores 'Yes'/'No' text in RFYesNo.
mm2_crc_df['RFPriorToDx'] = mm2_crc_df.apply(parse_yesno_1, axis=1, args=('RFYesNo',))
# lung cancer is already in 1 0 format
mm2_lca_df['RFPriorToDx'] = mm2_lca_df.apply(parse_01_1, axis=1, args=('RFYesNo',))

In [89]:
# HasMOD is 1 only when DxMOD is affirmative. Both parsers send every other
# value to 0, so a missing DxMOD is indistinguishable from a 'No' here.
mm2_crc_df['HasMOD'] = mm2_crc_df.apply(parse_yesno_1, axis=1, args=('DxMOD',))
# lung cancer is already in 1 0 format
mm2_lca_df['HasMOD'] = mm2_lca_df.apply(parse_01_1, axis=1, args=('DxMOD',))

In [90]:
# For each Dimension* column add a <name>_Parsed flag: 1 when the source cell
# holds a value, 0 when it is null.
mm2_crc_df['DimensionPtRelated_Parsed'] = mm2_crc_df.apply(check_not_null, axis=1, args=('DimensionPtRelated',))
mm2_crc_df['DimensionPtProvEncounter_Parsed'] = mm2_crc_df.apply(check_not_null, axis=1, args=('DimensionPtProvEncounter',))
mm2_crc_df['DimensionDxTests_Parsed'] = mm2_crc_df.apply(check_not_null, axis=1, args=('DimensionDxTests',))
mm2_crc_df['DimensionFupTrack_Parsed'] = mm2_crc_df.apply(check_not_null, axis=1, args=('DimensionFupTrack',))
mm2_crc_df['DimensionReferrals_Parsed'] = mm2_crc_df.apply(check_not_null, axis=1, args=('DimensionReferrals',))

mm2_lca_df['DimensionPtRelated_Parsed'] = mm2_lca_df.apply(check_not_null, axis=1, args=('DimensionPtRelated',))
mm2_lca_df['DimensionPtProvEncounter_Parsed'] = mm2_lca_df.apply(check_not_null, axis=1, args=('DimensionPtProvEncounter',))
mm2_lca_df['DimensionDxTests_Parsed'] = mm2_lca_df.apply(check_not_null, axis=1, args=('DimensionDxTests',))
mm2_lca_df['DimensionFupTrack_Parsed'] = mm2_lca_df.apply(check_not_null, axis=1, args=('DimensionFupTrack',))
mm2_lca_df['DimensionReferrals_Parsed'] = mm2_lca_df.apply(check_not_null, axis=1, args=('DimensionReferrals',))

In [91]:
# One 0/1 column per CRC signal code (1-14) found in RelSig.
mm2_crc_df['Signal_IDA'] = mm2_crc_df.apply(get_crc_sig_01, axis=1)
mm2_crc_df['Signal_Obstruction'] = mm2_crc_df.apply(get_crc_sig_02, axis=1)
mm2_crc_df['Signal_BloodyStool'] = mm2_crc_df.apply(get_crc_sig_03, axis=1)
mm2_crc_df['Signal_AbdMass'] = mm2_crc_df.apply(get_crc_sig_04, axis=1)
mm2_crc_df['Signal_RectMass'] = mm2_crc_df.apply(get_crc_sig_05, axis=1)
mm2_crc_df['Signal_StoolCaliber'] = mm2_crc_df.apply(get_crc_sig_06, axis=1)
mm2_crc_df['Signal_CScope'] = mm2_crc_df.apply(get_crc_sig_07, axis=1)
mm2_crc_df['Signal_FOBT'] = mm2_crc_df.apply(get_crc_sig_08, axis=1)
mm2_crc_df['Signal_FlexSig'] = mm2_crc_df.apply(get_crc_sig_09, axis=1)
mm2_crc_df['Signal_Enema'] = mm2_crc_df.apply(get_crc_sig_10, axis=1)
mm2_crc_df['Signal_CT'] = mm2_crc_df.apply(get_crc_sig_11, axis=1)
mm2_crc_df['Signal_FamHx'] = mm2_crc_df.apply(get_crc_sig_12, axis=1)
mm2_crc_df['Signal_MassExam'] = mm2_crc_df.apply(get_crc_sig_13, axis=1)
mm2_crc_df['Signal_Other'] = mm2_crc_df.apply(get_crc_sig_14, axis=1)

In [92]:
# One 0/1 column per LCa signal code (1-10) found in RelSig.
mm2_lca_df['Signal_Cough'] = mm2_lca_df.apply(get_lca_sig_01, axis=1)
mm2_lca_df['Signal_Dyspnea'] = mm2_lca_df.apply(get_lca_sig_02, axis=1)
mm2_lca_df['Signal_ChestPain'] = mm2_lca_df.apply(get_lca_sig_03, axis=1)
mm2_lca_df['Signal_Hemoptysis'] = mm2_lca_df.apply(get_lca_sig_04, axis=1)
mm2_lca_df['Signal_LRTI'] = mm2_lca_df.apply(get_lca_sig_05, axis=1)
mm2_lca_df['Signal_Imaging'] = mm2_lca_df.apply(get_lca_sig_06, axis=1)
mm2_lca_df['Signal_Paraneoplastic'] = mm2_lca_df.apply(get_lca_sig_07, axis=1)
mm2_lca_df['Signal_Hoarse'] = mm2_lca_df.apply(get_lca_sig_08, axis=1)
mm2_lca_df['Signal_WtLoss'] = mm2_lca_df.apply(get_lca_sig_09, axis=1)
mm2_lca_df['Signal_Other'] = mm2_lca_df.apply(get_lca_sig_10, axis=1)

In [93]:
# mm2_crc_df[['RelSig', 'Signal_IDA', 'Signal_Obstruction', 'Signal_BloodyStool', 'Signal_AbdMass', 'Signal_RectMass', 'Signal_StoolCaliber', 'Signal_CScope', 'Signal_FOBT', 'Signal_FlexSig', 'Signal_Enema', 'Signal_CT', 'Signal_FamHx', 'Signal_MassExam', 'Signal_Other']]

In [94]:
# mm2_lca_df[['RelSig', 'Signal_Cough', 'Signal_Dyspnea', 'Signal_ChestPain', 'Signal_Hemoptysis', 'Signal_LRTI', 'Signal_Imaging', 'Signal_Paraneoplastic', 'Signal_Hoarse', 'Signal_WtLoss', 'Signal_Other']]

# AJZ

Examine the MM2 CRC and MM2 LCa dataframes.

In [95]:
# Collect the MM2 CRC column names that mention MOD, Type, True, or EP.
hi_value_vars = []

for line in mm2_crc_df.columns:
    if any(tag in line for tag in ('MOD', 'Type', 'True', 'EP')):
        hi_value_vars.append(line)

print(hi_value_vars)

['CancerType', 'TypeOfEP', 'TypeOfEP_other', 'DxMOD', 'TypeOfCancer', 'TypeOfDiagnosisEvent', 'TypeOfEmergencyEvent', 'EP', 'TrueEP', 'TypeEPText', 'HasMOD']


## By inspection

EP column is always 1.

TypeOfEP is not what we're looking for.

DxMOD is the column that contains missing ("NaN") values.

In [66]:
# Missing-value counts for DxMOD, TrueEP and HasMOD, in that order.
[sum(mm2_crc_df[col].isna()) for col in ('DxMOD', 'TrueEP', 'HasMOD')]

[22, 0, 0]

In [61]:
# DxMOD (missing values shown as their own 'missing' row) against TrueEP.
pd.crosstab(
    index=mm2_crc_df['DxMOD'].fillna('missing'),
    columns=mm2_crc_df['TrueEP'],
)

TrueEP,0,1
DxMOD,Unnamed: 1_level_1,Unnamed: 2_level_1
No,1,21
Yes,5,51
missing,21,1


In [62]:
# DxMOD (missing values shown as their own 'missing' row) against the derived HasMOD flag.
pd.crosstab(
    index=mm2_crc_df['DxMOD'].fillna('missing'),
    columns=mm2_crc_df['HasMOD'],
)

HasMOD,0,1
DxMOD,Unnamed: 1_level_1,Unnamed: 2_level_1
No,22,0
Yes,0,56
missing,22,0


In [67]:
# TrueEP against the derived HasMOD flag.
pd.crosstab(
    index=mm2_crc_df['TrueEP'],
    columns=mm2_crc_df['HasMOD'],
)

HasMOD,0,1
TrueEP,Unnamed: 1_level_1,Unnamed: 2_level_1
0,22,5
1,22,51
