In [10]:
import pandas as pd
import numpy as np

file_name='ENT_CA_DM_Encounters.txt'
df=pd.read_csv(file_name, delimiter='|',error_bad_lines=False)
print (df.shape[0])
print(df.columns)
cols=['mrn','enc_date','encounter_type','department']
df=df[cols]

449230
Index(['mrn', 'bill_num', 'HAR', 'HAR_MASTER', 'enc_date', 'enc_type_c',
       'encounter_type', 'ed_arrival_dttm', 'hosp_adm_dttm', 'hosp_dc_dttm',
       'department', 'disch_disp_c', 'disch_disp'],
      dtype='object')



Columns (1) have mixed types.Specify dtype option on import or set low_memory=False.



In [11]:
### filter for radonc
print (df.department.value_counts())
df_rad=df[df['department']=='RADIATION ONCOLOGY']
print ('Number of rad/onc notes:',df_rad.shape[0])

### filter for treatment
#df_rad=df_rad[(df_rad['encounter_type']=='Office Visit')|(df_rad['encounter_type']=='Treatment')]
df_rad=df_rad[df_rad['encounter_type']=='Treatment']
print ('Number of rad/onc TREATMENT notes:',df_rad.shape[0])


### filter for target mrns
file='all_mrns_of_interest.csv'
mrn_df=pd.read_csv(file)
mrns_of_interest=mrn_df.MRN.unique()
df_rad=df_rad[df_rad['mrn'].isin(mrns_of_interest)]
print ('Number of rad/onc notes assoc with target patients:',df_rad.shape[0])

HEMATOLOGY/ONCOLOGY      77543
RADIATION ONCOLOGY       67888
OTOLARYNGOLOGY           60734
IV THERAPY               27097
SPEECH & SWALLOWING      12156
                         ...  
GENERAL PEDIATRICS           1
GYN/ONC OFF SITE             1
PATIENT ACCESS CENTER        1
LAKE PARK OB/GYN             1
OPHTHALMOLOGY-MAGNA          1
Name: department, Length: 492, dtype: int64
Number of rad/onc notes: 67888
Number of rad/onc TREATMENT notes: 17772
Number of rad/onc notes assoc with target patients: 2412


In [12]:
### sort by date
df_rad['enc_date']=df_rad['enc_date'].apply(lambda d:d.split(' ')[0])
df_rad['enc_date']=pd.to_datetime(df_rad['enc_date'],errors='coerce',infer_datetime_format=True)
df_rad=df_rad.sort_values(by='enc_date')
### seperate dataframe by paitent
unique_mrns=df_rad.mrn.unique()
ptx_info=[]
for mrn in unique_mrns:
    sub_df=df_rad[df_rad.mrn==mrn]
    ptx_info.append(sub_df)
print ('Number of unique patients:',len(ptx_info))

Number of unique patients: 116


In [13]:
### strain out encounters that are very far apart from other events
def is_groupable(df,day_window=14):
    '''return boolean list of whether an entry is groupable (is close to other dates)'''
    df_func=df
    df_func['yesterday']=df['enc_date'].shift(1)
    df_func['tomorrow']=df['enc_date'].shift(-1)
    flags=[]
    for index,row in df.iterrows():
        flag=False
        today=row['enc_date']
        yesterday=row['yesterday']
        tomorrow=row['tomorrow']
        try:
            diff=abs((today-yesterday).days)
            if diff<=10:
                flag=True
        except:
            print ('no yesterday')
        try:
            diff=abs((today-tomorrow).days)
            if diff<=10:
                flag=True
        except:
            print ('no tomorrow')
        flags.append(flag)
    return flags

ptx_info_2=[]
for index,ptx in enumerate(ptx_info):
    mask=is_groupable(ptx_info[index])
    ptx_info_2.append(ptx[mask])
ptx_info=ptx_info_2
print (len(ptx_info))

116


In [14]:
### filter out empty dfs by combining and then splitting again
df_rebuilt=''
for index,ptx in enumerate(ptx_info):
    if index==0:
        df_rebuilt=ptx
    else:
        df_rebuilt=df_rebuilt.append(ptx)
unique_mrns=df_rebuilt.mrn.unique()
print (len(unique_mrns))
ptx_info=[]
for mrn in unique_mrns:
    sub_df=df_rebuilt[df_rebuilt['mrn']==mrn]
    ptx_info.append(sub_df)
print (ptx_info[1])

103
            mrn   enc_date encounter_type          department  yesterday  \
296499  3261049 2012-07-09      Treatment  RADIATION ONCOLOGY        NaT   
296500  3261049 2012-07-10      Treatment  RADIATION ONCOLOGY 2012-07-09   
296502  3261049 2012-07-12      Treatment  RADIATION ONCOLOGY 2012-07-10   
296512  3261049 2012-08-06      Treatment  RADIATION ONCOLOGY 2012-07-12   
296513  3261049 2012-08-07      Treatment  RADIATION ONCOLOGY 2012-08-06   
296516  3261049 2012-08-09      Treatment  RADIATION ONCOLOGY 2012-08-07   
296527  3261049 2012-09-04      Treatment  RADIATION ONCOLOGY 2012-08-09   
296529  3261049 2012-09-06      Treatment  RADIATION ONCOLOGY 2012-09-04   
296532  3261049 2012-09-08      Treatment  RADIATION ONCOLOGY 2012-09-06   
296384  3261049 2013-03-25      Treatment  RADIATION ONCOLOGY 2012-09-08   
296386  3261049 2013-03-26      Treatment  RADIATION ONCOLOGY 2013-03-25   
296388  3261049 2013-03-27      Treatment  RADIATION ONCOLOGY 2013-03-26   
296389  

In [15]:
def assignGroup(df,day_window=14):
    '''group encounter by how close it is to other dates, using day_window to classify'''
    group=0
    designations=[]
    for index,row in df.iterrows():
        if index==0:
            designations.append(group)
        if index != 0:
            today=row['enc_date']
            yesterday=row['yesterday']
            diff=abs((today-yesterday).days)
            if diff<=day_window:
                designations.append(group)
            else:
                group+=1
                designations.append(group)
    return designations
def getCycles(df):
    '''returns list of cycles with: cycle number,start date,end date'''
    cycles=df['group'].unique()
    toReturn=[]
    for cycle in cycles:
        sub_df=df[df['group']==cycle]
        l=[sub_df.iloc[0]['mrn'],cycle,sub_df.iloc[0]['enc_date'],sub_df.iloc[-1]['enc_date']]
        toReturn.append(l)
    return toReturn
'''get cycles for patients'''

data_many=[]
for index,ptx in enumerate(ptx_info):
    df_working=ptx_info[index]
    df_working['group']=assignGroup(df_working)
    data_many.append(getCycles(df_working))
print (data_many[2])

dfs=[]
for d in data_many:
    sub_df=pd.DataFrame(d,columns=['mrn','cycle','start','end'])
    dfs.append(sub_df)
df_export=''
for index,df_ in enumerate(dfs):
    if index==0:
        df_export=df_
    else:
        df_export=df_export.append(df_)
# df_export=pd.DataFrame(data,columns=['mrn','cycle','start','end'])
print (df_export)
df_export.to_csv('RM_radiation_cycles.csv',index=False)

[[3276187, 1, Timestamp('2012-07-23 00:00:00'), Timestamp('2012-09-21 00:00:00')], [3276187, 2, Timestamp('2014-01-13 00:00:00'), Timestamp('2014-01-24 00:00:00')]]
        mrn  cycle      start        end
0   3260592      0 2012-07-02 2012-08-16
1   3260592      1 2014-04-01 2014-04-14
0   3261049      1 2012-07-09 2012-07-12
1   3261049      2 2012-08-06 2012-08-09
2   3261049      3 2012-09-04 2012-09-08
..      ...    ...        ...        ...
0   3769590      0 2018-07-30 2018-08-31
0   3108147      0 2018-07-30 2018-08-31
0   3793494      0 2018-10-29 2018-11-30
0   3797673      1 2018-11-08 2018-11-23
0   3497265      1 2020-01-29 2020-01-31

[121 rows x 4 columns]


In [16]:
df=pd.read_csv('RM_radiation_cycles.csv')
df['start']=pd.to_datetime(df['start'],infer_datetime_format=True, errors='coerce')
df['end']=pd.to_datetime(df['end'],infer_datetime_format=True, errors='coerce')


import plotly.figure_factory as ff
import matplotlib.pyplot as plt

def make_data(df):
    '''make the data to input into gantt chart, taking input of df '''
    data=[]
    for index,row in df.iterrows():
        Task='RAD'
        Start=row['start']
        End=row['end']
        Source=row['cycle']
        if index==0:
            data=[dict(Task=Task,Start=Start,Finish=End,Source=Source)]
        else:
            data.append(dict(Task=Task,Start=Start,Finish=End,Source=Source))
    return data
def plot_gannt(df):
    fig = ff.create_gantt(make_data(df), index_col='Source', show_colorbar=False,
                      group_tasks=True)
    fig.show()
unique_mrns=df.mrn.unique()
ptx_info=[]
for mrn in unique_mrns:
    sub_df=df[df.mrn==mrn]
    ptx_info.append(sub_df)
plot_gannt(ptx_info[1])
print (ptx_info[1])

       mrn  cycle      start        end
2  3261049      1 2012-07-09 2012-07-12
3  3261049      2 2012-08-06 2012-08-09
4  3261049      3 2012-09-04 2012-09-08
5  3261049      4 2013-03-25 2013-03-29
