In [14]:
import pandas as pd
import numpy as np
import re

file_name='RM_ptx_information.csv'
df=pd.read_csv(file_name)
cols=['MRN','stage','Treatment 1 Date of Progression','IRB']
df=df[cols]
#df=df[df['IRB']=='8980']
print ('starting number patients:',df.shape[0])

starting number patients: 364


In [15]:
### First validate stage - prepare validation cohort
def isGoodStage(stage):
    try:
        good_stages=['I','II','III','IVA','IVB','IVC']
        if stage in good_stages:
            return True
        else:
            return False
    except:
        return False
df_holdout=df[df.stage.apply(isGoodStage)]
df_holdout=df_holdout[['MRN','stage']]
df_holdout.columns=['MRN','Stage']
df_holdout=df_holdout.set_index('MRN')
print ('mrns with stage:',df_holdout.shape[0])
print (df_holdout.head())

### prepare stages from notes
### divide up groups by TNM Staging
df_notes=pd.read_csv('HNDB Progress Notes Processed.csv')
cols_to_keep=['mrn','Cancer_stage']
df_notes=df_notes[cols_to_keep]
df_notes.columns=['MRN','Stage_']
# get rid of notes that have a string as an MRN
try:
    df_notes=df_notes[df_notes['MRN'].str.isnumeric()]
except:
    print('all notes are int')

print('number of mrns at start:',df_notes.shape[0])
df_notes=df_notes.dropna(subset=['MRN','Stage_'])
# a little formatting, as a treat
df_notes['MRN']=df_notes['MRN'].astype(int)
df_notes.Stage_=df_notes.Stage_.str.upper()
# start making df
unique_mrns=df_notes.MRN.unique()
#create list of dataframes - each represents an mrn
ptx_info=[]
for mrn in unique_mrns:
    df_sub=df_notes[df_notes.MRN==mrn]
    ptx_info.append(df_sub)
print ('Num of patients:',len(ptx_info))
data=[]
for index,ptx in enumerate(ptx_info):
    data.append([ptx.iloc[0]['MRN'],ptx.iloc[-1]['Stage_']])
df_notes=pd.DataFrame(data,columns=['MRN','Stage_'])
df_notes.MRN=df_notes.MRN.apply(pd.to_numeric)
df_notes=df_notes.set_index('MRN')

print (df_notes.head())

### merge dataframes
# merge dfs
df_merged = pd.merge(df_holdout, df_notes, on='MRN',how='outer')
print ('size of merged df:',df_merged.shape[0],'\n')

df_val=df_merged.dropna(subset=['Stage','Stage_'],how='any')
#df_val.to_csv('examine.csv')
print ('complete entries:',df_val.shape[0])

#calculate cohen's kappa
from sklearn.metrics import cohen_kappa_score
cat_map={'I':1,'II':2,'III':3,'IVA':4,'IVB':5,'IVC':6}

note_stages=df_val.Stage_.map(cat_map).to_numpy(dtype=int)
val_stages=df_val.Stage.map(cat_map).to_numpy(dtype=int)

print (note_stages)
print (val_stages)

score=cohen_kappa_score(note_stages,val_stages)

# calculate accuracy
total_ptx=df_val.shape[0]
matches=0
for index,row in df_val.iterrows():
    if row['Stage']==row['Stage_']:
        matches=matches+1
accuracy=matches/total_ptx

print ('Accuracy:',round(accuracy,2))
print ('Cohen\'s Kappa:',round(score,2))

mrns with stage: 154
       Stage
MRN         
654028   IVA
656036   IVA
724115   IVA
765010   IVB
788213   IVA
all notes are int
number of mrns at start: 7415
Num of patients: 128
        Stage_
MRN           
2050963    IVA
2460803     II
2209904    IVA
3535060    III
3447609    IVA
size of merged df: 273 

complete entries: 9
[4 4 4 4 4 3 4 4 4]
[3 4 4 4 4 4 4 2 3]
Accuracy: 0.56
Cohen's Kappa: -0.16


In [16]:
### now do date_DM - prepare validation set
file_name='RM_ptx_information.csv'
df=pd.read_csv(file_name)
cols=['MRN','Treatment 1 Date of Progression','IRB']
df=df[cols]
#df=df[df['IRB']=='8980']
cols=['MRN','Treatment 1 Date of Progression']
df=df[cols]
df.columns=['MRN','Date_DM']
print ('val df - starting number patients:',df.shape[0])
def process_dates(sent):
    try:
        search=re.search(r'\d',sent)
        if not search:
            return np.nan
        toReturn=sent
        if ' ' in sent:
            toReturn=toReturn.split(' ')[0]
        if ',' in sent:
            toReturn=toReturn.split(',')[0]
        if ';' in sent:
            toReturn=toReturn.split(';')[0]
        if '.' in sent:
            toReturn=toReturn.replace('.','/')
        return toReturn
    except:
        return np.nan
df['Date_DM']=df['Date_DM'].apply(process_dates)
df=df.dropna(subset=['Date_DM'])

df['Date_DM']=pd.to_datetime(df['Date_DM'], infer_datetime_format=True,errors='coerce') 
mrns_val=df.MRN.unique()
df=df.set_index('MRN')
print ('patients with date_DM',df.shape[0])

val df - starting number patients: 364
patients with date_DM 282


In [17]:
# ### for new csv_df 20200723
# df=pd.read_csv('UC_recurrent_mrns.csv')
# print (df.columns)
# cols=['MRN','Date of Progression']
# df=df[cols]
# try:
#     df.MRN=df.MRN.str.replace('-','')
#     df.MRN=df.MRN.astype('int64')
# except:
#     print('mrns are already in correct form')
# df.columns=['MRN','Date_DM']
# df['Date_DM']=pd.to_datetime(df['Date_DM'], infer_datetime_format=True,errors='coerce') 
# df=df.set_index('MRN')
# df=df.dropna(subset=['Date_DM'])
# print ('patients with date_DM',df.shape[0])

# print (df.columns)

In [18]:
### get Date_RM_ from notes
df_notes=pd.read_csv('RM_MCs.csv',parse_dates=True)
df_notes.columns=['Date','MRN','is_RM']
df_notes['Date']=pd.to_datetime(df_notes['Date'], infer_datetime_format=True,errors='coerce')

### compare overlap of mrns
mrns_notes=df_notes.MRN.unique()
count=0
for mrn in mrns_notes:
    if mrn in mrns_val:
        count+=1
print ('mrns in both the notes and the val df:',count)


unique_mrns=df_notes.MRN.unique()
print ('patients in notes:',len(unique_mrns))
#create list of dataframes - each represents an mrn
ptx_info=[]
for mrn in unique_mrns:
    df_sub=df_notes[df_notes.MRN==mrn]
    ptx_info.append(df_sub)
data=[]
date_earliest=[]
for i,ptx in enumerate(ptx_info):
    date=np.nan
    mrn=ptx.iloc[0]['MRN']
    date_earliest.append([mrn, ptx.iloc[0]['Date']])
    tripped=False
    for index,row in ptx.iterrows():
        if not tripped:
            if row['is_RM']==1:
                date=row['Date']
                tripped=True
    data.append([mrn,date])
df_notes=pd.DataFrame(data, columns=['MRN','Date_DM_'])
df_notes=df_notes.dropna(subset=['Date_DM_'])
df_notes=df_notes.set_index('MRN')
print ('patients with Date_DM:',df_notes.shape[0])
print (df_notes)

mrns in both the notes and the val df: 73
patients in notes: 171
patients with Date_DM: 142
          Date_DM_
MRN               
2050963 2014-10-01
2460803 1999-04-01
2714226 2000-02-01
2209904 2000-09-08
3535060 2016-03-01
...            ...
3108147 2018-05-08
3769590 2018-08-07
3749634 2018-06-08
3793494 2018-07-13
3715744 2018-07-17

[142 rows x 1 columns]


In [19]:
# merge dataframes
df_merged = pd.merge(df, df_notes, on='MRN',how='inner')
df_merged=df_merged.dropna(subset=['Date_DM','Date_DM_'])
print ('patients with info:',df_merged.shape[0])

from datetime import timedelta

def getDateDiff(date1,date2):
    diff=date1-date2
    return diff
diffs=[]
oldest_dates=[]
print('MRN\t\tValidated Date\t\tDate from notes\t\tDiff\t\tEarliest Note')
for index,row in df_merged.iterrows():
    diff=getDateDiff(row['Date_DM'],row['Date_DM_'])
    diffs.append(diff)
    earliest=''
    for l in date_earliest:
        if index==l[0]:
            earliest=l[1]
            oldest_dates.append(earliest)
    print (index,'\t',row['Date_DM'],'\t',row['Date_DM_'],':',diff,'\t',earliest)
df_merged['diff']=diffs
df_merged['diff']=df_merged['diff'].abs()
df_merged['oldest_date']=oldest_dates
df_merged['diff']=df_merged['diff'].dt.days/30
print(df_merged['diff'].describe())
df_merged=df_merged.loc[df_merged['Date_DM']>=df_merged['oldest_date']]
#df_merged=df_merged.loc[df_merged['Date_DM']>='2011']

print(df_merged['diff'].describe())


patients with info: 57
MRN		Validated Date		Date from notes		Diff		Earliest Note
1420705 	 2007-02-26 00:00:00 	 2009-10-28 00:00:00 : -975 days +00:00:00 	 2009-10-28 00:00:00
1504532 	 2018-01-19 00:00:00 	 2014-10-28 00:00:00 : 1179 days 00:00:00 	 2011-04-16 00:00:00
1566343 	 2012-08-11 00:00:00 	 2012-08-11 00:00:00 : 0 days 00:00:00 	 2012-04-01 00:00:00
1868643 	 2013-03-22 00:00:00 	 2012-04-06 00:00:00 : 350 days 00:00:00 	 2012-04-01 00:00:00
1868643 	 2013-03-22 00:00:00 	 2012-04-06 00:00:00 : 350 days 00:00:00 	 2012-04-01 00:00:00
2050963 	 2015-10-02 00:00:00 	 2014-10-01 00:00:00 : 366 days 00:00:00 	 1996-02-26 00:00:00
2573707 	 2002-12-18 00:00:00 	 2012-08-28 00:00:00 : -3541 days +00:00:00 	 2010-08-25 00:00:00
2849915 	 2006-12-14 00:00:00 	 2007-01-01 00:00:00 : -18 days +00:00:00 	 2007-01-01 00:00:00
2913559 	 2007-08-07 00:00:00 	 2012-05-21 00:00:00 : -1749 days +00:00:00 	 2012-05-21 00:00:00
2933746 	 2009-05-14 00:00:00 	 2009-06-17 00:00:00 : -34 days +0