# README.md

# Research question: What are the T dates and values for the recovery group and non recovery group just before and immediately after the initial ADT use? 


## Table of Contents
* [Read data, clean up and get stats](#first-bullet)
* [T test dates and values before ADT](#second-bullet)
* [T test dates and values immediately after ADT](#third-bullet)
* [Output data](#fouth-bullet)
* [Baseline Total Testosterone Histogram](#fifth-bullet)
* [Delta Total Testosterone just before and after ADT](#six-bullet)
* [Time to Recovery Histogram](#seventh-bullet)

In [4]:
import numpy as np
import pandas as pd
import sys, getopt
import csv
from datetime import datetime
from datetime import timedelta, date
import itertools

from matplotlib import pyplot
from pylab import *
import matplotlib.pyplot as plt
%matplotlib inline  
import seaborn as sns

# Read data, clean up and get stats <a class="anchor" id="first-bullet"></a>

In [5]:
df_adt=pd.read_excel("./data/Inital_ADT_use_time_window_121pts_updated_Jan28.xls", header=0)
#df_adt=pd.read_csv("Inital_ADT_use_time_window_121pts_updated_Jan27.csv", header=0)
#Inital_ADT_use_time_window_121pts_updated_Jan27.csv
df_adt.fillna('', inplace=True)

#t results for 4490 pts
df_tvalue=pd.read_excel("./data/pca_tresults_20210119_BP_fixed.xlsx", header=0)
#df_tvalue=pd.read_excel("Radiation_T_test.xlsx", header=0)

In [6]:
df_adt[:3]

Unnamed: 0,person_id,inital_ADT_start_date,inital_ADT_end_date,Inital_ADT_timewindow,Inital_ADT_time_window_days,index1
0,12,2013-05-06 00:00:00,2013-05-06 00:00:00,0 days 00:00:00.000000000,0,0
1,18,2013-11-12 00:00:00,2014-03-13 00:00:00,121 days 00:00:00.000000000,121,1
2,25,2013-08-01 00:00:00,2014-02-18 00:00:00,201 days 00:00:00.000000000,201,2


In [7]:
adt_pt_list=df_adt["person_id"].to_list()

In [8]:
df_tvalue[:3]

Unnamed: 0,person_id,cancer_type_id,year_of_diagnosis,month_of_diagnosis,day_of_diagnosis,dx_date,result_date,loinc_code,source_test_name,test_results
0,2,2,2003,1.0,7.0,2003-01-07,2006-09-05,49041-7,TESTOSTERONE,351.0
1,2,2,2003,1.0,7.0,2003-01-07,2007-01-09,49041-7,TESTOSTERONE,304.0
2,2,2,2003,1.0,7.0,2003-01-07,2007-05-17,49041-7,TESTOSTERONE,259.0


In [9]:
df_tvalue_grouped=df_tvalue.groupby(["person_id"]).size().reset_index(name='counts')
list_pt_test=df_tvalue_grouped["person_id"].to_list()
print("list of pt ids with t test results")
print(len(list_pt_test))
print("")

overlap_list=[x for x in adt_pt_list if x in  list_pt_test]
print(len(overlap_list))
print(len(adt_pt_list))

list of pt ids with t test results
4490

121
121


In [10]:
deleted_list=[]
for ind in df_tvalue.index:
    if (df_tvalue['test_results'][ind]=='CANCELED'): 
        deleted_list.append(ind) 
       
print(df_tvalue.shape)
for ind in deleted_list:
    df_tvalue.drop(index=ind,inplace=True)
print(df_tvalue.shape)

(18643, 10)
(18643, 10)


In [11]:
df_adt[:3]

Unnamed: 0,person_id,inital_ADT_start_date,inital_ADT_end_date,Inital_ADT_timewindow,Inital_ADT_time_window_days,index1
0,12,2013-05-06 00:00:00,2013-05-06 00:00:00,0 days 00:00:00.000000000,0,0
1,18,2013-11-12 00:00:00,2014-03-13 00:00:00,121 days 00:00:00.000000000,121,1
2,25,2013-08-01 00:00:00,2014-02-18 00:00:00,201 days 00:00:00.000000000,201,2


In [12]:
df_adt_pt=df_adt[(df_adt["person_id"] == 36062)]

In [13]:
#get the list of pt ids
list_pts=df_adt["person_id"].to_list()

In [15]:
list_pts[:3]

[12, 18, 25]

In [16]:
len(list_pts)

121

In [17]:
df_tvalue

Unnamed: 0,person_id,cancer_type_id,year_of_diagnosis,month_of_diagnosis,day_of_diagnosis,dx_date,result_date,loinc_code,source_test_name,test_results
0,2,2,2003,1.0,7.0,2003-01-07,2006-09-05,49041-7,TESTOSTERONE,351.00
1,2,2,2003,1.0,7.0,2003-01-07,2007-01-09,49041-7,TESTOSTERONE,304.00
2,2,2,2003,1.0,7.0,2003-01-07,2007-05-17,49041-7,TESTOSTERONE,259.00
3,2,2,2003,1.0,7.0,2003-01-07,2007-06-12,49041-7,TESTOSTERONE,382.00
4,2,2,2003,1.0,7.0,2003-01-07,2007-06-28,49041-7,TESTOSTERONE,439.00
...,...,...,...,...,...,...,...,...,...,...
18638,208953,2,2020,9.0,22.0,2020-09-22,2020-06-30,49041-7,TESTOSTERONE,557.28
18639,209160,2,2020,8.0,11.0,2020-08-11,2012-07-19,49041-7,TESTOSTERONE,219.10
18640,209416,2,2019,5.0,10.0,2019-05-10,2013-06-11,49041-7,TESTOSTERONE,211.15
18641,209416,2,2019,5.0,10.0,2019-05-10,2013-07-02,49041-7,TESTOSTERONE,150.70


In [18]:
i=12
df_tvalue_pt=df_tvalue[(df_tvalue["person_id"]==i)]
df_tvalue_pt[:3]

Unnamed: 0,person_id,cancer_type_id,year_of_diagnosis,month_of_diagnosis,day_of_diagnosis,dx_date,result_date,loinc_code,source_test_name,test_results
84,12,2,2012,10.0,24.0,2012-10-24,2012-11-01,49041-7,TESTOSTERONE,237.44
85,12,2,2012,10.0,24.0,2012-10-24,2013-05-06,49041-7,TESTOSTERONE,15.15
86,12,2,2012,10.0,24.0,2012-10-24,2014-01-28,49041-7,TESTOSTERONE,10.89


In [19]:
check=df_adt_pt['inital_ADT_start_date']
check=df_adt_pt['inital_ADT_start_date'].to_list()
#df1_pt.fillna('', inplace=True)
df_adt_pt.replace('', np.NaN)
df_adt_pt['inital_ADT_start_date'].notnull().values.any()

True

In [20]:
len(check)

1

In [21]:
def closest(lst, K): 
      return lst[min(range(len(lst)), key = lambda i: abs(lst[i] - K))] 

In [22]:
df1=df_adt
df2=df_tvalue

list_pts_adt_table=df1["person_id"].to_list()
df2_pt=df2[(df2["person_id"] == 36062)]  
each_pt_tdates=df2_pt['result_date'].to_list()
each_pt_tdates

[Timestamp('2014-04-23 00:00:00'),
 Timestamp('2014-07-02 00:00:00'),
 Timestamp('2014-07-31 00:00:00'),
 Timestamp('2016-09-06 00:00:00'),
 Timestamp('2016-09-06 00:00:00'),
 Timestamp('2016-12-13 00:00:00'),
 Timestamp('2017-04-25 00:00:00'),
 Timestamp('2017-08-08 00:00:00'),
 Timestamp('2017-12-12 00:00:00'),
 Timestamp('2018-02-20 00:00:00'),
 Timestamp('2018-03-13 00:00:00'),
 Timestamp('2018-04-10 00:00:00'),
 Timestamp('2018-07-10 00:00:00'),
 Timestamp('2018-10-09 00:00:00'),
 Timestamp('2019-07-11 00:00:00'),
 Timestamp('2019-09-10 00:00:00'),
 Timestamp('2019-10-17 00:00:00'),
 Timestamp('2019-12-12 00:00:00'),
 Timestamp('2020-01-09 00:00:00'),
 Timestamp('2020-02-06 00:00:00'),
 Timestamp('2020-03-12 00:00:00')]

In [23]:
df1_pt=df1[(df1["person_id"] == 323)]  
df1_pt

Unnamed: 0,person_id,inital_ADT_start_date,inital_ADT_end_date,Inital_ADT_timewindow,Inital_ADT_time_window_days,index1
7,323,2015-06-18 00:00:00,2016-09-07 00:00:00,447 days 00:00:00.000000000,447,7


In [24]:
df2_pt=df2[(df2["person_id"] == 323)]  
df2_pt

Unnamed: 0,person_id,cancer_type_id,year_of_diagnosis,month_of_diagnosis,day_of_diagnosis,dx_date,result_date,loinc_code,source_test_name,test_results
473,323,2,2015,6.0,9.0,2015-06-09,2015-07-22,49041-7,TESTOSTERONE,26.74
474,323,2,2015,6.0,9.0,2015-06-09,2015-09-09,49041-7,TESTOSTERONE,15.82
475,323,2,2015,6.0,9.0,2015-06-09,2015-11-30,49041-7,TESTOSTERONE,10.0
476,323,2,2015,6.0,9.0,2015-06-09,2016-02-29,49041-7,TESTOSTERONE,15.02
477,323,2,2015,6.0,9.0,2015-06-09,2016-03-09,49041-7,TESTOSTERONE,14.17
478,323,2,2015,6.0,9.0,2015-06-09,2016-06-01,49041-7,TESTOSTERONE,15.93
479,323,2,2015,6.0,9.0,2015-06-09,2016-08-26,49041-7,TESTOSTERONE,18.49


In [25]:
#pd.to_datetime(df1_pt['inital_ADT_end_date'], utc=False)
a=df1_pt['inital_ADT_start_date'].to_list()[0]
#df.dates.values.astype('M8[D]')

In [26]:
a

Timestamp('2015-06-18 00:00:00')

In [27]:
#df_tvalue_pt=df2[(df2["person_id"] ==12)]    
#df2_pt_T_75more=df2_pt.loc[df2_pt['test_results']>=75]

In [28]:
#list_pts_adt_table

In [29]:
a=df1_pt['inital_ADT_start_date'].to_list()[0]

In [30]:
a

Timestamp('2015-06-18 00:00:00')

# Match T test date and value just before ADT <a class="anchor" id="second-bullet"></a>

In [31]:
result_list=[]

#for y in list_pts:      
#for y in list_pts_adt_table:
#for y in [12427]:
for y in list_pts_adt_table: 
    
    df1_pt=df1[(df1["person_id"] == y)]
    
    if len(df1_pt.inital_ADT_start_date)==0:    
    #if(df1_pt.inital_ADT_start_date.values.any()==""):        
        a='NaN'
        gap='NaN'
        tdate='NaN'
        tvalue='NaN'
        N_t_dates='NaN'
        N_t_dates_beforeADT='NaN'
        adt_end=df1_pt['inital_ADT_end_date'].to_list()[0] 

    if len(df1_pt.inital_ADT_start_date)!=0:    
    #if(df1_pt.inital_ADT_start_date.values.any()!=""):
        a=df1_pt['inital_ADT_start_date'].to_list()[0]
        
        #add 6 months to the inital ADT end date
        #a_plus_6mons=a + timedelta(days=180)
        adt_end=df1_pt['inital_ADT_end_date'].to_list()[0] 
        #pick up initial ADT window 
        gap=df1_pt['Inital_ADT_time_window_days'].to_list()[0] 
    
        df2_pt=df2[(df2["person_id"] == y)]    
        each_pt_tdates = df2_pt['result_date'].to_list()

        ##get rid of nan
        cleaned_a=[]
        cleaned_a= [x for x in each_pt_tdates if str(x) != 'NaN']
        cleaned_aa= [x for x in cleaned_a if str(x) != 'nan']
        cleaned_aaa= [x for x in cleaned_aa if str(x) != 'NaT']
        N_t_dates=len(cleaned_aaa)

        #only keep the ones before ADT start date, including on the day
        cleaned_aaaa= [x for x in cleaned_aaa if x <=a]
        
        start_sorted_bin=sorted(cleaned_aaaa)
        N_t_dates_beforeADT=len(cleaned_aaaa)
    
        if(len(start_sorted_bin)==0):
            tdate='NaN' 
            tvalue="NaN"

        if(len(start_sorted_bin)>0):
            #apply the closet function
            #tdate=closest(start_sorted_bin,a_plus_6mons)
            tdate=closest(start_sorted_bin,a)
            #tvalue=df2_pt.loc[df['result_date'] == tdate].test_result
    
            #pick up T-value for T-date
            df2_pt_pick=df2_pt[(df2_pt['result_date']==tdate)]
            tvalue=df2_pt_pick.test_results.values[0]
    
    #print(y, a, gap,tdate, tvalue,N_t_dates,N_t_dates_beforeADT)
    result_per_pt=[y, a, adt_end, gap, tdate, tvalue,N_t_dates,N_t_dates_beforeADT]
    result_list.append(result_per_pt)

TypeError: '<=' not supported between instances of 'Timestamp' and 'str'

In [None]:
df_results_before_adt=pd.DataFrame(result_list)
df_results_before_adt.columns=(['person_id', 'inital_ADT_start_date', 'inital_ADT_end_date', 'Inital_ADT_time_window_days', 'Tdate_justbeforeADT', 'Tvalue_justbeforeADT', "N_T_dates","N_T_dates_beforeADT"])

In [None]:
df_results_before_adt[:3]

In [None]:
shape(df_results_before_adt)

# Output data <a class="anchor" id="third-bullet"></a>

In [None]:
df_results_before_adt.to_csv("radiation_tdate_tvalue_beforeADT_all_121pts_results_new.csv", encoding='utf-8', index=False)

# Match T test date and value Immediately after ADT <a class="anchor" id="third-bullet"></a>

In [32]:
result_after_list=[]

#for y in list_pts:      
for y in list_pts:
#for y in [12427]:
#for y in list_pts_adt_table: 
    
    df1_pt=df1[(df1["person_id"] == y)]
    
    if(df1_pt.inital_ADT_end_date.values.any()==""):        
        b='NaN'
        gap='NaN'
        tdate='NaN'
        tvalue='NaN'
        N_t_dates='NaN'
        N_t_dates_afterADT='NaN'

    if(df1_pt.inital_ADT_end_date.values.any()!=""):
        b=df1_pt['inital_ADT_end_date'].to_list()[0]
        
        df2_pt=df2[(df2["person_id"] == y)]    
        each_pt_tdates = df2_pt['result_date'].to_list()

        ##get rid of nan
        cleaned_a=[]
        cleaned_a= [x for x in each_pt_tdates if str(x) != 'NaN']
        cleaned_aa= [x for x in cleaned_a if str(x) != 'nan']
        cleaned_aaa= [x for x in cleaned_aa if str(x) != 'NaT']
        N_t_dates=len(cleaned_aaa)

        #only keep the ones after ADT end date including on the day
        cleaned_aaaa= [x for x in cleaned_aaa if x >=b]
        start_sorted_bin=sorted(cleaned_aaaa)
        N_t_dates_afterADT=len(cleaned_aaaa)
    
        if(len(start_sorted_bin)==0):
            tdate='NaN' 
            tvalue="NaN"

        if(len(start_sorted_bin)>0):
            #apply the closet function
            #tdate=closest(start_sorted_bin,a_plus_6mons)
            tdate=closest(start_sorted_bin,b)
            #tvalue=df2_pt.loc[df['result_date'] == tdate].test_result
    
            #pick up T-value for T-date
            df2_pt_pick=df2_pt[(df2_pt['result_date']==tdate)]
            tvalue=df2_pt_pick.test_results.values[0]
    
    #print(y, b, tdate, tvalue ,N_t_dates,N_t_dates_afterADT)
    result_after_per_pt=[y, a, tdate, tvalue ,N_t_dates,N_t_dates_afterADT]
    result_after_list.append(result_after_per_pt)

In [33]:
df_results_after_adt=pd.DataFrame(result_after_list)
df_results_after_adt.columns=(['person_id', 'inital_ADT_end_date', 'Tdate_justafterADT', 'Tvalue_justafterADT', "N_T_dates","N_T_dates_afterADT"])

In [34]:
df_results_after_adt.to_csv("radiation_tdate_tvalue_afterADT_all_121pts_results_new.csv", encoding='utf-8', index=False)

# Compute for the delta against the baseline 

In [35]:
df_delta=pd.read_excel("Results_radiation_cohort_121pts_Inital_ADT_windows_with_Tvalues_before_and_after_ADT_Feb1_2021.xls", header=0)

FileNotFoundError: [Errno 2] No such file or directory: 'Results_radiation_cohort_121pts_Inital_ADT_windows_with_Tvalues_before_and_after_ADT_Feb1_2021.xls'

In [36]:
df_delta[:3]

NameError: name 'df_delta' is not defined

In [None]:
list(df_delta.columns.values)

In [None]:
#for the pts with T value avai before ADT, get delta T values
df_delta.fillna('', inplace=True)
df_delta[df_delta['Tvalue_justbeforeADT'] == ''].index

In [None]:
df_with_tbeforeadt=df_delta[df_delta['Tdate_justbeforeADT'] != '']
shape(df_with_tbeforeadt)

In [None]:
df_with_tbeforeadt_and_after=df_with_tbeforeadt[df_with_tbeforeadt['Tvalue_justafterADT'] != '']
shape(df_with_tbeforeadt_and_after)

In [None]:
df_with_tbeforeadt_and_after[:5]

In [None]:
#create a column for the delta 
df_with_tbeforeadt_and_after['delta_Tvalue'] =df_with_tbeforeadt_and_after['Tvalue_justafterADT'] -df_with_tbeforeadt_and_after['Tvalue_justbeforeADT']
#create a column for the delta 
#df_with_tbeforeadt_and_after['delta_Tvalue_recovery'] =df_with_tbeforeadt_and_after['Tvalue_justafterADT'] -df_with_tbeforeadt_and_after['Tvalue_justbeforeADT']

In [None]:
df_with_tbeforeadt_and_after.mean()

In [None]:
df_with_tbeforeadt_and_after.Tvalue_justbeforeADT.std()

# Match T test date and value after ADT and assign T Recovery status 

In [None]:
y=12
df1_pt=df1[(df1["person_id"] == y)]
df2_pt=df2[(df2["person_id"] == y)]
b=df1_pt['inital_ADT_end_date'].to_list()[0]
df3_pt=df2_pt[df2_pt.test_results > 75][df2_pt.result_date >=b]
#df4_pt=df3_pt[df3_pt.result_date>=b]
# and df2_pt.result_date >=b

In [None]:
b

In [None]:
df2_pt=df2[(df2["person_id"] == 12)] 
df2_pt 

In [None]:
result_status_list=[]

for y in list_pts:     
#for y in list_pts_adt_table:
#for y in [12427]:
#for y in list_pts_adt_table: 
    
    df1_pt=df1[(df1["person_id"] == y)]
    
    if(df1_pt.inital_ADT_end_date.values.any()==""):        
        b='NaN'
        #gap='NaN'
        tdate='NaN'
        tvalue='NaN'
        status="NaN"
        N_t_dates='NaN'
        N_t_dates_afterADT='NaN'

    if(df1_pt.inital_ADT_end_date.values.any()!=""):
        b=df1_pt['inital_ADT_end_date'].to_list()[0]
        
        df2_pt=df2[(df2["person_id"] == y)]    
        each_pt_tdates = df2_pt['result_date'].to_list()

        ##get rid of nan
        cleaned_a=[]
        cleaned_a= [x for x in each_pt_tdates if str(x) != 'NaN']
        cleaned_aa= [x for x in cleaned_a if str(x) != 'nan']
        cleaned_aaa= [x for x in cleaned_aa if str(x) != 'NaT']
        N_t_dates=len(cleaned_aaa)

        #only keep the ones after ADT end date including on the day
        cleaned_aaaa= [x for x in cleaned_aaa if x >=b]
        start_sorted_bin=sorted(cleaned_aaaa)
        N_t_dates_afterADT=len(cleaned_aaaa)
   
        if(len(start_sorted_bin)==0):
            tdate='NaN' 
            tvalue="NaN"
            status="NaN"
#get the 75 
#get the 300
        if(len(start_sorted_bin)>0):
            df3_pt=df2_pt[df2_pt.test_results >=75][df2_pt.result_date >=b]
            df4_pt=df2_pt[df2_pt.test_results <75][df2_pt.result_date >=b]

            if not df3_pt.empty:
                status ="Y"
                tdate=df3_pt['result_date'].to_list()[0]
                tvalue=df3_pt['test_results'].to_list()[0]
            
            #if not df4_pt.empty:
            if df3_pt.empty:
                status ="N"
                tdate=df4_pt['result_date'].to_list()[-1]
                tvalue=df4_pt['test_results'].to_list()[-1]
    
    print(y, b, tdate, tvalue,status, N_t_dates,N_t_dates_afterADT)
    result_status_per_pt=[y,b,tdate, tvalue, status,N_t_dates,N_t_dates_afterADT]
    result_status_list.append(result_status_per_pt)

In [None]:
df_result_status=pd.DataFrame(result_status_list)
df_result_status.columns=(['person_id', 'inital_ADT_end_date', 'Tdate_rec_norec', 'Tvalue_rec_norec', 'Status',"N_T_dates","N_T_dates_afterADT"])

In [None]:
df_result_status

In [None]:
df_result_status.to_csv("radiation_tdate_tvalue_recovery_status_all_121pts_results_new.csv", encoding='utf-8', index=False)

# use T value >=300 as a critrion

In [None]:
result_status_list=[]

for y in list_pts:     
#for y in list_pts_adt_table:
#for y in [12427]:
#for y in list_pts_adt_table: 
    
    df1_pt=df1[(df1["person_id"] == y)]
    
    if(df1_pt.inital_ADT_end_date.values.any()==""):        
        b='NaN'
        #gap='NaN'
        tdate='NaN'
        tvalue='NaN'
        status="NaN"
        N_t_dates='NaN'
        N_t_dates_afterADT='NaN'

    if(df1_pt.inital_ADT_end_date.values.any()!=""):
        b=df1_pt['inital_ADT_end_date'].to_list()[0]
        
        df2_pt=df2[(df2["person_id"] == y)]    
        each_pt_tdates = df2_pt['result_date'].to_list()

        ##get rid of nan
        cleaned_a=[]
        cleaned_a= [x for x in each_pt_tdates if str(x) != 'NaN']
        cleaned_aa= [x for x in cleaned_a if str(x) != 'nan']
        cleaned_aaa= [x for x in cleaned_aa if str(x) != 'NaT']
        N_t_dates=len(cleaned_aaa)

        #only keep the ones after ADT end date including on the day
        cleaned_aaaa= [x for x in cleaned_aaa if x >=b]
        start_sorted_bin=sorted(cleaned_aaaa)
        N_t_dates_afterADT=len(cleaned_aaaa)
   
        if(len(start_sorted_bin)==0):
            tdate='NaN' 
            tvalue="NaN"
            status="NaN"
#get the 75 
#get the 300
        if(len(start_sorted_bin)>0):
            df3_pt=df2_pt[df2_pt.test_results >=300][df2_pt.result_date >=b]
            df4_pt=df2_pt[df2_pt.test_results <300][df2_pt.result_date >=b]

            if not df3_pt.empty:
                status ="Y"
                tdate=df3_pt['result_date'].to_list()[0]
                tvalue=df3_pt['test_results'].to_list()[0]
            
            #if not df4_pt.empty:
            if df3_pt.empty:
                status ="N"
                tdate=df4_pt['result_date'].to_list()[-1]
                tvalue=df4_pt['test_results'].to_list()[-1]
    
    #print(y, b, tdate, tvalue,status, N_t_dates,N_t_dates_afterADT)
    result_status_per_pt=[y,b,tdate, tvalue, status,N_t_dates,N_t_dates_afterADT]
    result_status_list.append(result_status_per_pt)

In [None]:
df_result_status300=pd.DataFrame(result_status_list)
df_result_status300.columns=(['person_id', 'inital_ADT_end_date', 'Tdate_rec_norec', 'Tvalue_rec_norec', 'Status',"N_T_dates","N_T_dates_afterADT"])
df_result_status300

In [None]:
df_rec_pt300=df_result_status300[df_result_status300.Status=="Y"]
df_nonrec_pt300=df_result_status300[df_result_status300.Status=="N"]

In [None]:
print(len(df_rec_pt300))
print(len(df_nonrec_pt300))

In [None]:
df_result_status300.to_csv("radiation_tdate_tvalue300_recovery_status_all_121pts_results_new.csv", encoding='utf-8', index=False)

In [None]:
df_result_status300['Time_to_rec/nonrec']=df_result_status300['Tdate_rec_norec']-df_result_status300['inital_ADT_end_date']

In [None]:
#create a column for the inital adt use window as days
df_result_status300['Time_to_rec/nonrec_days']=df_result_status300['Time_to_rec/nonrec'].dt.days

In [None]:
df_result_status300

In [None]:
df_rec_pt300=df_result_status300[df_result_status300.Status=="Y"]
df_nonrec_pt300=df_result_status300[df_result_status300.Status=="N"]

# plot

In [None]:
Time_to_rec= df_rec_pt300[["Time_to_rec/nonrec_days"]].reset_index()
Time_to_rec.fillna('', inplace=True)
ax=Time_to_rec.plot.scatter(x= "index", y="Time_to_rec/nonrec_days", c='DarkBlue')
print(Time_to_rec.mean())
print(Time_to_rec.median())

In [None]:
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=180

bins=np.arange(0, 1500, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(Time_to_rec["Time_to_rec/nonrec_days"], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("Time to Recovery, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("Time in Days", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 1500, 180))
ax.set_xticklabels(np.arange(0, 1500, 180))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

# Baseline Total Testosterone Histogram<a class="anchor" id="fifth-bullet"></a>

In [None]:
baseline_tt= df_with_tbeforeadt_and_after[["Tvalue_justbeforeADT"]].reset_index()
baseline_tt.fillna('', inplace=True)
ax=baseline_tt.plot.scatter(x= "index", y="Tvalue_justbeforeADT", c='DarkBlue')

In [None]:
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=75

bins=np.arange(0, 800, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(baseline_tt["Tvalue_justbeforeADT"], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("Baseline Total Testosterone, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("T-value", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 800, 100))
ax.set_xticklabels(np.arange(0, 800, 100))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
afterADT_tt= df_with_tbeforeadt_and_after[["Tvalue_justafterADT"]].reset_index()
afterADT_tt.fillna('', inplace=True)
ax=afterADT_tt.plot.scatter(x= "index", y="Tvalue_justafterADT", c='DarkBlue')

In [None]:
recovery_baseline=df_with_tbeforeadt_and_after[df_with_tbeforeadt_and_after['delta_Tvalue']>= 0]

In [None]:
recovery_baseline

In [None]:
recovery_baseline

In [None]:
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=75

bins=np.arange(0, 800, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(afterADT_tt["Tvalue_justafterADT"], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("Just After ADT Total Testosterone, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("T-value", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 800, 100))
ax.set_xticklabels(np.arange(0, 800, 100))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
delta_tt= df_with_tbeforeadt_and_after[['delta_Tvalue']].reset_index()
delta_tt.fillna('', inplace=True)
ax=delta_tt.plot.scatter(x= "index", y='delta_Tvalue', c='DarkBlue')
print(delta_tt.max())
print(delta_tt.min())
print(delta_tt.mean())

# Delta Total Testosterone Just Before and After ADT  <a class="anchor" id="sixth-bullet"></a>

In [None]:
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=75

bins=np.arange(-800, 50, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(delta_tt['delta_Tvalue'], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("Delta Total Testosterone, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("T-value", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(-800, 50, 100))
ax.set_xticklabels(np.arange(-800, 50, 100))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
# df_with_tbeforeadt_and_trecov=df_with_tbeforeadt[df_with_tbeforeadt['T recovery/nonrecovery value'] != '']
#shape(df_with_tbeforeadt_and_trecov)

In [None]:
df_with_tbeforeadt_and_trecov

In [None]:
list(df_with_tbeforeadt_and_trecov.columns.values)

In [None]:
df_with_tbeforeadt_and_trecov['delta_Tvalue_rec_NEW']=df_with_tbeforeadt_and_trecov['T recovery/nonrecovery value']-df_with_tbeforeadt_and_trecov['Tvalue_justbeforeADT']

In [None]:
df_with_tbeforeadt_and_trecov

In [None]:
delta_tt_rec= df_with_tbeforeadt_and_trecov[['delta_Tvalue_rec_NEW']].reset_index()
delta_tt_rec.fillna('', inplace=True)
ax=delta_tt_rec.plot.scatter(x= "index", y='delta_Tvalue_rec_NEW', c='DarkBlue')
print(delta_tt_rec.min())
print(delta_tt_rec.max())
print(delta_tt_rec.mean())
print(shape(delta_tt_rec))

In [None]:
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=75

bins=np.arange(-800, 50, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(delta_tt_rec['delta_Tvalue_rec_NEW'], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("T Recovery or Last Test T Value - Baseline T Value, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("T-value", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(-800, 50, 100))
ax.set_xticklabels(np.arange(-800, 50, 100))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
recovery_baseline_final=df_with_tbeforeadt_and_trecov[df_with_tbeforeadt_and_trecov['delta_Tvalue_rec_NEW']>= 0]
recovery_baseline_final
list(recovery_baseline_final.columns.values)

del_col=['last note date',
 'Gleasson score',
 'race',
 'age ',
 'dx_date',
 'DOB',
 'date_of_death',
 'age from dx_date',
 'age from last note date']

for col in del_col:
    del recovery_baseline_final[col]
print(recovery_baseline_final.shape)

In [None]:
del recovery_baseline_final["Inital_ADT_timewindow"]

In [None]:
recovery_baseline_final

# Time to Recovery <a class="anchor" id="seventh-bullet"></a>

In [None]:
df_delta_notempty=df_delta[df_delta['ADT_end_to_T_testdate'] != '']
#recovery group
#df_recovery=df_delta_notempty[df_delta_notempty['Staus']=='Y']
df_recovery=df_delta_notempty[df_delta_notempty['Staus']=='Y']

df_delta_notemptyfix=df_recovery[df_recovery['ADT_end_to_T_testdate']>= 0]

adt_to_t_gap_rec=df_delta_notemptyfix[['ADT_end_to_T_testdate']].reset_index()
adt_to_t_gap_rec.fillna('', inplace=True)

print(shape(adt_to_t_gap_rec))

ax= adt_to_t_gap_rec.plot.scatter(x= "index", y='ADT_end_to_T_testdate', c='DarkBlue')
print(adt_to_t_gap.min())
print(adt_to_t_gap.max())
print(adt_to_t_gap.mean())

print("median()")
print(adt_to_t_gap.median())

In [None]:
#mean value for the recovery group ADT end to T recovery time 
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=60

bins=np.arange(0, 800, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(adt_to_t_gap_rec['ADT_end_to_T_testdate'], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("Time to Recovery, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("Days", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 800, 60))
ax.set_xticklabels(np.arange(0, 800, 60))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
#a different T recovery critera 

df_delta_notempty=df_delta[df_delta['ADT_end_to_T_testdate'] != '']
#recovery group
#df_recovery=df_delta_notempty[df_delta_notempty['Staus']=='Y']
df_recovery=df_delta_notempty[df_delta_notempty['Staus']=='Y']
df_delta_notemptyfix=df_recovery[df_recovery['T recovery/nonrecovery value']>= 300]

adt_to_t_gap_rec=df_delta_notemptyfix[['ADT_end_to_T_testdate']].reset_index()
adt_to_t_gap_rec.fillna('', inplace=True)

print(shape(adt_to_t_gap_rec))

ax= adt_to_t_gap_rec.plot.scatter(x= "index", y='ADT_end_to_T_testdate', c='DarkBlue')
print(adt_to_t_gap_rec.min())
print(adt_to_t_gap_rec.max())
print(adt_to_t_gap_rec.mean())
print("median()")
print(adt_to_t_gap_rec.median())

In [None]:
#mean value for the recovery group ADT end to T recovery time 
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=60

bins=np.arange(0, 800, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(adt_to_t_gap_rec['ADT_end_to_T_testdate'], bins=bins,color="green", alpha=0.4)

#decorate

plt.title("Time to Recovery, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("Days", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 800, 60))
ax.set_xticklabels(np.arange(0, 800, 60))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.2
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
df_non_reco=pd.read_excel("non_recovery_group_with_last_note_date.xlsx", header=0)
df_reco=pd.read_excel("recovery_group_with_last_note_date.xlsx", header=0)

In [None]:
df_non_reco[:5]

In [None]:
df_non_reco.columns

In [None]:
df_reco['T recovery/nonrecovery value']

In [None]:
df_non_reco['followup_time']=df_non_reco['last note date']-df_non_reco['inital_ADT_end_date']
df_non_reco['followup_time_in_days']=df_non_reco['followup_time'].dt.days

In [None]:
df_reco['followup_time']=df_reco['last note date']-df_reco['inital_ADT_end_date']
df_reco['followup_time_in_days']=df_reco['followup_time'].dt.days

In [None]:
df_non_reco['followup_time_in_days']
df_non_reco_notempty=df_non_reco[df_non_reco['followup_time_in_days'].notnull()]
df_non_reco_notempty['followup_time_in_days']

In [None]:
#df_non_reco_notempty2['followup_time']
#df_non_reco_notempty2['followup_time_in_days']

In [None]:
#df_non_reco_notempty=df_non_reco[df_non_reco['followup_time']!= '']
df_non_reco_notempty=df_non_reco[df_non_reco['followup_time_in_days'].notnull()]
#df_non_reco_notempty2=df_non_reco_notempty[df_non_reco_notempty['followup_time_in_days']!='NaT']

non_rec_followup_time=df_non_reco_notempty[['followup_time_in_days']].reset_index()
print(shape(non_rec_followup_time))

ax=non_rec_followup_time.plot.scatter(x= "index", y='followup_time_in_days', c='DarkBlue')
print(non_rec_followup_time.min())
print(non_rec_followup_time.max())
print(non_rec_followup_time.mean())
print("median()")
print(non_rec_followup_time.median())

In [None]:
#mean value for the recovery group ADT end to T recovery time 
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=360

bins=np.arange(0, 2800, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(df_non_reco_notempty['followup_time_in_days'], bins=bins,color="green", alpha=0.4)
#decorate

plt.title("Non-Recovery Follow-up Time, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("Days", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 2800, 360))
ax.set_xticklabels(np.arange(0, 2800, 360))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.05
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
#df_reco['followup_time_in_days']=df_reco['followup_time'].dt.days

#df_reco_notempty=df_reco[df_reco['followup_time_in_days']!= '']
df_reco_notempty=df_reco[df_reco['followup_time_in_days'].notnull()]
#df_non_reco_notempty2=df_non_reco_notempty[df_non_reco_notempty['followup_time_in_days']!='NaT']

rec_followup_time=df_reco_notempty[['followup_time_in_days']].reset_index()
print(shape(rec_followup_time))

ax=rec_followup_time.plot.scatter(x= "index", y='followup_time_in_days', c='DarkBlue')
print(rec_followup_time.min())
print(rec_followup_time.max())
print(rec_followup_time.mean())
print("median()")
print(rec_followup_time.median())

In [None]:
df_reco_notempty['followup_time_in_days'][15:30]

In [None]:
#mean value for the recovery group ADT end to T recovery time 
###plot the T value histogram

sns.set_style("darkgrid")
sns.set_context("poster", font_scale=0.6, rc={"lines.linewidth": 2})
binsize=360

bins=np.arange(0, 2800, binsize)
fig, ax = plt.subplots(figsize = (8,5))

(n, bins, patches) = plt.hist(df_reco_notempty['followup_time_in_days'], bins=bins,color="green", alpha=0.4)
#decorate

plt.title("Recovery Group Follow-up Time, Bin Size="+ str(int(binsize)),fontsize=18)
plt.xlabel("Days", fontsize=16, fontweight="bold",color="gray")
plt.ylabel("Population", fontsize=16,fontweight="bold", color="gray")

ax.set_xticks(np.arange(0, 2800, 360))
ax.set_xticklabels(np.arange(0, 2800, 360))

# plot the label/text to each bin
for i in range(0, len(n)):
    x_pos = (bins[i + 1] - bins[i])/4 + bins[i]
    y_pos = n[i] + 0.05
    label = str(int(n[i])) # relative frequency of each bin
    ax.text(x_pos, y_pos, label,fontsize=16,fontweight="bold",color="gray" )
    ax.xaxis.tick_bottom()

In [None]:
rec_followup_time

In [None]:
#Merge two dataframes

merged_df = pd.concat([rec_followup_time, non_rec_followup_time])

In [None]:
merged_df.followup_time_in_days.median()