In [1]:
import pandas as pd
import numpy as np
import scipy
from tableone import TableOne
%load_ext autoreload
%autoreload 2

# Load Epitope Database

In [2]:
path_cohort = '~/Global Data/KaplanMeier/DESAsurvival_original.csv'
df_cohort = pd.read_csv(path_cohort, sep=';', decimal=',')

# path_cohort = '~/UMCUtrecht/ProcessedData/procare_1_cohort/procare_1_cohort.csv'
# path_cohort = '~/UMCUtrecht/ProcessedData/procare_1_cohort/procare_1_cohort.xlsx'
path_antibody = '~/Global Data/20211104_mismatch_ep_db.pickle' # Danial\Global Data\
df_abs = pd.read_pickle(path_antibody)
# There is also another DESA column in the df_cohort.
# df_cohort = pd.read_excel(path_cohort, skiprows=1)

df = df_abs.merge(df_cohort, on='TransplantID')

# Missing CIPs
 The data set procare_1_cohort does not consider the imputed CIP hours. While the dataset DESAsurvival_original does consider the missing values

##  Table 1

### Revision - Short communication

In [3]:
df['DESA_Status'].value_counts()

No MFI         2840
No DESA         853
No HLAE-Abs     558
DESA            439
Name: DESA_Status, dtype: int64

In [4]:
# selected variables
cols = [
    'DESA_Status', 
    'RecipientAge_NOTR', 'RecipientSex_NOTR', 'DialysisYears',
    'DonorAge_NOTR', 'DonorSex_NOTR', 
    # 'TypeOfDonor_NOTR',
    'TypeCadaveric_NOTR', 
    'CIPHour_DBD', 'CIPHour_DCD',
    'CIPHour',
    'Retransplant', 
    'CurrentPRA_NOTR',
    'HighPRA_NOTR',
    'ALG_ATG_OKT3_AntiPanTMoAb',
    'IL2rMoAb_T0',
    'NumMismatch_ABDR_broad',
    'Steroids_T0', 'MMF_T0', 'Aza_T0', 'Sirolimus_T0', 'Cyclosporin_T0', 'Tacrolimus_T0', 'Unknown_T0', 'Others_T0',
    'DialysisYN',
    
]
df_t1 = df[cols]
#df_t1['DESA_Status'] = df_t1['DESA_Status'].apply(lambda x:'DESA' if x == 'DESA' else 'No DESA') # submitted
df_t1['DESA_Status'] = df_t1['DESA_Status'].apply(lambda x:'No HLAE-Abs' if x == 'No MFI' else x) # revision; distinguish no DESA with HLA-abs from no HLA-abs
df_t1['TypeCadaveric_NOTR'].fillna('Living', inplace=True)

df_t1['CIPDeceadDonor'] = (df_t1['CIPHour_DBD'] + df_t1['CIPHour_DCD']).replace(0, np.nan)
for g, v in df.groupby('TypeOfDonor_NOTR')['CIPHour']:
    if g == 'Living':
        df_t1['CIPLivingDonor'] = v
df_t1.drop(['CIPHour_DBD', 'CIPHour_DCD', 'CIPHour'], axis =1, inplace=True)
df_t1['MMF/azathioprine'] = df[['MMF_T0', 'Aza_T0']].apply(lambda x: 'Yes'  if ((x[0] == 'Yes') | (x[1] == 'Yes')) else 'No', axis=1)
df_t1['Cyclosporin/Tacrolimus'] = df[['Cyclosporin_T0', 'Tacrolimus_T0']].apply(lambda x: 'Yes'  if ((x[0] == 'Yes') | (x[1] == 'Yes')) else 'No', axis=1)
df_t1.drop(['Cyclosporin_T0', 'Tacrolimus_T0', 'MMF_T0', 'Aza_T0'], axis =1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t1['DESA_Status'] = df_t1['DESA_Status'].apply(lambda x:'No HLAE-Abs' if x == 'No MFI' else x) # revision; distinguish no DESA with HLA-abs from no HLA-abs
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t1['CIPDeceadDonor'] = (df_t1['CIPHour_DBD'] + df_t1['CIPHour_DC

In [5]:
df_t1['DESA_Status'].value_counts()

No HLAE-Abs    3398
No DESA         853
DESA            439
Name: DESA_Status, dtype: int64

In [6]:
#df_t1.to_csv('/Users/Danial/UMCUtrecht/ProcessedData/tableone.csv')

In [7]:
'MMF/azathioprine' in df_t1.columns
'Cyclosporin/Tacrolimus' in df_t1.columns

True

In [8]:
# columns containing categorical variables
categorical = ['RecipientSex_NOTR', 'DonorSex_NOTR', 
'IL2rMoAb_T0', 'ALG_ATG_OKT3_AntiPanTMoAb',
'Retransplant', 'TypeCadaveric_NOTR', 'DialysisYN',
'Steroids_T0', 'Sirolimus_T0', 'MMF/azathioprine', 'Cyclosporin/Tacrolimus', 'Unknown_T0', 'Others_T0',
]

# optionally, a categorical variable for stratification
groupby = ['DESA_Status']

# rename the death column
labels={
    'DonorAge_NOTR': 'Donor Age',
    'DonorSex_NOTR': 'Donor Sex',
    'RecipientAge_NOTR': 'Recipiet Age',
    'RecipientSex_NOTR': 'Recipiet Sex',
    'DialysisYears': 'Time on dialysis, y',
    'TypeCadaveric_NOTR': 'Type of Donor',
    'CIPDeceadDonor': ' Cold Ischemia Time Deceased Donors', 
    'CIPLivingDonor': ' Cold Ischemia Time Living Donors', 
    'CurrentPRA_NOTR': 'PRA at time of transplant, %',
    'HighPRA_NOTR': 'Highest PRA, %',
    'NumMismatch_ABDR_broad': 'HLA-A/B/DR broad mismatches, mean ± SD', 
    'Dialysis': 'DialysisYN',
}

continuous = {
    'DonorAge_NOTR', 'RecipientAge_NOTR', 'CIPDeceadDonor', 'DialysisYears',
    'CIPLivingDonor', 'CurrentPRA_NOTR', 'HighPRA_NOTR', 'NumMismatch_ABDR_broad',
    }
# htest = {var:scipy.stats.mannwhitneyu for var in continuous}

htest = {'RecipientAge_NOTR': scipy.stats.mannwhitneyu}

# link to documentation https://github.com/tompollard/tableone/blob/master/tableone/tableone.py

table1 = TableOne(
    df_t1, 
    groupby=groupby,
    missing=True,
    categorical=categorical,
    pval=True,
    rename=labels,
    htest_name=True,
    # htest=htest,
)
table1

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,DESA,No DESA,No HLAE-Abs,P-Value,Test
n,,,4690,439,853,3398,,
"Recipiet Age, mean (SD)",,0.0,45.4 (14.4),44.6 (13.7),45.1 (13.7),45.6 (14.6),0.270,One-way ANOVA
"Recipiet Sex, n (%)",Female,0.0,1879 (40.1),267 (60.8),497 (58.3),1115 (32.8),<0.001,Chi-squared
"Recipiet Sex, n (%)",Male,,2811 (59.9),172 (39.2),356 (41.7),2283 (67.2),,
"Time on dialysis, y, mean (SD)",,0.0,2.8 (2.5),3.3 (3.0),3.4 (3.0),2.5 (2.2),<0.001,One-way ANOVA
"Donor Age, mean (SD)",,0.0,44.3 (15.0),45.1 (15.3),43.5 (14.6),44.4 (15.1),0.163,One-way ANOVA
"Donor Sex, n (%)",Female,0.0,2366 (50.4),193 (44.0),412 (48.3),1761 (51.8),0.003,Chi-squared
"Donor Sex, n (%)",Male,,2324 (49.6),246 (56.0),441 (51.7),1637 (48.2),,
"Type of Donor, n (%)",Heartbeating,0.0,2427 (51.7),248 (56.5),551 (64.6),1628 (47.9),<0.001,Chi-squared
"Type of Donor, n (%)",Living,,1455 (31.0),124 (28.2),184 (21.6),1147 (33.8),,


### Revision - DESA paper

In [9]:
# selected variables
cols = [
    'DESA_Status', 
    'RecipientAge_NOTR', 'RecipientSex_NOTR', 'DialysisYears',
    'DonorAge_NOTR', 'DonorSex_NOTR', 
    # 'TypeOfDonor_NOTR',
    'TypeCadaveric_NOTR', 
    'CIPHour_DBD', 'CIPHour_DCD',
    'CIPHour',
    'Retransplant', 
    'CurrentPRA_NOTR',
    'HighPRA_NOTR',
    'ALG_ATG_OKT3_AntiPanTMoAb',
    'IL2rMoAb_T0',
    'NumMismatch_ABDR_broad',
    'Steroids_T0', 'MMF_T0', 'Aza_T0', 'Sirolimus_T0', 'Cyclosporin_T0', 'Tacrolimus_T0', 'Unknown_T0', 'Others_T0',
    'DialysisYN',
    
]
df_t1 = df[cols]
df_t1['DESA_Status'] = df_t1['DESA_Status'].apply(lambda x:'DESA' if x == 'DESA' else 'No DESA') # submitted
df_t1['TypeCadaveric_NOTR'].fillna('Living', inplace=True)

df_t1['CIPDeceadDonor'] = (df_t1['CIPHour_DBD'] + df_t1['CIPHour_DCD']).replace(0, np.nan)
for g, v in df.groupby('TypeOfDonor_NOTR')['CIPHour']:
    if g == 'Living':
        df_t1['CIPLivingDonor'] = v
df_t1.drop(['CIPHour_DBD', 'CIPHour_DCD', 'CIPHour'], axis =1, inplace=True)
df_t1['MMF/azathioprine'] = df[['MMF_T0', 'Aza_T0']].apply(lambda x: 'Yes'  if ((x[0] == 'Yes') | (x[1] == 'Yes')) else 'No', axis=1)
df_t1['Cyclosporin/Tacrolimus'] = df[['Cyclosporin_T0', 'Tacrolimus_T0']].apply(lambda x: 'Yes'  if ((x[0] == 'Yes') | (x[1] == 'Yes')) else 'No', axis=1)
df_t1.drop(['Cyclosporin_T0', 'Tacrolimus_T0', 'MMF_T0', 'Aza_T0'], axis =1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t1['DESA_Status'] = df_t1['DESA_Status'].apply(lambda x:'DESA' if x == 'DESA' else 'No DESA') # submitted
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_t1['CIPDeceadDonor'] = (df_t1['CIPHour_DBD'] + df_t1['CIPHour_DCD']).replace(0, np.nan)
A value is trying to be se

In [10]:
df_t1['DESA_Status'].value_counts()

No DESA    4251
DESA        439
Name: DESA_Status, dtype: int64

In [11]:
# columns containing categorical variables
categorical = ['RecipientSex_NOTR', 'DonorSex_NOTR', 
'IL2rMoAb_T0', 'ALG_ATG_OKT3_AntiPanTMoAb',
'Retransplant', 'TypeCadaveric_NOTR', 'DialysisYN',
'Steroids_T0', 'Sirolimus_T0', 'MMF/azathioprine', 'Cyclosporin/Tacrolimus', 'Unknown_T0', 'Others_T0',
]

# optionally, a categorical variable for stratification
groupby = ['DESA_Status']

# rename the death column
labels={
    'DonorAge_NOTR': 'Donor Age',
    'DonorSex_NOTR': 'Donor Sex',
    'RecipientAge_NOTR': 'Recipiet Age',
    'RecipientSex_NOTR': 'Recipiet Sex',
    'DialysisYears': 'Time on dialysis, y',
    'TypeCadaveric_NOTR': 'Type of Donor',
    'CIPDeceadDonor': ' Cold Ischemia Time Deceased Donors', 
    'CIPLivingDonor': ' Cold Ischemia Time Living Donors', 
    'CurrentPRA_NOTR': 'PRA at time of transplant, %',
    'HighPRA_NOTR': 'Highest PRA, %',
    'NumMismatch_ABDR_broad': 'HLA-A/B/DR broad mismatches, mean ± SD', 
    'Dialysis': 'DialysisYN',
}

continuous = {
    'DonorAge_NOTR', 'RecipientAge_NOTR', 'CIPDeceadDonor', 'DialysisYears',
    'CIPLivingDonor', 'CurrentPRA_NOTR', 'HighPRA_NOTR', 'NumMismatch_ABDR_broad',
    }
# htest = {var:scipy.stats.mannwhitneyu for var in continuous}

htest = {'RecipientAge_NOTR': scipy.stats.mannwhitneyu}

# link to documentation https://github.com/tompollard/tableone/blob/master/tableone/tableone.py

table1 = TableOne(
    df_t1, 
    groupby=groupby,
    missing=True,
    categorical=categorical,
    pval=True,
    rename=labels,
    htest_name=True,
    # htest=htest,
)
table1

Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status,Grouped by DESA_Status
Unnamed: 0_level_1,Unnamed: 1_level_1,Missing,Overall,DESA,No DESA,P-Value,Test
n,,,4690,439,4251,,
"Recipiet Age, mean (SD)",,0.0,45.4 (14.4),44.6 (13.7),45.5 (14.4),0.168,Two Sample T-test
"Recipiet Sex, n (%)",Female,0.0,1879 (40.1),267 (60.8),1612 (37.9),<0.001,Chi-squared
"Recipiet Sex, n (%)",Male,,2811 (59.9),172 (39.2),2639 (62.1),,
"Time on dialysis, y, mean (SD)",,0.0,2.8 (2.5),3.3 (3.0),2.7 (2.4),<0.001,Two Sample T-test
"Donor Age, mean (SD)",,0.0,44.3 (15.0),45.1 (15.3),44.2 (15.0),0.229,Two Sample T-test
"Donor Sex, n (%)",Female,0.0,2366 (50.4),193 (44.0),2173 (51.1),0.005,Chi-squared
"Donor Sex, n (%)",Male,,2324 (49.6),246 (56.0),2078 (48.9),,
"Type of Donor, n (%)",Heartbeating,0.0,2427 (51.7),248 (56.5),2179 (51.3),0.111,Chi-squared
"Type of Donor, n (%)",Living,,1455 (31.0),124 (28.2),1331 (31.3),,


In [12]:
from scipy.stats import mannwhitneyu, chi2_contingency, fisher_exact
def mannwhitneyu_test(data, column,  **kwargs):
    desa_group = data[data.DESA_Status == 'DESA'][column]
    print(round(desa_group.mean(), 2), round(desa_group.std(), 2))
    no_desa_group = data[data.DESA_Status == 'No DESA'][column]
    print(round(no_desa_group.mean(), 2), round(no_desa_group.std(), 2))
    print(mannwhitneyu(desa_group, no_desa_group,  **kwargs))

def chi2_test(data, column, fisher_test:bool=False, **kwargs):
    # https://pythonfordatascienceorg.wordpress.com/chi-square-python/
    # The H0 (Null Hypothesis): There is no relationship between variable one and variable two.
    # The H1 (Alternative Hypothesis): There is a relationship between variable 1 and variable 2.
    
    group_by = data['DESA_Status']
    cat_var = data[column]
    crosstab = pd.crosstab(group_by, cat_var)
    if fisher_test:
        print(fisher_exact(crosstab))
    else:
        print(chi2_contingency(crosstab, **kwargs))
    return crosstab


In [13]:
mannwhitneyu_test(df_t1, 'RecipientAge_NOTR')

44.58 13.7
45.53 14.42
MannwhitneyuResult(statistic=891817.5, pvalue=0.12637953889452627)


In [14]:
mannwhitneyu_test(df_t1, 'DonorAge_NOTR')

45.14 15.35
44.21 14.99
MannwhitneyuResult(statistic=979342.0, pvalue=0.08678171354691977)


In [15]:
mannwhitneyu_test(df_t1, 'DialysisYears')

3.3 3.04
2.7 2.4
MannwhitneyuResult(statistic=1021561.5, pvalue=0.0010447064879497146)


In [16]:

mannwhitneyu_test(df_t1, 'NumMismatch_ABDR_broad', nan_policy='omit')

2.27 1.38
2.34 1.47


TypeError: mannwhitneyu() got an unexpected keyword argument 'nan_policy'

In [None]:
chi2_test(df_t1, 'Steroids_T0', correction=False)

(4.666630025856633, 0.09697374444640541, 2, array([[   9.82835821,  429.17164179],
       [  31.58955224, 1379.41044776],
       [  63.58208955, 2776.41791045]]))


Steroids_T0,No,Yes
DESA_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
DESA,16,423
No DESA,32,1379
No MFI,57,2783


In [None]:
chi2_test(df_t1, 'Sirolimus_T0', correction=False)

(1.653340774367591, 0.4375035813899528, 2, array([[ 412.41663113,   26.58336887],
       [1325.55778252,   85.44221748],
       [2668.02558635,  171.97441365]]))


Sirolimus_T0,No,Yes
DESA_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
DESA,416,23
No DESA,1332,79
No MFI,2658,182


In [None]:
chi2_test(df_t1, 'MMF/azathioprine', correction=False)

(3.2207196677503624, 0.19981570067958493, 2, array([[ 104.18059701,  334.81940299],
       [ 334.84925373, 1076.15074627],
       [ 673.97014925, 2166.02985075]]))


MMF/azathioprine,No,Yes
DESA_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
DESA,89,350
No DESA,338,1073
No MFI,686,2154


In [None]:
chi2_test(df_t1, 'Unknown_T0', fisher_test=True)

ValueError: The input `table` must be of shape (2, 2).

In [None]:
chi2_test(df_t1, 'Others_T0', correction=False)

(10.087967861596132, 0.00149240598008464, 1, array([[ 382.83795309,   56.16204691],
       [3707.16204691,  543.83795309]]))


Others_T0,No,Yes
DESA_Status,Unnamed: 1_level_1,Unnamed: 2_level_1
DESA,404,35
No DESA,3686,565


# Tineke Questions:

A. number of patients not on dialysis (no) is higher than in the previous DSA cohort, while our cohort is slightly smaller… So I expect to have the same or less numbers compared to the DSA cohort. The same for the unknowns. Are the numbers correct?

* I was able to replicate the AJT 2018 table for most of variables. For Dialysis type, however, I found slightly different count numbers. If we compare our paper with teh replication they should make sense. 

-------------------------
Replication of AJT 2018:            
Yes_haemo    2441
Yes_peri     1712
No            542
Unknown        29

-------------------------
AJT 2018
Yes_haemo    2472
Yes_peri     1715
No            515
Unknown        22

B. in the DSA cohort, differences between groups for age at transplant (recipient) resulted in a p-value 0.01, while in our cohort p-value is 0.126. Is this p-value correct? The mean +/- SD are quite similar between both cohorts, so I’m asking just to be sure.

* I repeated the analysis and got the same result again
*  As you see for both groups the recipient age means (45.5 vs 44.6) are closer compared to AJT 2018 paper (45.6 vs 44.2). Thus a larger p-value (age distributions are similar) is expected which means there is not a significant age distributions difference .

C. You’ve used the Fisher’s exact test to compare unknown initial immunosuppression instead of the Chi-square test. Is there a reason why you did that? And again in comparison to the DSA cohort, there is a huge difference in p-value (.47 in DSA, 0.056 in DESA cohort), while the numbers are quite similar. Is this also correct?

* The method that I used to create the table automatically used a Fisher test under the hood. I think this is because of the count of items <5 see the below explenations. 
* The difference is explained from large difference on the counts of No's (Although Yes's have similar counts). The count difference on No is alos because our minority group is smaller than the AJT 2018 paper. 

"The conventional rule of thumb is that if all of the expected numbers are greater than 5, it's acceptable to use the chi-square or G–test; if an expected number is less than 5, you should use an alternative, such as an exact test of goodness-of-fit or a Fisher's exact test of independence", [http://www.biostathandbook.com/small.html]


D. The same for the comparison of MMF/Azathioprine and Cyclosporine/tacrolimus. Also very different from the DSA cohort…
* I was able to replicate the AJT 2018 table for all of immunosupressive regiems. The counts were quite the same except the Others (with a tinyy/couple of count difference). The p-values were also the same except MMF/Azathioprine (I found 0.34 AJT 2018 paper found 0.20).
* I redid the p-values and found  