# Table 2: Admissions Demographics

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import statistics
import numpy as np
import scipy.stats as st
from scipy import stats
alpha = .05
import statsmodels.api as sm

In [2]:
admissions = pd.read_excel('data/aml_data_8.16.23.xlsx')
len(admissions)

480

In [3]:
df = pd.DataFrame()

## get_pval_b func

In [4]:
def get_pval_b(field):
    contingency_table = pd.crosstab(index=admissions['infection_present'], columns=admissions[field])
    res = stats.barnard_exact([contingency_table[0],contingency_table[1]])
    return res

In [5]:
def get_pval_b2(field):
    contingency_table = pd.crosstab(index=admissions['infection_present'], columns=admissions[field])
    res = stats.boschloo_exact([contingency_table[0],contingency_table[1]])
    return res

## get pval_median func

In [6]:
def get_pval_median(field):
    result_tot_pat = stats.shapiro(admissions[field])
    if result_tot_pat.pvalue < alpha:
        #mann whitney
        tot_pat_res = stats.mannwhitneyu(neg_ads[field],pos_ads[field])
    else:
        #ttest
        tot_pat_res = stats.ttest_ind(neg_ads[field],pos_ads[field],nan_policy='omit')

    return tot_pat_res

## get_stats func

In [7]:
def get_stats(field,val):
    return [len(neg_ads[neg_ads[field]==val]),
    len(neg_ads[neg_ads[field]==val])/len(neg_ads[field]),
    len(pos_ads[pos_ads[field]==val]),
    len(pos_ads[pos_ads[field]==val])/len(pos_ads[field]), 
    len(admissions[admissions[field]==val]),
    len(admissions[admissions[field]==val])/len(admissions[field])] 

### Num admissions

In [8]:
pos_ads = admissions[admissions['infection_present'] == 1]
print('There are '+str(len(pos_ads))+' infection-positive admissions.')
print('Percentage pos '+str(len(pos_ads)/len(admissions)) )
print('There are '+str(len(pos_ads.MRN.unique()))+' patients that were infected.')
#pos_ads.head()
neg_ads = admissions[admissions['infection_present'] == 0]
print('There are '+str(len(neg_ads))+' infection-negative admissions.')
print('Percentage negative '+str(len(neg_ads)/len(admissions)) )
#neg_ads.head()

There are 91 infection-positive admissions.
Percentage pos 0.18958333333333333
There are 52 patients that were infected.
There are 389 infection-negative admissions.
Percentage negative 0.8104166666666667


In [9]:
len(admissions.MRN.unique())

95

### Sex

In [10]:
admissions['male'] = admissions['gender'].map({'M':1, 'F':0})
neg_ads['male'] = neg_ads['gender'].map({'M':1, 'F':0})
pos_ads['male'] = pos_ads['gender'].map({'M':1, 'F':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_ads['male'] = neg_ads['gender'].map({'M':1, 'F':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_ads['male'] = pos_ads['gender'].map({'M':1, 'F':0})


In [11]:
df["Male"] = get_stats('male',1)
df["Female Stats"] = get_stats('male',0)
df

Unnamed: 0,Male,Female Stats
0,199.0,190.0
1,0.511568,0.488432
2,54.0,37.0
3,0.593407,0.406593
4,253.0,227.0
5,0.527083,0.472917


In [12]:
get_pval_b('male')

BarnardExactResult(statistic=1.407668615251027, pvalue=0.16149048043027456)

### Race

In [13]:
df["White Stats"] = get_stats('white_caucasian',1)
df["Non White Stats"] = get_stats('white_caucasian',0)
df

Unnamed: 0,Male,Female Stats,White Stats,Non White Stats
0,199.0,190.0,265.0,124.0
1,0.511568,0.488432,0.681234,0.318766
2,54.0,37.0,69.0,22.0
3,0.593407,0.406593,0.758242,0.241758
4,253.0,227.0,334.0,146.0
5,0.527083,0.472917,0.695833,0.304167


In [14]:
get_pval_b('white_caucasian')

BarnardExactResult(statistic=1.4374783486931773, pvalue=0.15925045245399355)

### Diagnosis

In [15]:
df["AML"] = get_stats('AML_diag','AML')
#df["Down Syd AML"] = get_stats('AML_diag','Down Syd AML')
df["2nd AML"] = get_stats('AML_diag','2nd AML')
#df["APML"] = get_stats('AML_diag','APML')
df

Unnamed: 0,Male,Female Stats,White Stats,Non White Stats,AML,2nd AML
0,199.0,190.0,265.0,124.0,371.0,18.0
1,0.511568,0.488432,0.681234,0.318766,0.953728,0.046272
2,54.0,37.0,69.0,22.0,85.0,6.0
3,0.593407,0.406593,0.758242,0.241758,0.934066,0.065934
4,253.0,227.0,334.0,146.0,456.0,24.0
5,0.527083,0.472917,0.695833,0.304167,0.95,0.05


In [16]:
one_hot = pd.get_dummies(admissions['AML_diag'])
admissions = admissions.join(one_hot)

In [17]:
pos_ads = admissions[admissions['infection_present'] == 1]
neg_ads = admissions[admissions['infection_present'] == 0]

In [18]:
get_pval_median('AML')

MannwhitneyuResult(statistic=18047.5, pvalue=0.43963874286188487)

In [19]:
get_pval_median('2nd AML')

MannwhitneyuResult(statistic=17351.5, pvalue=0.43963874286188487)

In [20]:
get_pval_b('AML')

BarnardExactResult(statistic=-0.774722767706877, pvalue=0.44617032447582883)

### Age

In [21]:
neg_ads.age.describe()

count    389.000000
mean       7.901054
std        5.688102
min        0.000000
25%        2.140000
50%        7.150000
75%       13.570000
max       19.200000
Name: age, dtype: float64

In [22]:
pos_ads.age.describe()

count    91.000000
mean      9.468571
std       5.734161
min       0.930000
25%       3.845000
50%       9.810000
75%      14.500000
max      19.220000
Name: age, dtype: float64

In [23]:
admissions.age.describe()

count    480.000000
mean       8.198229
std        5.723995
min        0.000000
25%        2.237500
50%        8.310000
75%       13.812500
max       19.220000
Name: age, dtype: float64

In [24]:
get_pval_median('age')

MannwhitneyuResult(statistic=14955.5, pvalue=0.021268460584726173)

### First BMI

In [25]:
neg_ads.first_bmi_kg_m2.describe()

count    388.000000
mean      18.956477
std        5.196454
min       10.000000
25%       15.800000
50%       17.400000
75%       20.000000
max       44.600000
Name: first_bmi_kg_m2, dtype: float64

In [26]:
pos_ads.first_bmi_kg_m2.describe()

count    91.000000
mean     20.128571
std       6.374538
min      13.500000
25%      16.350000
50%      17.900000
75%      21.850000
max      46.600000
Name: first_bmi_kg_m2, dtype: float64

In [27]:
admissions.first_bmi_kg_m2.describe()

count    479.000000
mean      19.179151
std        5.452075
min       10.000000
25%       15.800000
50%       17.500000
75%       20.250000
max       46.600000
Name: first_bmi_kg_m2, dtype: float64

In [28]:
get_pval_median('first_bmi_kg_m2')

Ttest_indResult(statistic=-1.8504100414390119, pvalue=0.06487261023086215)

### Neutropenia

In [29]:
neg_ads.neutropenia.sum()

319

In [30]:
neg_ads.neutropenia.sum()/len(neg_ads)

0.8200514138817481

In [31]:
pos_ads.neutropenia.sum()

88

In [32]:
pos_ads.neutropenia.sum()/len(pos_ads)

0.967032967032967

In [33]:
admissions.neutropenia.sum()

407

In [34]:
admissions.neutropenia.sum()/len(admissions)

0.8479166666666667

### Lowest ANC

In [35]:
neg_ads.lowest_neutrophil.describe()

count    386.000000
mean       0.621259
std        2.055551
min        0.000000
25%        0.002000
50%        0.008000
75%        0.034000
max       17.978000
Name: lowest_neutrophil, dtype: float64

In [36]:
pos_ads.lowest_neutrophil.describe()

count    91.000000
mean      0.187516
std       1.464885
min       0.000000
25%       0.000000
50%       0.003000
75%       0.007000
max      13.899000
Name: lowest_neutrophil, dtype: float64

In [37]:
admissions.lowest_neutrophil.describe()

count    477.000000
mean       0.538512
std        1.962742
min        0.000000
25%        0.001000
50%        0.007000
75%        0.022000
max       17.978000
Name: lowest_neutrophil, dtype: float64

In [38]:
get_pval_median('lowest_neutrophil')

Ttest_indResult(statistic=1.9015782025002501, pvalue=0.05783110469338381)

### Lowest Platelet

In [39]:
neg_ads.lowest_platelet.describe()

count     386.000000
mean       76.606218
std       100.375867
min         0.000000
25%        10.000000
50%        27.000000
75%       110.750000
max      1176.000000
Name: lowest_platelet, dtype: float64

In [40]:
pos_ads.lowest_platelet.describe()

count     91.000000
mean      42.934066
std       47.516033
min        1.000000
25%       10.000000
50%       11.000000
75%      101.500000
max      154.000000
Name: lowest_platelet, dtype: float64

In [41]:
admissions.lowest_platelet.describe()

count     477.000000
mean       70.182390
std        93.549173
min         0.000000
25%        10.000000
50%        15.000000
75%       108.000000
max      1176.000000
Name: lowest_platelet, dtype: float64

In [42]:
get_pval_median('lowest_platelet')

Ttest_indResult(statistic=3.116921895787596, pvalue=0.0019382832527590113)

### Lowest Hemoglobin

In [43]:
neg_ads.lowest_hemoglobin.describe()

count    385.000000
mean       9.905455
std        1.487078
min        3.700000
25%       10.000000
50%       10.100000
75%       10.300000
max       16.100000
Name: lowest_hemoglobin, dtype: float64

In [44]:
pos_ads.lowest_hemoglobin.describe()

count    90.000000
mean     10.191111
std       1.128253
min       4.700000
25%      10.000000
50%      10.100000
75%      10.300000
max      16.400000
Name: lowest_hemoglobin, dtype: float64

In [45]:
admissions.lowest_hemoglobin.describe()

count    475.000000
mean       9.959579
std        1.429366
min        3.700000
25%       10.000000
50%       10.100000
75%       10.300000
max       16.400000
Name: lowest_hemoglobin, dtype: float64

In [46]:
get_pval_median('lowest_hemoglobin')

Ttest_indResult(statistic=-1.7103526748858844, pvalue=0.08785641594979429)

### Lowest Monocytes

In [47]:
neg_ads.lowest_monocytes.describe()

count    383.000000
mean       2.621410
std        5.563556
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max       35.000000
Name: lowest_monocytes, dtype: float64

In [48]:
pos_ads.lowest_monocytes.describe()

count    91.000000
mean      0.879121
std       3.087016
min       0.000000
25%       0.000000
50%       0.000000
75%       0.000000
max      19.000000
Name: lowest_monocytes, dtype: float64

In [49]:
admissions.lowest_monocytes.describe()

count    474.000000
mean       2.286920
std        5.223337
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max       35.000000
Name: lowest_monocytes, dtype: float64

In [50]:
get_pval_median('lowest_monocytes')

Ttest_indResult(statistic=2.882252994492411, pvalue=0.004128497887636125)

### LOS

In [51]:
neg_ads.LOS.describe()

count    389.000000
mean      21.720748
std       12.863455
min        0.351389
25%        9.462500
50%       24.288194
75%       29.109722
max       74.301389
Name: LOS, dtype: float64

In [52]:
pos_ads.LOS.describe()

count    91.000000
mean     28.954792
std      12.508978
min       2.186806
25%      24.241319
50%      28.404167
75%      33.135764
max      81.857639
Name: LOS, dtype: float64

In [53]:
admissions.LOS.describe()

count    480.000000
mean      23.092202
std       13.095427
min        0.351389
25%       18.257292
50%       25.225347
75%       29.979688
max       81.857639
Name: LOS, dtype: float64

In [54]:
get_pval_median('LOS')

MannwhitneyuResult(statistic=11970.0, pvalue=1.5129548588987548e-06)

In [55]:
# double check
stats.ttest_ind(neg_ads['LOS'],pos_ads['LOS'],nan_policy='omit')

Ttest_indResult(statistic=-4.854361901487121, pvalue=1.638293272194672e-06)

### Num PICU visits

In [56]:
print(len(neg_ads[neg_ads['icu_los']>0]))
#len(neg_ads[neg_ads['icu_los']>0])/len(neg_ads)
neg_ads.icu_visit.sum()/len(neg_ads)

34


0.08740359897172237

In [57]:
print(len(pos_ads[pos_ads['icu_los']>0]))
len(pos_ads[pos_ads['icu_los']>0])/len(pos_ads)

20


0.21978021978021978

In [58]:
print(len(admissions[admissions['icu_los']>0]))
len(admissions[admissions['icu_los']>0])/len(admissions)

54


0.1125

### PICU LOS

In [59]:
neg_ads.icu_los.describe()

count    34.000000
mean      6.633824
std       8.660392
min       0.040000
25%       1.222500
50%       2.935000
75%       7.155000
max      41.030000
Name: icu_los, dtype: float64

In [60]:
pos_ads.icu_los.describe()

count    21.000000
mean      8.742381
std      12.122500
min       0.000000
25%       1.610000
50%       3.160000
75%      10.440000
max      48.850000
Name: icu_los, dtype: float64

In [61]:
admissions.icu_los.describe()

count    55.000000
mean      7.438909
std      10.066350
min       0.000000
25%       1.245000
50%       3.080000
75%       9.250000
max      48.850000
Name: icu_los, dtype: float64

In [62]:
get_pval_median('icu_los')

Ttest_indResult(statistic=-0.7516658720374025, pvalue=0.45557706941736853)

## Medications

In [63]:
admissions.dropna(subset=['cytarabine'],inplace=True)
admissions.dropna(subset=['levo'],inplace=True)
#admissions.dropna(subset=['vanco'],inplace=True)
len(admissions)

429

In [64]:
print(len(admissions[admissions.infection_present == 0]))
len(admissions[admissions.infection_present == 0])/len(admissions)

343


0.7995337995337995

In [65]:
print(len(admissions[admissions.infection_present == 1]))
len(admissions[admissions.infection_present == 1])/len(admissions)

86


0.20046620046620048

In [66]:
neg_ads.dropna(subset=['cytarabine'],inplace=True)
pos_ads.dropna(subset=['cytarabine'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_ads.dropna(subset=['cytarabine'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_ads.dropna(subset=['cytarabine'],inplace=True)


### Cytarabine

In [67]:
df["Cyt Stats"] = get_stats('cytarabine',1)
df["No Cyt Stats"] = get_stats('cytarabine',0)
df

Unnamed: 0,Male,Female Stats,White Stats,Non White Stats,AML,2nd AML,Cyt Stats,No Cyt Stats
0,199.0,190.0,265.0,124.0,371.0,18.0,283.0,60.0
1,0.511568,0.488432,0.681234,0.318766,0.953728,0.046272,0.825073,0.174927
2,54.0,37.0,69.0,22.0,85.0,6.0,79.0,7.0
3,0.593407,0.406593,0.758242,0.241758,0.934066,0.065934,0.918605,0.081395
4,253.0,227.0,334.0,146.0,456.0,24.0,362.0,67.0
5,0.527083,0.472917,0.695833,0.304167,0.95,0.05,0.843823,0.156177


In [68]:
get_pval_b('cytarabine')

BarnardExactResult(statistic=2.136448233790324, pvalue=0.03361006406230964)

#### Cytarabine dosage

In [69]:
neg_ads["Cytarabine mg/m2/day"].describe()

count      343.000000
mean      1261.986055
std       2123.105987
min          0.000000
25%        121.549619
50%        208.590853
75%       1955.621630
max      12218.181818
Name: Cytarabine mg/m2/day, dtype: float64

In [70]:
import scipy.stats as st
st.norm.interval(alpha=0.95, loc=np.nanmean(neg_ads["Cytarabine mg/m2/day"]), scale=st.sem(neg_ads["Cytarabine mg/m2/day"],nan_policy='omit'))

(1037.301765999479, 1486.670344591018)

In [71]:
pos_ads["Cytarabine mg/m2/day"].describe()

count       86.000000
mean      2112.633214
std       2754.126543
min          0.000000
25%        175.867947
50%       1607.184355
75%       2111.285939
max      12537.313433
Name: Cytarabine mg/m2/day, dtype: float64

In [72]:
st.norm.interval(alpha=0.95, loc=np.nanmean(pos_ads["Cytarabine mg/m2/day"]), scale=st.sem(pos_ads["Cytarabine mg/m2/day"],nan_policy='omit'))

(1530.553107816906, 2694.7133191959847)

In [73]:
admissions["Cytarabine mg/m2/day"].describe()

count      429.000000
mean      1432.512059
std       2285.718483
min          0.000000
25%        128.859510
50%        220.959596
75%       1974.943052
max      12537.313433
Name: Cytarabine mg/m2/day, dtype: float64

In [74]:
st.norm.interval(alpha=0.95, loc=np.nanmean(admissions["Cytarabine mg/m2/day"]), scale=st.sem(admissions["Cytarabine mg/m2/day"],nan_policy='omit'))

(1216.219198135495, 1648.8049199429406)

In [75]:
neg_ads.dropna(subset=['Cytarabine mg/m2/day'],inplace=True)
pos_ads.dropna(subset=['Cytarabine mg/m2/day'],inplace=True)
get_pval_median('Cytarabine mg/m2/day')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  neg_ads.dropna(subset=['Cytarabine mg/m2/day'],inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pos_ads.dropna(subset=['Cytarabine mg/m2/day'],inplace=True)


MannwhitneyuResult(statistic=11148.0, pvalue=0.0004502303110800332)

### Levofloxacin

In [76]:
df["Levo Stats"] = get_stats('levo',1)
df["No Levo Stats"] = get_stats('levo',0)
df

Unnamed: 0,Male,Female Stats,White Stats,Non White Stats,AML,2nd AML,Cyt Stats,No Cyt Stats,Levo Stats,No Levo Stats
0,199.0,190.0,265.0,124.0,371.0,18.0,283.0,60.0,142.0,201.0
1,0.511568,0.488432,0.681234,0.318766,0.953728,0.046272,0.825073,0.174927,0.413994,0.586006
2,54.0,37.0,69.0,22.0,85.0,6.0,79.0,7.0,21.0,65.0
3,0.593407,0.406593,0.758242,0.241758,0.934066,0.065934,0.918605,0.081395,0.244186,0.755814
4,253.0,227.0,334.0,146.0,456.0,24.0,362.0,67.0,163.0,266.0
5,0.527083,0.472917,0.695833,0.304167,0.95,0.05,0.843823,0.156177,0.379953,0.620047


In [77]:
get_pval_b('levo')

BarnardExactResult(statistic=-2.9010069658269124, pvalue=0.0045783328715056325)

### Vancomycin

In [78]:
df["Vanco Stats"] = get_stats('vanco',1)
df["No Vanco Stats"] = get_stats('vanco',0)
df

Unnamed: 0,Male,Female Stats,White Stats,Non White Stats,AML,2nd AML,Cyt Stats,No Cyt Stats,Levo Stats,No Levo Stats,Vanco Stats,No Vanco Stats
0,199.0,190.0,265.0,124.0,371.0,18.0,283.0,60.0,142.0,201.0,289.0,54.0
1,0.511568,0.488432,0.681234,0.318766,0.953728,0.046272,0.825073,0.174927,0.413994,0.586006,0.842566,0.157434
2,54.0,37.0,69.0,22.0,85.0,6.0,79.0,7.0,21.0,65.0,74.0,12.0
3,0.593407,0.406593,0.758242,0.241758,0.934066,0.065934,0.918605,0.081395,0.244186,0.755814,0.860465,0.139535
4,253.0,227.0,334.0,146.0,456.0,24.0,362.0,67.0,163.0,266.0,363.0,66.0
5,0.527083,0.472917,0.695833,0.304167,0.95,0.05,0.843823,0.156177,0.379953,0.620047,0.846154,0.153846


In [79]:
get_pval_b('vanco')

BarnardExactResult(statistic=0.4113779434124534, pvalue=0.7036778892775385)

# Table 1: Patient Demographics

In [80]:
import numpy as np
import pandas as pd
import seaborn as sns
import math
import statistics
import numpy as np
import scipy.stats as st

### get_pd_pval()

In [81]:
def get_pd_pval(field):
    patients_field = patients.loc[:,["infection_present",field]]
    bsi_neg= patients_field[(patients_field[field]==0)]['infection_present']
    bsi_pos = patients_field[(patients_field[field]==1)]['infection_present']
    
    tot_patients = stats.shapiro(patients_field[field])

    if tot_patients.pvalue < alpha:
        #mann whitney
        res = stats.mannwhitneyu(bsi_neg,bsi_pos)
    else:
        #ttest
        res = stats.ttest_ind(bsi_neg,bsi_pos)

    return res

In [82]:
def get_pd_pval_b(field):
    contingency_table = pd.crosstab(index=admissions['infection_present'], columns=admissions[field])
    res = stats.barnard_exact([contingency_table[0],contingency_table[1]])
    return res

In [83]:
def get_pd_pval_b2(field):
    contingency_table = pd.crosstab(index=admissions['infection_present'], columns=admissions[field])
    res = stats.boschloo_exact([contingency_table[0],contingency_table[1]])
    return res

### get_pd_pval_median() func

In [84]:
def get_pd_pval_median(field):
    result_tot_pat = stats.shapiro(patients[field])
    if result_tot_pat.pvalue < alpha:
        #mann whitney
        tot_pat_res = stats.mannwhitneyu(neg_patients[field],pos_patients[field])
    else:
        #ttest
        tot_pat_res = stats.ttest_ind(neg_patients[field],pos_patients[field],nan_policy='omit')

    return tot_pat_res

### get_pd_stats()

In [85]:
def get_pd_stats(field, value):
    print('Neg BSI')
    print(len(neg_patients[neg_patients[field]==value]))
    print(len(neg_patients[neg_patients[field]==value])/len(neg_patients))
    print('Pos BSI')
    print(len(pos_patients[pos_patients[field]==value]))
    print(len(pos_patients[pos_patients[field]==value])/len(pos_patients))
    print('Total')
    print(len(patients[patients[field]==value]))
    print(len(patients[patients[field]==value])/len(patients))

In [86]:
patients = pd.read_excel('data/patient_demographics_data.xlsx')

In [87]:
neg_patients = patients[patients['infection_present']==0]
pos_patients = patients[patients['infection_present']==1]

In [88]:
len(neg_patients)

43

In [89]:
len(neg_patients)/len(patients)

0.45263157894736844

In [90]:
len(pos_patients)

52

In [91]:
len(pos_patients)/len(patients)

0.5473684210526316

### Num Admissions

In [92]:
patients.num_admissions.describe()

count    95.000000
mean      5.052632
std       2.438034
min       1.000000
25%       3.000000
50%       5.000000
75%       6.500000
max      13.000000
Name: num_admissions, dtype: float64

In [93]:
import scipy.stats as st
st.norm.interval(alpha=0.95, loc=np.mean(patients.num_admissions), scale=st.sem(patients.num_admissions))

(4.562372081128308, 5.542891076766428)

In [94]:
neg_patients.num_admissions.describe()

count    43.000000
mean      4.069767
std       2.175572
min       1.000000
25%       2.000000
50%       4.000000
75%       6.000000
max       8.000000
Name: num_admissions, dtype: float64

In [95]:
import scipy.stats as st
st.norm.interval(alpha=0.95, loc=np.mean(neg_patients.num_admissions), scale=st.sem(neg_patients.num_admissions))

(3.4195071314949783, 4.720027752225952)

In [96]:
pos_patients.num_admissions.describe()

count    52.000000
mean      5.865385
std       2.360034
min       1.000000
25%       4.000000
50%       6.000000
75%       7.000000
max      13.000000
Name: num_admissions, dtype: float64

In [97]:
st.norm.interval(alpha=0.95, loc=np.mean(pos_patients.num_admissions), scale=st.sem(pos_patients.num_admissions))

(5.223931958363363, 6.506837272405867)

### Deaths

In [98]:
print(len(patients[patients.deceased == 1]))
len(patients[patients.deceased == 1])/len(patients)

36


0.37894736842105264

In [99]:
print(len(neg_patients[neg_patients.deceased == 1]))
len(neg_patients[neg_patients.deceased == 1])/len(neg_patients)

18


0.4186046511627907

In [100]:
print(len(pos_patients[pos_patients.deceased == 1]))
len(pos_patients[pos_patients.deceased == 1])/len(pos_patients)

18


0.34615384615384615

In [101]:
get_pd_pval_b('deceased')

BarnardExactResult(statistic=-1.0711522458779001, pvalue=0.31788373689236454)

### Sex

In [102]:
patients.loc[:,'male'] = patients['gender'].map({'M':1, 'F':0})
neg_patients.loc[:,'male'] = neg_patients['gender'].map({'M':1, 'F':0})
pos_patients.loc[:,'male'] = pos_patients['gender'].map({'M':1, 'F':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [103]:
get_pd_stats('male',1)

Neg BSI
20
0.46511627906976744
Pos BSI
27
0.5192307692307693
Total
47
0.49473684210526314


In [104]:
get_pd_stats('male',0)

Neg BSI
23
0.5348837209302325
Pos BSI
25
0.4807692307692308
Total
48
0.5052631578947369


In [105]:
get_pd_pval_b('male')

BarnardExactResult(statistic=1.9537142885602183, pvalue=0.05279244824060565)

### Race

In [106]:
get_pd_stats('white_caucasian',1)

Neg BSI
30
0.6976744186046512
Pos BSI
39
0.75
Total
69
0.7263157894736842


In [107]:
get_pd_stats('white_caucasian',0)

Neg BSI
13
0.3023255813953488
Pos BSI
13
0.25
Total
26
0.2736842105263158


In [108]:
get_pd_pval_b('white_caucasian')

BarnardExactResult(statistic=1.6232924267688968, pvalue=0.1089976559247995)

### Diganosis Age

In [109]:
neg_patients['age_diagnosis'].describe()

count    43.000000
mean      7.604842
std       5.776675
min       0.049315
25%       1.749315
50%       6.764384
75%      12.382192
max      18.509589
Name: age_diagnosis, dtype: float64

In [110]:
pos_patients['age_diagnosis'].describe()

count    52.000000
mean      8.060063
std       5.622297
min       0.342466
25%       2.043836
50%       8.857534
75%      12.880822
max      18.750685
Name: age_diagnosis, dtype: float64

In [111]:
patients['age_diagnosis'].describe()

count    95.000000
mean      7.854016
std       5.666754
min       0.049315
25%       1.868493
50%       7.980822
75%      13.016438
max      18.750685
Name: age_diagnosis, dtype: float64

In [112]:
get_pd_pval_median('age_diagnosis')

MannwhitneyuResult(statistic=1058.0, pvalue=0.6564103927620106)

### Age at 1st infection

In [113]:
pos_patients['age_first_infection'].describe()

count    52.000000
mean      8.264808
std       5.685857
min       0.930000
25%       2.057500
50%       9.200000
75%      13.412500
max      19.220000
Name: age_first_infection, dtype: float64

### Diagnosis

In [114]:
get_pd_stats('AML_diag','AML')

Neg BSI
37
0.8604651162790697
Pos BSI
47
0.9038461538461539
Total
84
0.8842105263157894


In [115]:
get_pd_stats('AML_diag','2nd AML')

Neg BSI
6
0.13953488372093023
Pos BSI
5
0.09615384615384616
Total
11
0.11578947368421053


In [116]:
one_hot = pd.get_dummies(patients['AML_diag'])
patients = patients.join(one_hot)

In [117]:
neg_patients = patients[patients['infection_present']==0]
pos_patients = patients[patients['infection_present']==1]

In [118]:
get_pd_pval_b('AML')

BarnardExactResult(statistic=-0.3224366026765867, pvalue=0.7708909715769282)

# Table 5: Cyt >= 2000

In [119]:
admissions = pd.read_excel('data/aml_data_8.16.23.xlsx')
admissions.dropna(subset=['cytarabine'],inplace=True)
len(admissions)

429

In [120]:
table5 = pd.DataFrame()

In [121]:
pos_ads = admissions[admissions['infection_present'] == 1]
print('There are '+str(len(pos_ads))+' infection-positive admissions.')
print('Percentage pos '+str(len(pos_ads)/len(admissions)) )
print('There are '+str(len(pos_ads.MRN.unique()))+' patients that were infected.')
#pos_ads.head()
neg_ads = admissions[admissions['infection_present'] == 0]
print('There are '+str(len(neg_ads))+' infection-negative admissions.')
print('Percentage negative '+str(len(neg_ads)/len(admissions)) )
#neg_ads.head()

There are 86 infection-positive admissions.
Percentage pos 0.20046620046620048
There are 47 patients that were infected.
There are 343 infection-negative admissions.
Percentage negative 0.7995337995337995


In [122]:
table5["no_cyt"] = get_stats('cytarabine',0)

In [123]:
get_pval_b('cytarabine')

BarnardExactResult(statistic=2.136448233790324, pvalue=0.03361006406230964)

In [124]:
table5["cyt_2000"] = get_stats('cyt_2000',1)
table5["cyt_less_2000"] = get_stats('cyt_2000',0)
table5["cyt1_1999"] = get_stats('cyt_1_1999',1)
table5

Unnamed: 0,no_cyt,cyt_2000,cyt_less_2000,cyt1_1999
0,60.0,59.0,284.0,224.0
1,0.174927,0.172012,0.827988,0.653061
2,7.0,32.0,54.0,47.0
3,0.081395,0.372093,0.627907,0.546512
4,67.0,91.0,338.0,271.0
5,0.156177,0.212121,0.787879,0.631702


In [125]:
get_pval_b('cyt_2000')

BarnardExactResult(statistic=4.058378226689207, pvalue=0.00021230167662919936)

In [126]:
contingency_table = [[60,224,59],[7,47,32]]
res = stats.chi2_contingency([contingency_table[0],contingency_table[1]])
res

(18.06399917706291,
 0.00011952325673110384,
 2,
 array([[ 53.56876457, 216.67365967,  72.75757576],
        [ 13.43123543,  54.32634033,  18.24242424]]))