In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.stats import proportion
from scipy import stats
%matplotlib inline
# Working directory that holds the study data files.
# Set the LOYOLA_DATA_DIR environment variable to point the notebook at another
# machine's copy; when it is unset the original macOS location is used.
# Windows location used previously: 'C:\\Users\\YuChen\\Desktop\\loyola'
DATA_DIR = os.environ.get('LOYOLA_DATA_DIR', '/Users/YuChen/Desktop/loyola')
os.chdir(DATA_DIR)

In [2]:
# Load every column of the Stata dataset found in the current working directory.
# NOTE(review): the next code cell reassigns CAData from the same file with a
# column subset, so this full read looks redundant; confirm before removing.
CAData = pd.read_stata('SPINE_ILEUS_CA_REVISED.dta')

## Reading the data

In [3]:
# Names of the columns requested from the Stata file (24 in total).
varLst = [
    'ACDF', 'CLAM', 'SHORT', 'LONG', 'ALIF', 'PLIF',
    'LOS',
    'COMBINED', 'COMBINED_SHORT', 'COMBINED_LONG',
    'ileus_NPOA', 'SHORT_ILEUS', 'LONG_ILEUS', 'ALIF_ILEUS', 'PLIF_ILEUS',
    'DIED', 'MI_NPOA', 'Sepsis_NPOA', 'PE_NPOA',
    'ACDF_ILEUS', 'CLAM_ILEUS',
    'COMBINED_ILEUS', 'COMBINED_SHORT_ILEUS', 'COMBINED_LONG_ILEUS',
]

# Read the Stata file again, this time requesting only the columns named in varLst.
CAData = pd.read_stata('SPINE_ILEUS_CA_REVISED.dta', columns=varLst)

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each loaded column.
CAData.describe()



Unnamed: 0,ACDF,CLAM,SHORT,LONG,ALIF,PLIF,LOS,COMBINED,COMBINED_SHORT,COMBINED_LONG,...,PLIF_ILEUS,DIED,MI_NPOA,Sepsis_NPOA,PE_NPOA,ACDF_ILEUS,CLAM_ILEUS,COMBINED_ILEUS,COMBINED_SHORT_ILEUS,COMBINED_LONG_ILEUS
count,345067.0,345067.0,345067.0,345067.0,345067.0,345067.0,345010.0,345067.0,345067.0,345067.0,...,345067.0,345045.0,345067.0,345067.0,345067.0,345067.0,345067.0,345067.0,345067.0,345067.0
mean,0.195733,0.043823,0.459282,0.12173,0.08494,0.061374,7.709695,0.010923,0.074049,0.014551,...,0.002292,0.033816,0.008416,0.044522,0.0,0.000606,0.000484,0.001078,0.005373,0.001837
std,0.397012,0.204741,0.498372,0.327466,0.279424,0.240172,79.797306,0.103546,0.262318,0.119164,...,0.047785,0.19398,0.091449,0.206195,0.0,0.024598,0.021989,0.032803,0.072964,0.042796
min,0.0,0.0,0.0,0.0,0.0,0.0,-6666.0,0.0,0.0,0.0,...,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,358.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Preview the first five rows of the loaded frame.
CAData.head(5)

Unnamed: 0,ACDF,CLAM,SHORT,LONG,ALIF,PLIF,LOS,COMBINED,COMBINED_SHORT,COMBINED_LONG,...,PLIF_ILEUS,DIED,MI_NPOA,Sepsis_NPOA,PE_NPOA,ACDF_ILEUS,CLAM_ILEUS,COMBINED_ILEUS,COMBINED_SHORT_ILEUS,COMBINED_LONG_ILEUS
0,0.0,1.0,1.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Processing the data:

### 1. Finding NA values

In [6]:
# Number of missing (null) entries in each column, as a NumPy array
# ordered like CAData.columns.
CAData.isnull().values.sum(axis=0)

array([ 0,  0,  0,  0,  0,  0, 57,  0,  0,  0,  0,  0,  0,  0,  0, 22,  0,
        0,  0,  0,  0,  0,  0,  0])

### 2. Finding negative values

In [7]:
# Number of strictly negative entries in each column, as a NumPy array
# ordered like CAData.columns.
(CAData < 0).values.sum(axis=0)

array([ 0,  0,  0,  0,  0,  0, 48,  0,  0,  0,  0,  0,  0,  0,  0, 19,  0,
        0,  0,  0,  0,  0,  0,  0])

### 3. Finding patients whose length of stay (LOS) is 1 day or less

In [8]:
# Number of rows whose LOS value is less than or equal to 1.
(CAData['LOS'] <= 1).values.sum()

36850

## Cleaning data:

In [9]:
# Keep only the rows in which every column holds a value >= 0.
# The comparison evaluates to False for NaN, so rows with a missing value in
# any column are removed together with rows that contain a negative entry.
# One row-wise mask gives the same result as filtering the frame once per
# column, without rebuilding the frame for each of its columns.
valid_rows = (CAData >= 0).all(axis=1)
CAData = CAData[valid_rows]

In [10]:
# Restrict the frame to rows with LOS strictly greater than 1.
stay_over_one = CAData.LOS > 1
CAData = CAData.loc[stay_over_one]

In [11]:
# (rows, columns) of the frame after the filtering done in the preceding cells.
CAData.shape

(308119, 24)

## Answers to Questions:

### 1) Comparing ACDF_ILEUS vs MI_NPOA to see if there is a higher rate of MI for patients that undergo ACDF_ILEUS.


In [12]:
# Count table: one row per ACDF_ILEUS value, one column per MI_NPOA value.
ct_ACDF_MI = pd.crosstab(CAData['ACDF_ILEUS'], CAData['MI_NPOA'])
ct_ACDF_MI

MI_NPOA,0.0,1.0
ACDF_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,305052,2862
1.0,203,2


In [13]:
# Proportions within each MI_NPOA column: every column of the count table is
# divided by its own total, so each column sums to 1.
# NOTE(review): this is the share of ACDF_ILEUS within each MI_NPOA group; the
# MI rate within each ACDF_ILEUS group would need row-wise totals. Confirm which is intended.
ct_ACDF_MI / ct_ACDF_MI.sum()


MI_NPOA,0.0,1.0
ACDF_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.999335,0.999302
1.0,0.000665,0.000698


In [14]:
# Fisher's exact test on the ACDF_ILEUS x MI_NPOA count table; element [0] of
# the returned pair is the odds ratio.
# The label and value are formatted into one string because the Python 2 print
# statement renders a multi-argument print(...) as a tuple.
print('Odds ratio: {}'.format(stats.fisher_exact(ct_ACDF_MI)[0]))


('Odds ratio:', 1.0501182472555277)


### 2) Comparing CLAM_ILEUS vs MI_NPOA to see if there is a higher rate of MI for patients that undergo CLAM_ILEUS.


In [15]:
# Count table: one row per CLAM_ILEUS value, one column per MI_NPOA value.
ct_CLAM_MI = pd.crosstab(CAData['CLAM_ILEUS'], CAData['MI_NPOA'])
ct_CLAM_MI

MI_NPOA,0.0,1.0
CLAM_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,305091,2861
1.0,164,3


In [16]:
# Proportions within each MI_NPOA column: every column of the count table is
# divided by its own total, so each column sums to 1.
# NOTE(review): this is the share of CLAM_ILEUS within each MI_NPOA group; the
# MI rate within each CLAM_ILEUS group would need row-wise totals. Confirm which is intended.
ct_CLAM_MI / ct_CLAM_MI.sum()


MI_NPOA,0.0,1.0
CLAM_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.999463,0.998953
1.0,0.000537,0.001047


In [17]:
# Fisher's exact test on the CLAM_ILEUS x MI_NPOA count table; element [0] of
# the returned pair is the odds ratio.
# The label and value are formatted into one string because the Python 2 print
# statement renders a multi-argument print(...) as a tuple.
print('Odds ratio: {}'.format(stats.fisher_exact(ct_CLAM_MI)[0]))


('Odds ratio:', 1.9506930887204712)


### 3) Comparing LONG_ILEUS vs MI_NPOA to see if there is a higher rate of MI for patients that undergo LONG_ILEUS.


In [18]:
# Count table: one row per LONG_ILEUS value, one column per MI_NPOA value.
ct_LONG_MI = pd.crosstab(CAData['LONG_ILEUS'], CAData['MI_NPOA'])
ct_LONG_MI

MI_NPOA,0.0,1.0
LONG_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,303595,2853
1.0,1660,11


In [19]:
# Proportions within each MI_NPOA column: every column of the count table is
# divided by its own total, so each column sums to 1.
# NOTE(review): this is the share of LONG_ILEUS within each MI_NPOA group; the
# MI rate within each LONG_ILEUS group would need row-wise totals. Confirm which is intended.
ct_LONG_MI / ct_LONG_MI.sum()


MI_NPOA,0.0,1.0
LONG_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.994562,0.996159
1.0,0.005438,0.003841


In [20]:
# Fisher's exact test on the LONG_ILEUS x MI_NPOA count table; element [0] of
# the returned pair is the odds ratio.
# The label and value are formatted into one string because the Python 2 print
# statement renders a multi-argument print(...) as a tuple.
print('Odds ratio: {}'.format(stats.fisher_exact(ct_LONG_MI)[0]))


('Odds ratio:', 0.70514339165283635)


### 4) Comparing SHORT_ILEUS vs MI_NPOA to see if there is a higher rate of MI for patients that undergo SHORT_ILEUS.


In [21]:
# Count table: one row per SHORT_ILEUS value, one column per MI_NPOA value.
ct_SHORT_MI = pd.crosstab(CAData['SHORT_ILEUS'], CAData['MI_NPOA'])
ct_SHORT_MI

MI_NPOA,0.0,1.0
SHORT_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,301494,2843
1.0,3761,21


In [22]:
# Proportions within each MI_NPOA column: every column of the count table is
# divided by its own total, so each column sums to 1.
# NOTE(review): this is the share of SHORT_ILEUS within each MI_NPOA group; the
# MI rate within each SHORT_ILEUS group would need row-wise totals. Confirm which is intended.
ct_SHORT_MI / ct_SHORT_MI.sum()


MI_NPOA,0.0,1.0
SHORT_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.987679,0.992668
1.0,0.012321,0.007332


In [23]:
# Fisher's exact test on the SHORT_ILEUS x MI_NPOA count table; element [0] of
# the returned pair is the odds ratio.
# The label and value are formatted into one string because the Python 2 print
# statement renders a multi-argument print(...) as a tuple.
print('Odds ratio: {}'.format(stats.fisher_exact(ct_SHORT_MI)[0]))


('Odds ratio:', 0.59213096852819491)


### 5) Comparing ALIF_ILEUS vs MI_NPOA to see if there is a higher rate of MI for patients that undergo ALIF_ILEUS.

In [24]:
# Count table: one row per ALIF_ILEUS value, one column per MI_NPOA value.
ct_ALIF_MI = pd.crosstab(CAData['ALIF_ILEUS'], CAData['MI_NPOA'])
ct_ALIF_MI

MI_NPOA,0.0,1.0
ALIF_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,302936,2855
1.0,2319,9


In [25]:
# Proportions within each MI_NPOA column: every column of the count table is
# divided by its own total, so each column sums to 1.
# NOTE(review): this is the share of ALIF_ILEUS within each MI_NPOA group; the
# MI rate within each ALIF_ILEUS group would need row-wise totals. Confirm which is intended.
ct_ALIF_MI / ct_ALIF_MI.sum()


MI_NPOA,0.0,1.0
ALIF_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.992403,0.996858
1.0,0.007597,0.003142


In [26]:
# Fisher's exact test on the ALIF_ILEUS x MI_NPOA count table; element [0] of
# the returned pair is the odds ratio.
# The label and value are formatted into one string because the Python 2 print
# statement renders a multi-argument print(...) as a tuple.
print('Odds ratio: {}'.format(stats.fisher_exact(ct_ALIF_MI)[0]))

('Odds ratio:', 0.41180018260784851)


### 6) Comparing PLIF_ILEUS vs MI_NPOA to see if there is a higher rate of MI for patients that undergo PLIF_ILEUS.

In [27]:
# Count table: one row per PLIF_ILEUS value, one column per MI_NPOA value.
ct_PLIF_MI = pd.crosstab(CAData['PLIF_ILEUS'], CAData['MI_NPOA'])
ct_PLIF_MI

MI_NPOA,0.0,1.0
PLIF_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,304465,2863
1.0,790,1


In [28]:
# Proportions within each MI_NPOA column: every column of the count table is
# divided by its own total, so each column sums to 1.
# NOTE(review): this is the share of PLIF_ILEUS within each MI_NPOA group; the
# MI rate within each PLIF_ILEUS group would need row-wise totals. Confirm which is intended.
ct_PLIF_MI / ct_PLIF_MI.sum()


MI_NPOA,0.0,1.0
PLIF_ILEUS,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.997412,0.999651
1.0,0.002588,0.000349


In [29]:
# Fisher's exact test on the PLIF_ILEUS x MI_NPOA count table; element [0] of
# the returned pair is the odds ratio.
# The label and value are formatted into one string because the Python 2 print
# statement renders a multi-argument print(...) as a tuple.
print('Odds ratio: {}'.format(stats.fisher_exact(ct_PLIF_MI)[0]))

('Odds ratio:', 0.1346135990839033)
