# MIS581 Capstone Project Data Analysis Code

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Read File into Pandas DataFrame and View First 5 Records

In [None]:
# Load the mimic3d.csv extract into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer='C:/Users/pjwoo/Desktop/healthcare/mimic3d.csv')
df.head(n=5)

### Find and Handle Missing Values

In [None]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

In [None]:
# Replace missing categorical values with explicit placeholder labels and
# drop rows that have no admitting diagnosis.
# The filled columns are assigned back to df rather than calling
# fillna(inplace=True) on a column selection: that chained in-place form is
# deprecated and leaves df unchanged under pandas Copy-on-Write.
df['religion'] = df['religion'].fillna(value='NOT SPECIFIED')
df.dropna(subset=['AdmitDiagnosis'], how='any', inplace=True)
df['marital_status'] = df['marital_status'].fillna(value='UNKNOWN (DEFAULT)')

### Narrow data to a subset of diagnoses

In [None]:
# Number of distinct admitting-diagnosis strings in the full dataset.
df['AdmitDiagnosis'].nunique()

In [None]:
# Show distinct diagnoses relative to the number of non-null diagnosis entries.
n_unique_diagnoses = df['AdmitDiagnosis'].nunique()
n_diagnoses = df['AdmitDiagnosis'].count()
print('Percentage of Unique Diagnoses:', n_unique_diagnoses, '/', n_diagnoses, '=', (n_unique_diagnoses / n_diagnoses))

In [None]:
# Ten most frequent admitting diagnoses.
df['AdmitDiagnosis'].value_counts().head(n=10)

In [None]:
# Keep only admissions whose diagnosis mentions coronary artery disease.
# Take an explicit copy: this frame is modified in place by later cells, and
# writing into a filtered slice of df triggers SettingWithCopyWarning.
coronary = df[df['AdmitDiagnosis'].str.contains(pat='CORONARY ARTERY DISEASE')].copy()

In [None]:
# Number of distinct diagnosis strings within the coronary subset.
coronary['AdmitDiagnosis'].nunique()

In [None]:
# Sixty most frequent diagnosis strings within the coronary subset.
coronary['AdmitDiagnosis'].value_counts().head(n=60)

In [None]:
# Collapse the free-text coronary diagnoses into a small set of categories.
# Each (substring, label) rule is applied in order; because every rule tests
# the *current* value of AdmitDiagnosis, a later rule can overwrite a label
# written by an earlier one, so the order below must be preserved.
# Labels are raw strings: the backslash is a literal separator, and sequences
# such as "\C" or "\A" are invalid escapes in a normal string literal.
diagnosis_relabels = [
    ('CORONARY ARTERY BYPASS', r'CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS'),
    ('CATH', r'CORONARY ARTERY DISEASE\CATH'),
    ('AORTIC STENOSIS', r'CORONARY ARTERY DISEASE\AORTIC STENOSIS'),
    ('ANGIOPLASTY', r'CORONARY ARTERY DISEASE\ANGIOPLASTY'),
    ('VALVE REPLACEMENT', r'CORONARY ARTERY DISEASE\VALVE REPLACEMENT'),
    ('CONGESTIVE HEART FAILURE', r'CORONARY ARTERY DISEASE\CONGESTIVE HEART FAILURE'),
    ('STENT', r'CORONARY ARTERY DISEASE\STENT'),
    ('CHF', r'CORONARY ARTERY DISEASE\CONGESTIVE HEART FAILURE'),
    ('ANGINA', r'CORONARY ARTERY DISEASE\ANGINA'),
    ('CHEST PAIN', r'CORONARY ARTERY DISEASE\CHEST PAIN'),
    ('SDA', 'CORONARY ARTERY DISEASE'),
    ('VESSEL', 'CORONARY ARTERY DISEASE'),
    (';', 'CORONARY ARTERY DISEASE'),
    ('INTERVENTION', r'CORONARY ARTERY DISEASE\INTERVENTION'),
    # 'COROANRY' is intentional: presumably a misspelling present in the raw data.
    ('COROANRY', r'CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS'),
    ('LEFT MAIN', 'CORONARY ARTERY DISEASE'),
    ('AS CORONARY', 'CORONARY ARTERY DISEASE'),
]
for pattern, label in diagnosis_relabels:
    # All patterns are plain substrings, so match them literally.
    matches = coronary['AdmitDiagnosis'].str.contains(pat=pattern, regex=False)
    coronary.loc[matches, 'AdmitDiagnosis'] = label

coronary.AdmitDiagnosis.nunique()

In [None]:
# (rows, columns) of the coronary subset.
coronary.shape

In [None]:
# Show distinct diagnoses relative to non-null diagnosis entries in the coronary subset.
n_unique_coronary = coronary['AdmitDiagnosis'].nunique()
n_coronary = coronary['AdmitDiagnosis'].count()
print('Percentage of Unique Diagnoses:', n_unique_coronary, '/', n_coronary, '=', (n_unique_coronary / n_coronary))

### Find and remove outliers from key variables

In [None]:
# Summary statistics of the numeric columns, used to spot candidate outliers.
coronary.describe()

In [None]:
# Box plot of NumDiagnosis with outlier points hidden.
num_diagnosis = coronary['NumDiagnosis']
sns.boxplot(y=num_diagnosis, showfliers=False)

In [None]:
# Keep only rows whose NumDiagnosis is below 4.
coronary = coronary.loc[coronary['NumDiagnosis'].lt(4)]

In [None]:
# Box plot of LOSdays with outlier points hidden.
los_days = coronary['LOSdays']
sns.boxplot(y=los_days, showfliers=False)

In [None]:
# Keep only rows whose LOSdays is below 16.
coronary = coronary.loc[coronary['LOSdays'].lt(16)]

In [None]:
# Summary statistics again, after the NumDiagnosis and LOSdays filters.
coronary.describe()

### Create new dataframe using a subset of variables

In [None]:
# Select the columns used for modelling.
# Take an explicit copy: new encoded columns are added to this frame later,
# and adding columns to a slice of coronary triggers SettingWithCopyWarning.
data = coronary[['gender','age','LOSdays','admit_type','admit_location','AdmitDiagnosis','insurance',
                 'marital_status','ethnicity','NumDiagnosis','AdmitProcedure','NumProcs','NumCallouts',
                 'NumCPTevents','NumProcEvents','NumTransfers','NumNotes']].copy()
data.head()

In [None]:
# (rows, columns) of the modelling subset.
data.shape

### Label encode categorical text variables

In [None]:
# Integer-encode each categorical text column into a new '<name>_le' column.
# The single encoder is refit for every column, so after the loop it holds
# the classes of the last column only (AdmitDiagnosis).
le = LabelEncoder()
categorical_columns = ['gender', 'admit_type', 'admit_location', 'insurance',
                       'marital_status', 'ethnicity', 'AdmitProcedure', 'AdmitDiagnosis']
for column in categorical_columns:
    data[column + '_le'] = le.fit_transform(data[column])

### Create heatmap showing correlation between numerical variables

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
# The text columns are excluded explicitly: DataFrame.corr() raises on
# non-numeric data in pandas >= 2.0 instead of silently skipping it.
plt.subplots(figsize=(20,15))
sns.heatmap(data.select_dtypes(include='number').corr(), annot=True)

### Create new dataframe with only numerical variables

In [None]:
# Build the all-numeric frame used for regression: the encoded categoricals
# plus the original numeric columns, in a fixed column order.
numeric_columns = [
    'gender_le', 'age', 'LOSdays', 'admit_type_le', 'admit_location_le',
    'AdmitDiagnosis_le', 'insurance_le', 'marital_status_le', 'ethnicity_le',
    'NumDiagnosis', 'AdmitProcedure_le', 'NumProcs', 'NumCallouts',
    'NumCPTevents', 'NumProcEvents', 'NumTransfers', 'NumNotes',
]
dfnum = data[numeric_columns]
dfnum.head()

In [None]:
# (rows, columns) of the numeric modelling frame.
dfnum.shape

### Create X and y variables and split into train and test datasets

In [None]:
# Separate the predictors (X) from the target (y = LOSdays) and split them
# into train and test sets using scikit-learn's default split proportions.
# A fixed random_state makes the split, and every metric computed from it,
# reproducible from run to run.
X = dfnum.drop('LOSdays', axis='columns')
y = dfnum.LOSdays
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# (rows, columns) of the training predictors.
X_train.shape

### Train, test, and evaluate linear regression model

In [None]:
# Fit ordinary least squares on the training split (fit returns the fitted
# estimator itself), then predict the held-out test rows.
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)

In [None]:
# Report test-set error metrics and the fitted model parameters.
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))
# Score on the held-out test split, consistent with the error metrics above;
# scoring on the full (X, y) would include the rows the model was trained on.
print('Coefficient of Determination:', linreg.score(X_test, y_test))
print('Intercept:', linreg.intercept_)
# Take the names from the training frame so each coefficient is paired with
# the column it was fitted on. The target LOSdays is not a predictor, and
# listing it would shift every later name by one position.
feature_cols = list(X_train.columns)
print('Coefficients:')
list(zip(feature_cols, linreg.coef_))


In [None]:
# Side-by-side table of actual vs. predicted LOSdays for the test rows.
# 'Variance' holds the absolute error; 'ErrPerc' is that error as a
# percentage of the actual value.
results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
results['Variance'] = (results['Predicted'] - results['Actual']).abs()
results['ErrPerc'] = results['Variance'].div(results['Actual']).mul(100)
results.head(n=20)