In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
import statsmodels.api as sm
from patsy import dmatrices
from tabulate import tabulate
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('OUData_Cleaned_and_Processed.csv')

In [3]:
# Target variable: Flipped
# Input Variables: {Age (numerical), Gender (categorical),
# PrimaryInsuranceCategory (categorical), DRG01 (categorical), BloodPressureDiff (numerical),
# BloodPressureLower (numerical), BloodPressureUpper (numerical), Pulse (numerical), PulseOximetry
# (numerical), Respirations (numerical), and Temperature (numerical)}

In [4]:
# Handling categorical variables with patsy's dmatrices
formula = 'Flipped ~ Age + Gender + PrimaryInsuranceCategory + DRG01 + BloodPressureDiff + BloodPressureLower + BloodPressureUpper + Pulse + PulseOximetry + Respirations + Temperature'
y, X = dmatrices(formula, df, return_type='dataframe')

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y_train = y_train.drop(columns=['Flipped[False]'])
y_test = y_test.drop(columns=['Flipped[False]'])

# Standardize the numerical variables to have zero mean and unit variance
scaler = StandardScaler()

# Fit on training data and transform both training and test data
# Ensuring indices are preserved when converting back to DataFrame
X_train_scaled = scaler.fit_transform(X_train)
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)

X_test_scaled = scaler.transform(X_test)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Initialize and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train.values.ravel())  # y_train.ravel() to convert y_train to a 1D array if needed

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
conf_matrix = confusion_matrix(y_test, y_pred)

# Using tabulate to format the output of the confusion matrix
print("Confusion Matrix:")
print(tabulate(conf_matrix, tablefmt="grid"))

# Using tabulate to format the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Confusion Matrix:
+-----+----+
| 123 | 50 |
+-----+----+
|  84 | 73 |
+-----+----+

Classification Report:
              precision    recall  f1-score   support

         0.0       0.59      0.71      0.65       173
         1.0       0.59      0.46      0.52       157

    accuracy                           0.59       330
   macro avg       0.59      0.59      0.58       330
weighted avg       0.59      0.59      0.59       330



In [15]:
# Data preparation (assuming X, y are ready)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
y_train = y_train.drop(columns=['Flipped[False]'])
y_test = y_test.drop(columns=['Flipped[False]'])

# Initialize the Decision Tree classifier
#model = DecisionTreeClassifier(random_state=12345)

#Random forest
#model = RandomForestClassifier(random_state=12345, n_estimators=100)

# Train the model
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Output formatted Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(tabulate(conf_matrix, headers=['Predicted No', 'Predicted Yes'], tablefmt='grid', showindex=['Actual No', 'Actual Yes']))

# Output formatted Classification Report
class_report = classification_report(y_test, y_pred, output_dict=True)
print("\nClassification Report:")
print(tabulate(pd.DataFrame(class_report).transpose(), headers="keys", tablefmt="grid"))



Confusion Matrix:
+------------+----------------+-----------------+
|            |   Predicted No |   Predicted Yes |
| Actual No  |            117 |              56 |
+------------+----------------+-----------------+
| Actual Yes |             86 |              71 |
+------------+----------------+-----------------+

Classification Report:
+--------------+-------------+----------+------------+------------+
|              |   precision |   recall |   f1-score |    support |
| 0.0          |    0.576355 | 0.676301 |   0.62234  | 173        |
+--------------+-------------+----------+------------+------------+
| 1.0          |    0.559055 | 0.452229 |   0.5      | 157        |
+--------------+-------------+----------+------------+------------+
| accuracy     |    0.569697 | 0.569697 |   0.569697 |   0.569697 |
+--------------+-------------+----------+------------+------------+
| macro avg    |    0.567705 | 0.564265 |   0.56117  | 330        |
+--------------+-------------+----------+-----

  model.fit(X_train, y_train)


In [20]:
#attempting to improve the model

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# Train a decision tree without pruning to find the ccp_alpha values
tree = DecisionTreeClassifier(random_state=42)
path = tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Grid search to find the best ccp_alpha
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid={'ccp_alpha': ccp_alphas}, cv=5)
grid_search.fit(X_train, y_train)
print("Best ccp_alpha:", grid_search.best_params_)

# Prune the tree using the optimal ccp_alpha
tree_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=grid_search.best_params_['ccp_alpha'])
tree_pruned.fit(X_train, y_train)
y_pred = tree_pruned.predict(X_test)

# Evaluate the pruned tree
print(f"Accuracy on test set: {accuracy_score(y_test, y_pred):.2f}")

# Output formatted Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(tabulate(conf_matrix, headers=['Predicted No', 'Predicted Yes'], tablefmt='grid', showindex=['Actual No', 'Actual Yes']))

# Output formatted Classification Report
class_report = classification_report(y_test, y_pred, output_dict=True)
print("\nClassification Report:")
print(tabulate(pd.DataFrame(class_report).transpose(), headers="keys", tablefmt="grid"))

Best ccp_alpha: {'ccp_alpha': 0.008986517635530544}
Accuracy on test set: 0.60
Confusion Matrix:
+------------+----------------+-----------------+
|            |   Predicted No |   Predicted Yes |
| Actual No  |            136 |              37 |
+------------+----------------+-----------------+
| Actual Yes |             95 |              62 |
+------------+----------------+-----------------+

Classification Report:
+--------------+-------------+----------+------------+-----------+
|              |   precision |   recall |   f1-score |   support |
| 0.0          |    0.588745 | 0.786127 |   0.673267 |     173   |
+--------------+-------------+----------+------------+-----------+
| 1.0          |    0.626263 | 0.394904 |   0.484375 |     157   |
+--------------+-------------+----------+------------+-----------+
| accuracy     |    0.6      | 0.6      |   0.6      |       0.6 |
+--------------+-------------+----------+------------+-----------+
| macro avg    |    0.607504 | 0.590516 |  