In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
# Location of the cleaned medical data set (relative to the working directory)
medical_file_path = "medical_clean.csv"

# Load the data, using the first CSV column as the index.
# keep_default_na=False stops pandas from converting literal strings such as
# "None" or "NA" into NaN. On its own, though, it means *no* value is ever
# parsed as NaN, so the isnull() check below could never find anything.
# na_values=[""] keeps those literal strings intact while still flagging
# genuinely empty cells as missing.
df = pd.read_csv(
    medical_file_path,
    keep_default_na=False,
    na_values=[""],
    index_col=0,
)

# Check whether any row is an exact duplicate of an earlier row
has_duplicates = df.duplicated().any()
print("Duplicates present:", has_duplicates)

# Count missing values per column
missing_data = df.isnull().sum()

# Display the missing data counts
print("Missing data counts:")
print(missing_data)

# Display data types and non-null counts
df.info()

# Show every column when a frame is displayed, then preview the first rows.
# NOTE: as a bare expression, head() only renders when it is the last
# statement of a notebook cell.
pd.set_option("display.max_columns", None)
df.head(5)

# Columns whose summary statistics are reported, in the order they are printed.
summary_columns = [
    "ReAdmis",  # dependent variable
    "Initial_days",
    "Age",
    "Doc_visits",
    "vitD_supp",
    "Allergic_rhinitis",
    "Arthritis",
    "Asthma",
    "BackPain",
    "Complication_risk",
    "Diabetes",
    "Gender",
    "HighBlood",
    "Hyperlipidemia",
    "Initial_admin",
    "Overweight",
    "Reflux_esophagitis",
    "Services",
    "Stroke",
]

# Describe each column once and print it under its own heading.
summaries = {}
for column in summary_columns:
    summaries[column] = df[column].describe()
    print(f"Summary Statistics for {column}:")
    print(summaries[column])

# Also bind every result to its own <column>_summary name. The dict keeps
# insertion order, so the values line up with summary_columns.
(
    ReAdmis_summary,
    Initial_days_summary,
    Age_summary,
    Doc_visits_summary,
    vitD_supp_summary,
    Allergic_rhinitis_summary,
    Arthritis_summary,
    Asthma_summary,
    BackPain_summary,
    Complication_risk_summary,
    Diabetes_summary,
    Gender_summary,
    HighBlood_summary,
    Hyperlipidemia_summary,
    Initial_admin_summary,
    Overweight_summary,
    Reflux_esophagitis_summary,
    Services_summary,
    Stroke_summary,
) = summaries.values()


# Update 'Initial_days' and 'vitD_supp' to int.
# NOTE: astype(int) drops any fractional part instead of rounding.
df["Initial_days"] = df["Initial_days"].astype(int)
df["vitD_supp"] = df["vitD_supp"].astype(int)

# Update 'Gender' to category
df["Gender"] = df["Gender"].astype("category")

# Recode the Yes/No columns (including the target, ReAdmis) as 1/0
bool_mapping = {"Yes": 1, "No": 0}
columns_to_convert = ["HighBlood", "Stroke", "Overweight", "Arthritis", "Diabetes", "Hyperlipidemia", "BackPain", "Anxiety", "Allergic_rhinitis", "Reflux_esophagitis", "Asthma", "ReAdmis"]
for col in columns_to_convert:
    mapped = df[col].map(bool_mapping)
    # map() turns every value that is not a key of bool_mapping into NaN.
    # Stop here with a clear message instead of writing NaN into the model
    # data (this also catches re-running the step on already-recoded columns).
    unmapped = mapped.isna()
    if unmapped.any():
        unexpected = sorted(df.loc[unmapped, col].astype(str).unique())
        raise ValueError(
            f"Column {col!r} contains values other than 'Yes'/'No': {unexpected}"
        )
    df[col] = mapped

# Identify numerical features for scaling
numerical_features = ["Initial_days", "Age", "Doc_visits", "vitD_supp"]

# Standardize the numerical features.
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed later in this file, so the test rows contribute to the
# scaling statistics.
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# One-hot encode the categorical variables. dtype=int yields 1/0 columns
# directly, so no separate cast is needed afterwards.
gender_df = pd.get_dummies(df["Gender"], drop_first=False, dtype=int)
int_admit_df = pd.get_dummies(df["Initial_admin"], drop_first=False, dtype=int)
comp_risk_df = pd.get_dummies(df["Complication_risk"], drop_first=False, dtype=int)
services_df = pd.get_dummies(df["Services"], drop_first=False, dtype=int)

# Names of all dummy columns, in the order they are appended below
dummy_columns = gender_df.columns.tolist() + int_admit_df.columns.tolist() + comp_risk_df.columns.tolist() + services_df.columns.tolist()

# Model frame: the scaled numerical features and the recoded Yes/No columns
# (ReAdmis last), followed by the dummy columns.
KNN_df = df[numerical_features + columns_to_convert].copy()
KNN_df = pd.concat([KNN_df, gender_df, int_admit_df, comp_risk_df, services_df], axis=1)

# Visually inspect df
pd.set_option("display.max_columns", None)
KNN_df.head(5)

# Save KNN_df to a CSV file (the index is not written)
KNN_df.to_csv("KNN_df.csv", index=False)
print("KNN_df has been saved to KNN_df.csv.")

# Reload the prepared model data from disk
KNN_df_path = "KNN_df.csv"
df = pd.read_csv(KNN_df_path, keep_default_na=False)

# ReAdmis is the target; every other column is a feature
X = df.drop(columns=["ReAdmis"])
y = df["ReAdmis"]

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Re-attach the target to each split so every saved file is self-contained
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Write both splits to CSV without the index column
for frame, csv_name in ((train_df, "train_df.csv"), (test_df, "test_df.csv")):
    frame.to_csv(csv_name, index=False)

print("train_df and test_df have been saved to train_df.csv and test_df.csv.")

# Tune n_neighbors with a 10-fold cross-validated grid search.
#
# The search is run on the training split only. Fitting it on the full data
# set would let the held-out test rows influence the choice of k and make the
# test metrics reported afterwards optimistic.
#
# The numerical features were already standardized before KNN_df.csv was
# written, so they are not re-scaled here.

# Plain contiguous ndarray of the training features (X_train itself is left
# untouched).
X_search = np.ascontiguousarray(X_train)

# Candidate neighbor counts: 1 through 30
param_grid = {'n_neighbors': range(1, 31)}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_search, y_train)

# Get the best number of neighbors
best_k = grid_search.best_params_['n_neighbors']
print(f'The optimal number of neighbors is {best_k}')

# With the default refit=True, GridSearchCV has already refit a classifier
# using best_k on all of the search data, so no separate fit is needed.
knn = grid_search.best_estimator_
best_accuracy = grid_search.best_score_
print(f'Best cross-validated accuracy with k={best_k}: {best_accuracy}')


# Fit the final KNN model with the k selected by the grid search (best_k)
# rather than a hard-coded copy of it, then evaluate on the held-out test split.
X_train = np.ascontiguousarray(X_train)
X_test = np.ascontiguousarray(X_test)

knn = KNeighborsClassifier(n_neighbors=best_k)
knn.fit(X_train, y_train)

# Generate y_pred array for model's confusion matrix.
# Rows of the matrix are actual classes, columns are predicted classes.
y_pred = knn.predict(X_test)
final_matrix = confusion_matrix(y_test, y_pred)

# Print confusion matrix and accuracy score of model
print("The confusion matrix for this KNN model:")
print("Predicted No ReAdmis | Predicted ReAdmis")
print(f"                 {final_matrix[0]} Actual No ReAdmis")
print(f"                 {final_matrix[1]} Actual ReAdmis")
print(f"The training accuracy of this KNN classification is {knn.score(X_train, y_train):.5f}.")
# Reuse y_pred instead of predicting the test set a second time
print(f"The testing accuracy of this KNN classification model is {accuracy_score(y_test, y_pred):.5f}.")


# Plot the ROC curve from the predicted probability of the positive class (1)
y_pred_prob = knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN Classification')
plt.show()

# Generate AUC score and print, followed by the per-class report
auc_score = roc_auc_score(y_test, y_pred_prob)
print(f"The Area Under the Curve (AUC) score is: {auc_score}\n")
print(classification_report(y_test, y_pred))
