# In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error
import matplotlib.pyplot as plt

# Load the cleaned medical dataset and run basic integrity checks on it.
medical_file_path = "medical_clean.csv"

# keep_default_na=False tells pandas not to convert any string (for example
# "None" or "NA") into NaN, so every cell is read exactly as written.
# index_col=0 makes the first CSV column the row index.
df = pd.read_csv(medical_file_path, keep_default_na=False, index_col=0)


# Check for fully duplicated rows
has_duplicates = df.duplicated().any()
print("Duplicates present:", has_duplicates)

# Check for missing data. Because keep_default_na=False leaves blank cells as
# empty strings instead of NaN, isnull() on its own cannot detect them, so
# empty-string cells are counted as missing too.
missing_data = df.isnull().sum() + (df == "").sum()

# Display the missing data counts (one entry per column)
print("Missing data counts:")
print(missing_data)

# Display data types
df.info()

# Visually inspect df with every column shown.
# NOTE(review): the bare df.head(5) expression only renders when it is the last
# statement of a notebook cell; it prints nothing when run as a plain script.
pd.set_option("display.max_columns", None)
df.head(5)


# Binary categorical columns that each receive an integer-coded companion
# column named "<column>_encoded"
binary_columns = ['ReAdmis', 'HighBlood', 'Stroke', 'Overweight', 'Diabetes', 'Asthma']
label_encoder = LabelEncoder()

# The single encoder is refit on every column in turn, so after the loop it
# only remembers the classes of the last column processed.
for col in binary_columns:
    encoded_name = f'{col}_encoded'
    df[encoded_name] = label_encoder.fit_transform(df[col])

# Confirm the freshly added columns contain no missing values
encoded_binary_columns = [f'{col}_encoded' for col in binary_columns]
print("Missing values after label encoding:")
print(df[encoded_binary_columns].isnull().sum())


# One-hot encode the multi-category columns and merge the result into the data.
one_hot_encoder = OneHotEncoder(sparse_output=False)

# Each encoded frame explicitly reuses df's index. fit_transform returns a
# plain array, so without index=df.index the new frame would get a fresh
# 0..n-1 index. df is indexed by the first CSV column, and pd.concat(axis=1)
# aligns rows on index labels: whenever df's index is not exactly 0..n-1 the
# dummy columns would be attached to the wrong rows and extra NaN-filled rows
# would appear.
initial_admin_encoded = one_hot_encoder.fit_transform(df[['Initial_admin']])
initial_admin_encoded_df = pd.DataFrame(
    initial_admin_encoded,
    columns=one_hot_encoder.get_feature_names_out(['Initial_admin']),
    index=df.index,
)

services_encoded = one_hot_encoder.fit_transform(df[['Services']])
services_encoded_df = pd.DataFrame(
    services_encoded,
    columns=one_hot_encoder.get_feature_names_out(['Services']),
    index=df.index,
)

gender_encoded = one_hot_encoder.fit_transform(df[['Gender']])
gender_encoded_df = pd.DataFrame(
    gender_encoded,
    columns=one_hot_encoder.get_feature_names_out(['Gender']),
    index=df.index,
)

# Concatenate the encoded DataFrames column-wise; rows line up because all
# four frames share the same index.
df_combined = pd.concat([df, initial_admin_encoded_df, services_encoded_df, gender_encoded_df], axis=1)

# Check for missing values after concatenation
print("Missing values after concatenation:")
print(df_combined.isnull().sum())

# Handle missing values (if any)
# Numeric columns: fill with the column median
numeric_columns = df_combined.select_dtypes(include=['number']).columns
df_combined[numeric_columns] = df_combined[numeric_columns].fillna(df_combined[numeric_columns].median())

# Non-numeric columns: fill with the placeholder 'Unknown'
non_numeric_columns = df_combined.select_dtypes(exclude=['number']).columns
df_combined[non_numeric_columns] = df_combined[non_numeric_columns].fillna('Unknown')

# Check for missing values after handling
print("Missing values after handling:")
print(df_combined.isnull().sum())

# Drop the original categorical columns now that encoded versions exist;
# errors='ignore' tolerates columns that are already absent.
df_combined.drop(['ReAdmis', 'Gender', 'Initial_admin', 'HighBlood', 'Stroke', 'Overweight', 'Diabetes', 'Asthma', 'Services'], axis=1, inplace=True, errors='ignore')

# Assemble the model's feature list from three groups of columns
numeric_features = [
    'Population', 'Children', 'Age', 'Income', 'VitD_levels', 'Doc_visits',
    'Full_meals_eaten', 'vitD_supp', 'Initial_days', 'TotalCharge',
    'Additional_charges',
]
one_hot_features = [
    *initial_admin_encoded_df.columns,
    *services_encoded_df.columns,
    *gender_encoded_df.columns,
]
# Every label-encoded column except the target itself
binary_features = [f'{col}_encoded' for col in binary_columns if col != 'ReAdmis']
selected_columns = numeric_features + one_hot_features + binary_features

# Build the modelling frame: the selected features plus the encoded target,
# stored under the name 'ReAdmis' (assignment aligns on the row index)
tree_df = df_combined[selected_columns].copy()
tree_df['ReAdmis'] = df['ReAdmis_encoded']

# Report missing values in the final DataFrame
print("Missing values in the final DataFrame:")
print(tree_df.isnull().sum())

# Preview the first rows
print(tree_df.head())

# Persist the prepared data set without the index column
tree_df.to_csv("tree_df.csv", index=False)
print("tree_df has been saved to tree_df.csv.")


# Ensure there are no missing values in the target column
tree_df = tree_df.dropna(subset=['ReAdmis'])

# Separate features and target variable
X = tree_df.drop('ReAdmis', axis=1)
y = tree_df['ReAdmis']

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the initial (unconstrained) decision tree.
# random_state is fixed so that repeated runs build the same tree: the
# classifier uses randomness when choosing between candidate splits, so without
# a seed the node count, depth and feature importances reported later can
# change between runs. The same seed is inherited by any clone of this
# estimator.
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Recombine features and target so each split is saved as a single table
train_tree_df = pd.concat([X_train, y_train], axis=1)
test_tree_df = pd.concat([X_test, y_test], axis=1)

# Save the train and test DataFrames to CSV files (index omitted)
train_tree_df.to_csv("train_tree_df.csv", index=False)
test_tree_df.to_csv("test_tree_df.csv", index=False)

print("train_tree_df has been saved to 'train_tree_df.csv'.")
print("test_tree_df has been saved to 'test_tree_df.csv'.")






# Collect the size statistics of the fitted tree first: total node count,
# maximum depth, and number of leaves
num_nodes = decision_tree.tree_.node_count
tree_depth = decision_tree.get_depth()
num_leaves = decision_tree.get_n_leaves()

# Then report them together
print(f"Number of nodes in the tree: {num_nodes}")
print(f"Depth of the tree: {tree_depth}")
print(f"Number of leaves in the tree: {num_leaves}")



# Read the per-feature importances from the fitted tree and list them in
# the original column order
feature_importances = decision_tree.feature_importances_
for feature, importance in zip(X.columns, feature_importances):
    print(f"Feature: {feature}, Importance: {importance}")

# Tabulate the importances, most important feature first
importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(importance_df)

# Horizontal bar chart of the importances; the y-axis is inverted so the
# first (largest) entry is drawn at the top
importance_fig, importance_ax = plt.subplots(figsize=(12, 8))
importance_ax.barh(importance_df['Feature'], importance_df['Importance'])
importance_ax.set_xlabel('Importance')
importance_ax.set_ylabel('Feature')
importance_ax.set_title('Feature Importances in Decision Tree')
importance_ax.invert_yaxis()
plt.show()



# Draw the initially fitted decision tree on a dedicated 20x10 figure.
# Nodes are colour-filled with rounded corners, show sample proportions
# instead of counts, and print numbers with two decimals.
full_tree_fig, full_tree_ax = plt.subplots(figsize=(20, 10))
plot_tree(
    decision_tree,
    feature_names=X.columns.tolist(),
    class_names=["Class 0", "Class 1"],
    filled=True,
    rounded=True,
    proportion=True,
    precision=2,
    ax=full_tree_ax,
)
plt.show()



# Hyperparameter grid used to constrain tree growth
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with 5-fold cross-validation on the training set, scored by
# accuracy and run on all available cores (n_jobs=-1). GridSearchCV works on
# clones of decision_tree, so the already-fitted tree is left untouched.
grid_search = GridSearchCV(estimator=decision_tree, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model.
# With the default refit=True, best_estimator_ has already been refit on the
# whole of X_train / y_train using best_params_. Calling fit on it again would
# be redundant and, for an unseeded estimator, could replace it with a
# different tree than the one the search produced, so it is used as-is.
best_params = grid_search.best_params_
best_decision_tree = grid_search.best_estimator_

# Predict on the held-out test set
y_pred = best_decision_tree.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
# For 0/1 class labels the mean squared error is the misclassification rate
mse = mean_squared_error(y_test, y_pred)

print("Best Parameters:", best_params)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
print("Mean Squared Error:", mse)



# Draw the tuned (pruned) decision tree on its own 20x10 figure, using the
# same display options as the plot of the initial tree: filled, rounded
# nodes, proportions instead of counts, two-decimal precision.
pruned_tree_fig, pruned_tree_ax = plt.subplots(figsize=(20, 10))
plot_tree(
    best_decision_tree,
    feature_names=X.columns.tolist(),
    class_names=["Class 0", "Class 1"],
    filled=True,
    rounded=True,
    proportion=True,
    precision=2,
    ax=pruned_tree_ax,
)
plt.show()