In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Assuming log_reg, X_encoded, y_test, y_pred are available from previous context

# 1. Confusion Matrix
# Tabulate true vs. predicted labels and render the counts as an annotated
# heatmap. The same tick labels are used on both axes, so define them once.
class_labels = ['Low Telehealth (0)', 'High Telehealth (1)']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cells are integer counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix for Telehealth Classification')
fig.savefig("confusion_matrix.png")
# Close the figure so it does not stay open after being written to disk.
plt.close(fig)

# 2. Feature Coefficients Graph
# Pair every encoded feature name with its fitted coefficient. Only the first
# row of `coef_` is read (presumably the single row of a binary model -- confirm
# against how `log_reg` was fitted).
coefficients = log_reg.coef_[0]
feature_names = X_encoded.columns

# Order features by absolute coefficient, largest first. Sorting with
# `key=np.abs` avoids adding and then dropping a temporary magnitude column.
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df = coef_df.sort_values(by='Coefficient', key=np.abs, ascending=False)

# Keep the `top_n` largest-magnitude features. `head` simply returns the whole
# frame when there are fewer than `top_n` rows, so no length check is needed.
# The selection is then re-sorted by signed value so the chart runs from the
# most positive coefficient down to the most negative.
top_n = 15
top_features = coef_df.head(top_n).sort_values(by='Coefficient', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x='Coefficient', y='Feature', data=top_features, palette='vlag')
# Report the number of bars actually drawn, which may be below `top_n`.
plt.title(f'Top {len(top_features)} Logistic Regression Coefficients')
plt.xlabel('Coefficient Value (Influence on Log Odds of High Telehealth)')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig("feature_coefficients.png")
plt.close()

print("Confusion matrix and feature coefficients plot saved.")

# 3. Correlation Matrix Calculation (preparation for explanation)
# One-hot encoding produces many columns, so the full pairwise correlation
# matrix is large (noted as 76x76 for this dataset). It is computed and kept in
# `correlation_matrix`, but only the feature count is printed.
#
# No subset is written to disk: exporting a small slice (for example the first
# 11 columns via `DataFrame.to_csv`) was considered, but judged unlikely to be
# informative for one-hot-encoded data.
n_features = X_encoded.shape[1]
correlation_matrix = X_encoded.corr()
print(f"Number of features in the encoded dataset: {n_features}")

NameError: name 'y_test' is not defined