## Telemed - Testing

In [1]:
import pandas as pd
import numpy as np

# Load the dataset
file_name = "TMEDTREND_PUBLIC_250827.csv"
df = pd.read_csv(file_name)

# Inspect the data
print(df.head())
print(df.info())

   Year  quarter Bene_Geo_Desc Bene_Mdcd_Mdcr_Enrl_Stus Bene_Race_Desc  \
0  2020  Overall      National                      All            All   
1  2020  Overall      National                      All            All   
2  2020  Overall      National                      All            All   
3  2020  Overall      National                      All            All   
4  2020  Overall      National                      All            All   

  Bene_Sex_Desc Bene_Mdcr_Entlmt_Stus Bene_Age_Desc Bene_RUCA_Desc  \
0           All                   All           All            All   
1           All                   All           All          Rural   
2           All                   All           All          Urban   
3           All                   All           All        Unknown   
4           All                   All          0-64            All   

   Total_Bene_TH_Elig  Total_PartB_Enrl  Total_Bene_Telehealth  Pct_Telehealth  
0          30946785.0      3.224489e+07             1

In [2]:
# 1. Drop rows with missing Pct_Telehealth
df_clean = df.dropna(subset=['Pct_Telehealth']).copy()

# 2. Create Binary Target Variable
# Calculate the median of Pct_Telehealth
median_telehealth = df_clean['Pct_Telehealth'].median()
print(f"Median Pct_Telehealth: {median_telehealth}")

# Create the binary target: 1 if Pct_Telehealth > median, 0 otherwise
df_clean['High_Telehealth'] = (df_clean['Pct_Telehealth'] > median_telehealth).astype(int)

# Drop the original 'Pct_Telehealth' and the aggregate count columns
columns_to_drop = [
    'Pct_Telehealth',
    'Total_Bene_TH_Elig',
    'Total_PartB_Enrl',
    'Total_Bene_Telehealth',
    'Bene_Mdcd_Mdcr_Enrl_Stus', # This column is mostly 'All' based on snippet, let's check unique values.
    'Bene_Mdcr_Entlmt_Stus' # This column is mostly 'All' based on snippet, let's check unique values.
]
df_clean = df_clean.drop(columns=columns_to_drop, errors='ignore')

# Check unique values for key categorical columns before one-hot encoding
categorical_cols = ['quarter', 'Bene_Geo_Desc', 'Bene_Race_Desc', 'Bene_Sex_Desc', 'Bene_Age_Desc', 'Bene_RUCA_Desc']
for col in categorical_cols:
    print(f"\nUnique values in {col}: {df_clean[col].nunique()}")
    print(df_clean[col].value_counts(normalize=True).head())

Median Pct_Telehealth: 0.1776

Unique values in quarter: 5
quarter
1          0.230565
Overall    0.192645
2          0.192287
3          0.192287
4          0.192215
Name: proportion, dtype: float64

Unique values in Bene_Geo_Desc: 55
Bene_Geo_Desc
National    0.098686
Alabama     0.017689
Alaska      0.017689
Arizona     0.017689
Arkansas    0.017689
Name: proportion, dtype: float64

Unique values in Bene_Race_Desc: 6
Bene_Race_Desc
All                       0.700111
Black/African American    0.060515
Non-Hispanic White        0.060515
Hispanic                  0.060515
Asian/Pacific Islander    0.060479
Name: proportion, dtype: float64

Unique values in Bene_Sex_Desc: 3
Bene_Sex_Desc
All       0.890142
Female    0.054929
Male      0.054929
Name: proportion, dtype: float64

Unique values in Bene_Age_Desc: 5
Bene_Age_Desc
All            0.780284
0-64           0.054929
65-74          0.054929
75-84          0.054929
85 and over    0.054929
Name: proportion, dtype: float64

Unique valu

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Define features (X) and target (y)
X = df_clean.drop('High_Telehealth', axis=1)
y = df_clean['High_Telehealth']

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include='object').columns.tolist()
# Year is the only non-categorical, non-target column left
numerical_cols = ['Year']

# 1. One-Hot Encode Categorical Features
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Align the 'Year' column with the standardizer (even though it's just one column)
# This step is often good practice if multiple numerical columns exist.
scaler = StandardScaler()
X_encoded[numerical_cols] = scaler.fit_transform(X_encoded[numerical_cols])

# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# 3. Train Logistic Regression Model
log_reg = LogisticRegression(solver='liblinear', random_state=42)
log_reg.fit(X_train, y_train)

# 4. Predict and Evaluate
y_pred = log_reg.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Model Accuracy: {accuracy}")
print("\nClassification Report:\n", report)

# Display the first few feature columns (post-encoding) for context
print("\nFirst few columns of encoded features (X_encoded.head()):")
print(X_encoded.head())

Model Accuracy: 0.8093447905477981

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.82      0.81      2722
           1       0.83      0.80      0.81      2864

    accuracy                           0.81      5586
   macro avg       0.81      0.81      0.81      5586
weighted avg       0.81      0.81      0.81      5586


First few columns of encoded features (X_encoded.head()):
       Year  quarter_2  quarter_3  quarter_4  quarter_Overall  \
0 -1.407601      False      False      False             True   
1 -1.407601      False      False      False             True   
2 -1.407601      False      False      False             True   
4 -1.407601      False      False      False             True   
5 -1.407601      False      False      False             True   

   Bene_Geo_Desc_Alaska  Bene_Geo_Desc_Arizona  Bene_Geo_Desc_Arkansas  \
0                 False                  False                   False   
1              

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

# Assuming log_reg, X_encoded, y_test, y_pred are available from previous context

# 1. Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Low Telehealth (0)', 'High Telehealth (1)'],
            yticklabels=['Low Telehealth (0)', 'High Telehealth (1)'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix for Telehealth Classification')
plt.savefig("confusion_matrix.png")
plt.close()

# 2. Feature Coefficients Graph
# Get coefficients and feature names
coefficients = log_reg.coef_[0]
feature_names = X_encoded.columns

# Combine, sort by magnitude, and select top/bottom features
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df['Abs_Coefficient'] = np.abs(coef_df['Coefficient'])
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False).drop(columns='Abs_Coefficient')

# Select the top 15 features by magnitude
top_n = 15
top_features = pd.concat([coef_df.head(top_n // 2), coef_df.tail(top_n // 2)]).sort_values(by='Coefficient', ascending=False)
if len(coef_df) >= top_n:
    top_features = coef_df.iloc[:top_n].sort_values(by='Coefficient', ascending=False)
else:
    # If there are fewer than 15 features, just plot all of them
    top_features = coef_df.sort_values(by='Coefficient', ascending=False)

plt.figure(figsize=(10, 8))
sns.barplot(x='Coefficient', y='Feature', data=top_features, palette='vlag')
plt.title(f'Top {len(top_features)} Logistic Regression Coefficients')
plt.xlabel('Coefficient Value (Influence on Log Odds of High Telehealth)')
plt.ylabel('Feature')
plt.tight_layout()
plt.savefig("feature_coefficients.png")
plt.close()

print("Confusion matrix and feature coefficients plot saved.")

# 3. Correlation Matrix Calculation (preparation for explanation)
# Due to the large number of features (many from one-hot encoding), 
# the full correlation matrix is huge (76x76). I will calculate it,
# but only print the number of features and explain the nature of correlation in OHE data.
correlation_matrix = X_encoded.corr()
print(f"Number of features in the encoded dataset: {len(X_encoded.columns)}")

# I will save a small, representative subset to CSV for user, perhaps focusing on the 'Year' and a few key geo/demographic features.
# Let's select 'Year' and the first 10 OHE columns for a snippet.
# feature_subset = X_encoded.columns[:11]
# subset_corr_matrix = X_encoded[feature_subset].corr()
# subset_corr_matrix.to_csv("subset_correlation_matrix.csv")
# I will proceed with the explanation without saving a matrix as it is unlikely to be very informative in this case, but I will write the explanation.


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x='Coefficient', y='Feature', data=top_features, palette='vlag')


Confusion matrix and feature coefficients plot saved.
Number of features in the encoded dataset: 72
