In [None]:

# 📌 Step 1: Import Libraries


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix


# Step 2: Load and Clean Data

file_path = r"D:\medical_billing_denial_analysis\data\data.csv"
# skiprows=2: the first two lines of the file are skipped before the header row is read.
df = pd.read_csv(file_path, skiprows=2)

# Normalise headers: trim surrounding whitespace and use underscores instead of spaces.
df.columns = df.columns.str.strip().str.replace(" ", "_")

# Drop "#" column
if "#" in df.columns:
    df = df.drop(columns=["#"])

# Create denial flag: 1 when a denial reason is present, 0 when it is missing.
df["Denied"] = df["Denial_Reason"].notnull().astype(int)

# Remove $ and thousands separators, then convert amounts to float.
# The pattern is a raw string: in a normal string literal "\$" is an invalid
# escape sequence and makes Python emit a warning every time the cell is compiled.
for col in ["Payment_Amount", "Balance"]:
    df[col] = df[col].replace(r'[\$,]', '', regex=True).astype(float)

print(df.head())

# Step 3: Encode Categorical Features

# One encoder per categorical column; they are kept as separate objects so each
# one can be reused (and persisted) independently of the others.
le_cpt, le_payer, le_physician = LabelEncoder(), LabelEncoder(), LabelEncoder()

cpt_as_text = df["CPT_Code"].astype(str)
payer_as_text = df["Insurance_Company"].astype(str)
physician_as_text = df["Physician_Name"].astype(str)

df["CPT_Code_enc"] = le_cpt.fit_transform(cpt_as_text)
df["Insurance_Company_enc"] = le_payer.fit_transform(payer_as_text)
df["Physician_Name_enc"] = le_physician.fit_transform(physician_as_text)

# Features and target
y = df["Denied"]
X = df[
    [
        "CPT_Code_enc",
        "Insurance_Company_enc",
        "Physician_Name_enc",
        "Payment_Amount",
        "Balance",
    ]
]


# Step 4: Train-Test Split

# 80/20 split; stratify=y keeps the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


# Step 5: Train Model

model = RandomForestClassifier(class_weight="balanced", random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate Model

y_pred = model.predict(X_test)

# Each heading and its table are printed on consecutive lines.
print("\n=== Classification Report ===", classification_report(y_test, y_pred), sep="\n")

print("\n=== Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")


  df["Payment_Amount"] = df["Payment_Amount"].replace('[\$,]', '', regex=True).astype(float)
  df["Balance"] = df["Balance"].replace('[\$,]', '', regex=True).astype(float)


   CPT_Code Insurance_Company Physician_Name  Payment_Amount  Balance  \
0   99213.0          Medicare      Dr. Smith             0.0    100.0   
1   99214.0             Aetna    Dr. Johnson            80.0     20.0   
2   99215.0             Cigna        Dr. Lee             0.0    150.0   
3   93000.0  UnitedHealthcare      Dr. Patel            50.0      0.0   
4   99212.0        Blue Cross        Dr. Kim             0.0     75.0   

                      Denial_Reason  Denied  
0          16 - Missing information       1  
1                               NaN       0  
2  45 - Charge exceeds fee schedule       1  
3                               NaN       0  
4          96 - Non-covered service       1  

=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         3
           1       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00   

In [None]:
import joblib
import os


# Step 7: Save Model and Encoders

model_dir = r"D:\medical_billing_denial_analysis\modelsbefore"
os.makedirs(model_dir, exist_ok=True)  # no error if the folder already exists

model_path = os.path.join(model_dir, "denial_prediction_model.pkl")

# Written in this order: the model first, then the three label encoders.
for artifact, target in (
    (model, model_path),
    (le_cpt, os.path.join(model_dir, "le_cpt.pkl")),
    (le_payer, os.path.join(model_dir, "le_payer.pkl")),
    (le_physician, os.path.join(model_dir, "le_physician.pkl")),
):
    joblib.dump(artifact, target)

print(f"✅ Model saved at: {model_path}")


✅ Model saved at: D:\medical_billing_denial_analysis\modelsbefore\denial_prediction_model.pkl


In [2]:
# Explicit %pip magic: works even when IPython automagic is turned off.
%pip install streamlit

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Synthetic Data Generation

In [5]:
import pandas as pd
import numpy as np
import random

# Set random seeds for reproducibility.
# Both generators are seeded: generate_claim() picks codes, payers, physicians and
# denial reasons with the stdlib `random` module and only draws the payment
# fraction from NumPy, and np.random.seed() has no effect on stdlib `random`.
random.seed(42)
np.random.seed(42)

# Reference values
cpt_codes = ["99212", "99213", "99214", "99215", "93000"]
payers = ["Medicare", "Aetna", "Cigna", "UnitedHealthcare", "Blue Cross"]
physicians = ["Dr. Smith", "Dr. Johnson", "Dr. Lee", "Dr. Patel", "Dr. Kim"]
# Order matters: generate_claim() pairs these entries positionally with its weights.
denial_reasons = [
    "16 - Missing information",
    "45 - Charge exceeds fee schedule",
    "96 - Non-covered service",
    "197 - Precertification/authorization absent",
    None  # no denial
]

# Function to generate synthetic row
def generate_claim():
    """Build one synthetic claim row.

    Reads the module-level lists ``cpt_codes``, ``payers``, ``physicians`` and
    ``denial_reasons``.

    Returns:
        list: ``[cpt, payer, physician, payment, balance, denial]`` where
        ``payment`` and ``balance`` are "$"-prefixed strings with two decimals
        and ``denial`` is a reason string, or ``None`` for a paid claim.
    """
    code = random.choice(cpt_codes)
    insurer = random.choice(payers)
    doctor = random.choice(physicians)

    # Flat charge per CPT code ("93000" is the ECG test); any code that is not
    # listed here is charged 150, which is the 99215 rate.
    charge_by_code = {"93000": 50, "99212": 75, "99213": 100, "99214": 120}
    total_charge = charge_by_code.get(code, 150)

    # Weights line up positionally with denial_reasons; the last entry (weight 0.3)
    # is the "no denial" case.
    denial = random.choices(denial_reasons, weights=[0.2,0.2,0.2,0.1,0.3])[0]

    if denial is not None:
        # Denied: nothing is paid and the whole charge stays open.
        payment = 0.00
        balance = total_charge
    else:
        # Paid: between 70% and 100% of the charge, the rest remains as balance.
        payment = round(total_charge * np.random.uniform(0.7, 1.0), 2)
        balance = round(total_charge - payment, 2)

    return [code, insurer, doctor, f"${payment:.2f}", f"${balance:.2f}", denial]

# Generate dataset
rows = [generate_claim() for _ in range(2000)]  # 2000 rows

# Column order matches the order of the values returned by generate_claim().
df_synth = pd.DataFrame(
    rows,
    columns=[
        "CPT_Code",
        "Insurance_Company",
        "Physician_Name",
        "Payment_Amount",
        "Balance",
        "Denial_Reason",
    ],
)

# Save synthetic dataset (index=False: the row index is not written to the file)
file_path = r"D:\medical_billing_denial_analysis\data\synthetic_data.csv"
df_synth.to_csv(file_path, index=False)

print(f"✅ Synthetic dataset created with {len(df_synth)} rows at {file_path}")
print(df_synth.head())


✅ Synthetic dataset created with 2000 rows at D:\medical_billing_denial_analysis\data\synthetic_data.csv
  CPT_Code Insurance_Company Physician_Name Payment_Amount  Balance  \
0    99213             Aetna      Dr. Smith          $0.00  $100.00   
1    99214             Aetna        Dr. Kim          $0.00  $120.00   
2    99212          Medicare        Dr. Lee         $60.93   $14.07   
3    99212  UnitedHealthcare        Dr. Kim         $73.89    $1.11   
4    99215        Blue Cross        Dr. Lee        $137.94   $12.06   

              Denial_Reason  
0  96 - Non-covered service  
1  96 - Non-covered service  
2                      None  
3                      None  
4                      None  


In [None]:

# 📌 Step 1: Import Libraries

import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib


# 📌 Step 2: Load Data

file_path = r"D:\medical_billing_denial_analysis\data\synthetic_data.csv"
df = pd.read_csv(file_path)

print("✅ Data Loaded:", df.shape)
print(df.head())


# 📌 Step 3: Preprocessing


# Strip column names and replace inner spaces with underscores
df.columns = df.columns.str.strip().str.replace(" ", "_")

# Drop "#" if it exists (errors="ignore" makes this a no-op when it is absent)
df = df.drop(columns=["#"], errors="ignore")

# Handle money columns: text -> remove "$" and "," -> empty string becomes "0" -> float
for col in ["Payment_Amount", "Balance"]:
    amount_text = df[col].astype(str)
    amount_text = amount_text.replace(r'[\$,]', '', regex=True)
    amount_text = amount_text.replace('', '0')
    df[col] = amount_text.astype(float)

# Create Denied flag: 1 when a denial reason is present, 0 when it is missing
df["Denied"] = df["Denial_Reason"].notna().astype(int)

# Fill missing categorical values
for cat_col in ("CPT_Code", "Insurance_Company", "Physician_Name"):
    df[cat_col] = df[cat_col].fillna("Unknown")


# 📌 Step 4: Feature Engineering


# Payment to total ratio (the 1e-6 keeps the division defined when both amounts are 0)
df["Payment_Ratio"] = df["Payment_Amount"] / (df["Payment_Amount"] + df["Balance"] + 1e-6)

# NOTE(review): Payment_Amount, Balance and Payment_Ratio describe the outcome of a
# claim. In the synthetic data every denied claim is generated with a payment of
# exactly 0 and every paid claim with a positive payment, so these columns separate
# the two classes on their own. They are kept so the model's feature set stays
# unchanged - confirm whether they are really known at prediction time.


# 📌 Step 5: Encode Categorical Features

le_cpt = LabelEncoder()
le_payer = LabelEncoder()
le_physician = LabelEncoder()

df["CPT_Code_enc"] = le_cpt.fit_transform(df["CPT_Code"].astype(str))
df["Insurance_Company_enc"] = le_payer.fit_transform(df["Insurance_Company"].astype(str))
df["Physician_Name_enc"] = le_physician.fit_transform(df["Physician_Name"].astype(str))


# 📌 Step 6: Train-Test Split

y = df["Denied"]

# Split the rows BEFORE deriving the denial-rate features. The rates are averages
# of the target, so computing them over all rows would leak test-set labels into
# the features. Same test_size / random_state / stratify as before, so the same
# rows end up in each part.
train_rows, test_rows = train_test_split(
    df, test_size=0.2, random_state=42, stratify=y
)

# Denial rate per CPT and per payer, estimated from the training rows only
cpt_denial_rate = train_rows.groupby("CPT_Code")["Denied"].mean()
payer_denial_rate = train_rows.groupby("Insurance_Company")["Denied"].mean()

# Fallback for a category that occurs only in the test rows (no training estimate)
train_denial_rate = train_rows["Denied"].mean()

df["CPT_Denial_Rate"] = df["CPT_Code"].map(cpt_denial_rate).fillna(train_denial_rate)
df["Payer_Denial_Rate"] = df["Insurance_Company"].map(payer_denial_rate).fillna(train_denial_rate)

features = [
    "CPT_Code_enc", "Insurance_Company_enc", "Physician_Name_enc",
    "Payment_Amount", "Balance", "Payment_Ratio",
    "CPT_Denial_Rate", "Payer_Denial_Rate"
]

X = df[features]

# Slice features and target with the row labels chosen by the split above
X_train, X_test = X.loc[train_rows.index], X.loc[test_rows.index]
y_train, y_test = y.loc[train_rows.index], y.loc[test_rows.index]

# 📌 Step 7: Train Model with Cross-Validation

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=12,
    class_weight="balanced",
    random_state=42,
)

# Cross-validation: 5 folds over the full X / y, scored with F1
cv_scores = cross_val_score(model, X, y, scoring="f1", cv=5)
print(f"✅ Cross-Validation F1 Scores: {cv_scores}")
print(f"✅ Average F1 Score: {cv_scores.mean():.4f}")

# Final training on train set
model.fit(X_train, y_train)


# 📌 Step 8: Evaluate Model

y_pred = model.predict(X_test)

# Each heading and its table are printed on consecutive lines.
print("\n=== Classification Report ===", classification_report(y_test, y_pred), sep="\n")

print("\n=== Confusion Matrix ===", confusion_matrix(y_test, y_pred), sep="\n")


# 📌 Step 9: Save Model & Encoders

# Make sure the output folder exists before writing into it; exist_ok=True makes
# this a no-op when the folder is already there.
model_dir = r"D:\medical_billing_denial_analysis\models"
os.makedirs(model_dir, exist_ok=True)

joblib.dump(model, os.path.join(model_dir, "denial_model.pkl"))
joblib.dump(le_cpt, os.path.join(model_dir, "le_cpt.pkl"))
joblib.dump(le_payer, os.path.join(model_dir, "le_payer.pkl"))
joblib.dump(le_physician, os.path.join(model_dir, "le_physician.pkl"))

print("✅ Model & Encoders saved successfully.")


✅ Data Loaded: (2000, 6)
   CPT_Code Insurance_Company Physician_Name Payment_Amount  Balance  \
0     99213             Aetna      Dr. Smith          $0.00  $100.00   
1     99214             Aetna        Dr. Kim          $0.00  $120.00   
2     99212          Medicare        Dr. Lee         $60.93   $14.07   
3     99212  UnitedHealthcare        Dr. Kim         $73.89    $1.11   
4     99215        Blue Cross        Dr. Lee        $137.94   $12.06   

              Denial_Reason  
0  96 - Non-covered service  
1  96 - Non-covered service  
2                       NaN  
3                       NaN  
4                       NaN  
✅ Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
✅ Average F1 Score: 1.0000

=== Classification Report ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       122
           1       1.00      1.00      1.00       278

    accuracy                           1.00       400
   macro avg       1.00      1.00      