# Claude programming code responses test for diabetes dataset

## Prompt 1

In [14]:
#LLM code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Load the data
print("Loading diabetes data...")
df = pd.read_csv('../diabetesdatasets/diabetic_data.csv')

def analyze_demographic_distribution(df):
    # Analyze race and gender distribution
    print("\nDemographic Distribution Analysis:")
    print("\nRace distribution:")
    race_dist = df['race'].value_counts()
    print(race_dist)
    print("\nGender distribution:")
    gender_dist = df['gender'].value_counts()
    print(gender_dist)
    
    # Calculate intersectional demographics
    print("\nIntersectional demographics (Race x Gender):")
    intersectional = pd.crosstab(df['race'], df['gender'])
    print(intersectional)
    
    return race_dist, gender_dist, intersectional

def analyze_outcome_disparities(df):
    # Analyze readmission rates by demographic groups
    print("\nReadmission Rate Analysis:")
    
    # Convert readmission to binary (any readmission vs no readmission)
    df['readmitted_binary'] = df['readmitted'].map({'NO': 0, '>30': 1, '<30': 1})
    
    # Calculate readmission rates by race
    race_readmission = df.groupby('race')['readmitted_binary'].agg(['mean', 'count'])
    race_readmission.columns = ['readmission_rate', 'sample_size']
    print("\nReadmission rates by race:")
    print(race_readmission)
    
    # Calculate readmission rates by gender
    gender_readmission = df.groupby('gender')['readmitted_binary'].agg(['mean', 'count'])
    gender_readmission.columns = ['readmission_rate', 'sample_size']
    print("\nReadmission rates by gender:")
    print(gender_readmission)
    
    # Calculate intersectional readmission rates
    intersectional_readmission = df.groupby(['race', 'gender'])['readmitted_binary'].agg(['mean', 'count'])
    intersectional_readmission.columns = ['readmission_rate', 'sample_size']
    print("\nIntersectional readmission rates:")
    print(intersectional_readmission)
    
    return race_readmission, gender_readmission, intersectional_readmission

def analyze_treatment_disparities(df):
    # Analyze treatment patterns across demographics
    treatment_cols = ['metformin', 'insulin', 'diabetesMed']
    
    print("\nTreatment Pattern Analysis:")
    for treatment in treatment_cols:
        print(f"\n{treatment} usage rates:")
        # By race
        race_treatment = df.groupby('race')[treatment].apply(lambda x: (x == 'Yes').mean())
        print("\nBy race:")
        print(race_treatment)
        # By gender
        gender_treatment = df.groupby('gender')[treatment].apply(lambda x: (x == 'Yes').mean())
        print("\nBy gender:")
        print(gender_treatment)

def plot_demographic_disparities(race_dist, gender_dist, race_readmission, gender_readmission):
    # Create a figure with multiple subplots
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))
    
    # Plot demographic distributions
    race_dist.plot(kind='bar', ax=ax1, title='Race Distribution')
    ax1.set_ylabel('Count')
    ax1.tick_params(axis='x', rotation=45)
    
    gender_dist.plot(kind='bar', ax=ax2, title='Gender Distribution')
    ax2.set_ylabel('Count')
    
    # Plot readmission rates
    race_readmission['readmission_rate'].plot(kind='bar', ax=ax3, title='Readmission Rates by Race')
    ax3.set_ylabel('Readmission Rate')
    ax3.tick_params(axis='x', rotation=45)
    
    gender_readmission['readmission_rate'].plot(kind='bar', ax=ax4, title='Readmission Rates by Gender')
    ax4.set_ylabel('Readmission Rate')
    
    plt.tight_layout()
    plt.savefig('diabetes_disparities.png')
    plt.close()

def calculate_disparity_metrics(race_readmission, gender_readmission):
    # Calculate disparity metrics
    print("\nDisparity Metrics:")
    
    # Race disparities
    race_max_rate = race_readmission['readmission_rate'].max()
    race_min_rate = race_readmission['readmission_rate'].min()
    race_disparity_ratio = race_max_rate / race_min_rate
    
    print(f"\nRace disparity ratio (max/min readmission rate): {race_disparity_ratio:.2f}")
    
    # Gender disparities
    gender_max_rate = gender_readmission['readmission_rate'].max()
    gender_min_rate = gender_readmission['readmission_rate'].min()
    gender_disparity_ratio = gender_max_rate / gender_min_rate
    
    print(f"Gender disparity ratio (max/min readmission rate): {gender_disparity_ratio:.2f}")

# Main execution
race_dist, gender_dist, intersectional = analyze_demographic_distribution(df)
race_readmission, gender_readmission, intersectional_readmission = analyze_outcome_disparities(df)
analyze_treatment_disparities(df)
plot_demographic_disparities(race_dist, gender_dist, race_readmission, gender_readmission)
calculate_disparity_metrics(race_readmission, gender_readmission)

# Print recommendations
print("\nRecommendations for Fair Model Training:")
print("\n1. Data Rebalancing Strategies:")
for race, count in race_dist.items():
    if count / len(df) < 0.1:  # If race represents less than 10% of the data
        print(f"- Consider oversampling {race} patients to increase representation")

print("\n2. Model Development Recommendations:")
print("- Use stratified sampling for train/test splits based on both race and gender")
print("- Implement weighted loss functions to account for demographic imbalances")
print("- Monitor and validate performance separately for each demographic group")

print("\n3. Clinical Implications:")
print("- Consider creating separate risk thresholds for different demographic groups")
print("- Implement regular bias audits in the deployment pipeline")
print("- Document any identified disparities for clinical review") 

Loading diabetes data...

Demographic Distribution Analysis:

Race distribution:
race
Caucasian          76099
AfricanAmerican    19210
?                   2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

Gender distribution:
gender
Female             54708
Male               47055
Unknown/Invalid        3
Name: count, dtype: int64

Intersectional demographics (Race x Gender):
gender           Female   Male  Unknown/Invalid
race                                           
?                  1133   1138                2
AfricanAmerican   11728   7482                0
Asian               318    323                0
Caucasian         39689  36410                0
Hispanic           1092    945                0
Other               748    757                1

Readmission Rate Analysis:

Readmission rates by race:
                 readmission_rate  sample_size
race                                          
?                        0.3

ZeroDivisionError: float division by zero

### Output/Note
Float division by zero for 
To fix this:
Drop any groups with zero readmission rate before computing the ratio,
Exclude very small groups (e.g. fewer than a minimum sample size), or
Compute disparities using a difference (max – min) instead of a ratio when the min can be zero.

The dataset is predominantly composed of Caucasian patients (76,099) and females (54,708), with smaller representations of Black (19,210), Hispanic (2,037), Asian (641), and "unknown/other" groups. Readmission rates by race range from 31.9 percent in the unknown-race group to 47.0 percent among Caucasians, indicating a disparity of approximately 1.47 times. Females have a readmission rate of 46.9 percent compared to 45.1 percent for males. Intersectional patterns by race and gender reflect similar trends.

Anomalies are present in the data: both metformin and insulin show 0 percent usage, which is likely due to data coding errors. However, the variable "diabetesMed" indicates prescription rates ranging from approximately 74 percent among Asian patients to 82 percent among those in the "unknown" category, with both male and female groups averaging between 76 and 78 percent.

In [12]:
#LLM code 1
# Implement weighted sampling strategy

import numpy as np

def calculate_balanced_weights(df):
    weights = np.ones(len(df))
    
    # Race weights (inverse frequency)
    race_weights = 1 / df['race'].value_counts(normalize=True)
    for race in race_weights.index:
        weights[df['race'] == race] *= race_weights[race]
    
    # Adjust for outcome imbalance within groups
    for race in df['race'].unique():
        race_mask = df['race'] == race
        readmit_rate = df[race_mask]['readmitted_binary'].mean()
        weights[race_mask & (df['readmitted_binary'] == 1)] *= (1 / readmit_rate)
        weights[race_mask & (df['readmitted_binary'] == 0)] *= (1 / (1 - readmit_rate))
    
    return weights

df['readmitted_binary'] = (df['readmitted'] == '<30').astype(int)
sample_weights = calculate_balanced_weights(df)

In [33]:
#New code 1
import pandas as pd
import numpy as np

# 1. Load and prepare
df = pd.read_csv("../diabetesdatasets/diabetic_data.csv")
df["readmitted_binary"] = df["readmitted"].map({"NO":0, ">30":1, "<30":1})

# 2. Compute weights
def calculate_balanced_weights(df):
    # start with all ones
    weights = np.ones(len(df), dtype=float)
    
    # a) inverse-frequency by race
    race_freq = df["race"].value_counts(normalize=True)
    for race, freq in race_freq.items():
        weights[df["race"] == race] *= 1.0 / freq

    # b) within-race outcome balancing
    for race in df["race"].unique():
        mask = df["race"] == race
        # skip races with zero positives or negatives
        pos_rate = df.loc[mask, "readmitted_binary"].mean()
        if pos_rate in (0,1):
            continue
        # boost positives and negatives inversely to their within-race rates
        weights[mask & (df["readmitted_binary"] == 1)] *= 1.0 / pos_rate
        weights[mask & (df["readmitted_binary"] == 0)] *= 1.0 / (1.0 - pos_rate)
    
    # normalize so average weight = 1
    return weights / np.mean(weights)

df["sample_weight"] = calculate_balanced_weights(df)

# 3. Quick check
print(df.groupby("race")["sample_weight"].mean())
print(df.groupby("race")["sample_weight"].apply(lambda x: np.sum(x[df.loc[x.index, "readmitted_binary"]==1]) / np.sum(x)))


race
?                   7.461945
AfricanAmerican     0.882926
Asian              26.460218
Caucasian           0.222881
Hispanic            8.326460
Other              11.262284
Name: sample_weight, dtype: float64
race
?                  0.5
AfricanAmerican    0.5
Asian              0.5
Caucasian          0.5
Hispanic           0.5
Other              0.5
Name: sample_weight, dtype: float64


In [8]:
#LLM code 2:
#sampling strategies
# Two-phase sampling strategy
def balanced_sample_generator(df):
    # Phase 1: Oversample minority races
    race_sampler = SMOTE(sampling_strategy={
        'Asian': 15000,  # Increase to ~15% representation
        'Hispanic': 15000,
        'Other': 15000
    })
    
    # Phase 2: Balance outcomes within each racial group
    stratified_sampler = StratifiedKFold(n_splits=5, shuffle=True)
    
    return combined_samples


In [41]:
import pandas as pd
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold

# 1. Load and prepare
df = pd.read_csv("../diabetesdatasets/diabetic_data.csv")
df["readmitted_binary"] = df["readmitted"].map({"NO":0, ">30":1, "<30":1})

# 2. Phase 1: Randomly oversample by race
#    target ~15% of total for each minority race
total = len(df)
target_n = int(0.15 * total)

ros = RandomOverSampler(
    sampling_strategy={
        "Asian":    target_n,
        "Hispanic": target_n,
        "Other":    target_n
    },
    random_state=42
)

# fit_resample on the entire DataFrame and race column
df_ros, race_ros = ros.fit_resample(df, df["race"])
print("Before oversample:", Counter(df["race"]))
print("After  oversample:", Counter(race_ros))

# 3. Prepare X, y, and grouping column
X_ros = df_ros.drop(columns=["race", "readmitted", "readmitted_binary"])
y_ros = df_ros["readmitted_binary"].values
groups = df_ros["race"].astype(str) + "__" + df_ros["gender"].astype(str)

# 4. Phase 2: Stratified folds by race+gender
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_ros, y_ros, groups)):
    print(f"Fold {fold}: train={len(train_idx)}, val={len(val_idx)}")


Before oversample: Counter({'Caucasian': 76099, 'AfricanAmerican': 19210, '?': 2273, 'Hispanic': 2037, 'Other': 1506, 'Asian': 641})
After  oversample: Counter({'Caucasian': 76099, 'AfricanAmerican': 19210, 'Other': 15264, 'Asian': 15264, 'Hispanic': 15264, '?': 2273})
Fold 0: train=114699, val=28675
Fold 1: train=114699, val=28675
Fold 2: train=114699, val=28675
Fold 3: train=114699, val=28675
Fold 4: train=114700, val=28674


In [6]:
#LLM code 3
#model level adjustments
# Custom loss function with fairness constraints
def fairness_aware_loss(y_true, y_pred, group_membership):
    base_loss = binary_crossentropy(y_true, y_pred)
    
    # Add demographic parity constraint
    demographic_penalty = calculate_demographic_disparity(y_pred, group_membership)
    
    # Add equal opportunity constraint
    opportunity_penalty = calculate_equal_opportunity_disparity(y_true, y_pred, group_membership)
    
    return base_loss + 0.1 * demographic_penalty + 0.1 * opportunity_penalty


In [53]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.impute         import SimpleImputer
from sklearn.preprocessing  import OneHotEncoder, StandardScaler
from sklearn.compose        import ColumnTransformer
from sklearn.pipeline       import Pipeline

# 1. Load data and create binary readmission label
df = pd.read_csv("../diabetesdatasets/diabetic_data.csv")
df["readmitted_binary"] = df["readmitted"].map({"NO": 0, ">30": 1, "<30": 1})

# 2. Separate features and target
y = df["readmitted_binary"]
X = df.drop(columns=["readmitted", "readmitted_binary"])

# 3. Train–test split stratified on race
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=df["race"], random_state=42
)

# 4. Auto-detect categorical vs numeric
cat_cols = X_train_df.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = [c for c in X_train_df.columns if c not in cat_cols]

# 5. Build preprocessing pipelines
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot",  OneHotEncoder(handle_unknown="ignore"))
])
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scale",   StandardScaler())
])
preprocessor = ColumnTransformer([
    ("cat", cat_pipeline, cat_cols),
    ("num", num_pipeline, num_cols)
])

# 6. Fit and transform
X_train_proc = preprocessor.fit_transform(X_train_df)
X_test_proc  = preprocessor.transform(X_test_df)

# 7. Convert to dense numpy arrays
if not isinstance(X_train_proc, np.ndarray):
    X_train_np = X_train_proc.toarray()
else:
    X_train_np = X_train_proc
if not isinstance(X_test_proc, np.ndarray):
    X_test_np = X_test_proc.toarray()
else:
    X_test_np = X_test_proc
y_train_np = y_train.to_numpy()
y_test_np  = y_test.to_numpy()

# 8. Build race_group marker (1 for Black, 0 otherwise)
race_group = (X_train_df["race"] == "Black").astype(int).to_numpy()
race_group_test = (X_test_df["race"] == "Black").astype(int).to_numpy()

# 9. Define Keras model
inputs = tf.keras.Input(shape=(X_train_np.shape[1],))
x      = tf.keras.layers.Dense(32, activation="relu")(inputs)
output = tf.keras.layers.Dense(1, activation="sigmoid")(x)
model  = tf.keras.Model(inputs, output)

# 10. Fairness-aware loss with proper casting
def fairness_aware_loss(y_true, y_pred, group):
    # y_true: (batch,), cast to float32
    y_true_f = tf.cast(y_true, tf.float32)
    # y_pred: (batch,1), squeeze to (batch,)
    y_pred_s = tf.squeeze(y_pred, axis=-1)
    
    # base BCE
    bce = tf.keras.losses.binary_crossentropy(y_true_f, y_pred_s)
    
    # demographic parity penalty
    g0 = tf.boolean_mask(y_pred_s, tf.equal(group, 0))
    g1 = tf.boolean_mask(y_pred_s, tf.equal(group, 1))
    dp = tf.abs(tf.reduce_mean(g0) - tf.reduce_mean(g1))
    
    # equal opportunity penalty
    y0_true = tf.boolean_mask(y_true_f, tf.equal(group, 0))
    y1_true = tf.boolean_mask(y_true_f, tf.equal(group, 1))
    y0_pred = tf.boolean_mask(y_pred_s, tf.equal(group, 0))
    y1_pred = tf.boolean_mask(y_pred_s, tf.equal(group, 1))
    
    tpr0 = tf.reduce_sum(tf.cast(y0_pred > 0.5, tf.float32) * y0_true) / (tf.reduce_sum(y0_true) + 1e-6)
    tpr1 = tf.reduce_sum(tf.cast(y1_pred > 0.5, tf.float32) * y1_true) / (tf.reduce_sum(y1_true) + 1e-6)
    eo   = tf.abs(tpr0 - tpr1)
    
    return bce + 0.1 * dp + 0.1 * eo

# 11. Build tf.data.Dataset
batch_size = 128
dataset = tf.data.Dataset.from_tensor_slices((X_train_np, y_train_np, race_group))
dataset = dataset.shuffle(5000, seed=42).batch(batch_size)

optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

@tf.function
def train_step(xb, yb, gb):
    with tf.GradientTape() as tape:
        preds = model(xb, training=True)
        loss  = fairness_aware_loss(yb, preds, gb)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return tf.reduce_mean(loss)

# 12. Training loop
for epoch in range(5):
    total_loss = 0.0
    steps = 0
    for xb, yb, gb in dataset:
        total_loss += train_step(xb, yb, gb)
        steps += 1
    print(f"Epoch {epoch+1} average loss: {total_loss/steps:.4f}")

# 13. Test evaluation
y_proba_test = model.predict(X_test_np)
test_loss    = tf.reduce_mean(fairness_aware_loss(y_test_np, y_proba_test, race_group_test))
print("Test fairness-aware loss:", test_loss.numpy())


2025-07-28 11:46:18.057210: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1 average loss: nan


2025-07-28 11:46:19.341149: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 2 average loss: nan


2025-07-28 11:46:20.618375: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 3 average loss: nan


2025-07-28 11:46:21.937839: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 4 average loss: nan


2025-07-28 11:46:23.157321: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 5 average loss: nan
[1m637/637[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 722us/step
Test fairness-aware loss: nan


The training procedure produced “nan” for the average loss in every epoch and likewise returned a non-number for the test fairness-aware loss. At the same time, TensorFlow emitted “OUT_OF_RANGE: End of sequence” warnings as it exhausted the dataset iterator. Taken together, these results indicate that the custom loss function became numerically unstable most likely because one of the demographic groups had no positive or negative examples in a batch, causing divisions by zero or empty tensors in the demographic parity or equal opportunity terms. As a result, the model was unable to learn meaningful gradients and all loss values collapsed to “nan.”

## Prompt 2

In [16]:
#llm code
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
print("Loading diabetes data...")
df = pd.read_csv('../diabetesdatasets/diabetic_data.csv')

def calculate_fairness_metrics(df, group_col, outcome_col='readmitted'):
    """Calculate various fairness metrics for each group."""
    
    # Convert readmission to binary (any readmission vs no readmission)
    df['outcome_binary'] = (df[outcome_col] != 'NO').astype(int)
    
    metrics = {}
    groups = df[group_col].unique()
    
    # Calculate overall readmission rate (base rate)
    overall_rate = df['outcome_binary'].mean()
    
    for group in groups:
        group_data = df[df[group_col] == group]
        if len(group_data) == 0:
            continue
            
        # Basic statistics
        group_size = len(group_data)
        group_rate = group_data['outcome_binary'].mean()
        
        # Calculate disparity metrics
        statistical_parity_diff = group_rate - overall_rate
        disparate_impact = group_rate / overall_rate if overall_rate > 0 else 0
        
        # Calculate confusion matrix based metrics
        # For this analysis, we'll use the actual readmission as ground truth
        # and compare with a "predicted" readmission based on risk factors
        
        # Create a simple risk score based on available features
        risk_factors = ['time_in_hospital', 'num_lab_procedures', 'num_medications']
        group_data['risk_score'] = group_data[risk_factors].mean(axis=1)
        predicted_high_risk = (group_data['risk_score'] > group_data['risk_score'].median()).astype(int)
        
        tn, fp, fn, tp = confusion_matrix(
            group_data['outcome_binary'],
            predicted_high_risk
        ).ravel()
        
        # Calculate fairness metrics
        fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
        fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
        
        metrics[group] = {
            'group_size': group_size,
            'representation': group_size / len(df),
            'readmission_rate': group_rate,
            'statistical_parity_diff': statistical_parity_diff,
            'disparate_impact': disparate_impact,
            'false_positive_rate': fpr,
            'false_negative_rate': fnr,
            'true_positive_rate': tpr,
            'positive_predictive_value': ppv
        }
    
    return metrics

def analyze_treatment_equity(df, group_col):
    """Analyze treatment patterns across groups."""
    treatment_cols = ['metformin', 'insulin', 'diabetesMed']
    treatment_metrics = {}
    
    for treatment in treatment_cols:
        # Calculate treatment rates by group
        treatment_rates = df.groupby(group_col)[treatment].apply(
            lambda x: (x == 'Yes').mean()
        )
        treatment_metrics[treatment] = treatment_rates
    
    return treatment_metrics

def plot_fairness_metrics(metrics, title):
    """Visualize key fairness metrics across groups."""
    groups = list(metrics.keys())
    
    # Select key metrics to plot
    key_metrics = ['readmission_rate', 'false_positive_rate', 'false_negative_rate']
    
    fig, axes = plt.subplots(1, len(key_metrics), figsize=(15, 5))
    
    for idx, metric in enumerate(key_metrics):
        values = [metrics[group][metric] for group in groups]
        axes[idx].bar(groups, values)
        axes[idx].set_title(f'{metric.replace("_", " ").title()}')
        axes[idx].tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig(f'diabetes_fairness_{title}.png')
    plt.close()

# Calculate fairness metrics by race and gender
print("\nCalculating fairness metrics by race...")
race_metrics = calculate_fairness_metrics(df, 'race')
print("\nCalculating fairness metrics by gender...")
gender_metrics = calculate_fairness_metrics(df, 'gender')

# Analyze treatment equity
print("\nAnalyzing treatment equity...")
race_treatment_metrics = analyze_treatment_equity(df, 'race')
gender_treatment_metrics = analyze_treatment_equity(df, 'gender')

# Print detailed results
print("\nFairness Metrics by Race:")
for race, metrics in race_metrics.items():
    print(f"\n{race}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.3f}")

print("\nFairness Metrics by Gender:")
for gender, metrics in gender_metrics.items():
    print(f"\n{gender}:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.3f}")

print("\nTreatment Equity Analysis:")
for treatment, rates in race_treatment_metrics.items():
    print(f"\n{treatment} rates by race:")
    print(rates)

# Generate visualizations
plot_fairness_metrics(race_metrics, 'race')
plot_fairness_metrics(gender_metrics, 'gender')

# Evaluate fairness concerns
print("\nFairness Concerns Assessment:")

# Define thresholds for concerning disparities
DISPARITY_THRESHOLD = 0.1  # 10% difference
IMPACT_RATIO_THRESHOLD = 0.8  # 80% rule

# Check for concerning disparities
def assess_disparities(metrics, metric_name, threshold):
    values = [m[metric_name] for m in metrics.values()]
    max_val = max(values)
    min_val = min(values)
    disparity = max_val - min_val
    return disparity > threshold, disparity

# Assess various types of disparities
for metrics_dict, group_type in [(race_metrics, 'Race'), (gender_metrics, 'Gender')]:
    print(f"\n{group_type} Disparity Analysis:")
    
    # Statistical Parity
    concerning, disparity = assess_disparities(metrics_dict, 'readmission_rate', DISPARITY_THRESHOLD)
    if concerning:
        print(f"- Concerning disparity in readmission rates: {disparity:.3f}")
    
    # False Positive Rate
    concerning, disparity = assess_disparities(metrics_dict, 'false_positive_rate', DISPARITY_THRESHOLD)
    if concerning:
        print(f"- Concerning disparity in false positive rates: {disparity:.3f}")
    
    # False Negative Rate
    concerning, disparity = assess_disparities(metrics_dict, 'false_negative_rate', DISPARITY_THRESHOLD)
    if concerning:
        print(f"- Concerning disparity in false negative rates: {disparity:.3f}")

print("\nRecommendations:")
print("1. Data Collection and Representation:")
for group, metrics in race_metrics.items():
    if metrics['representation'] < 0.1:  # Less than 10% representation
        print(f"- Increase data collection for {group} patients")

print("\n2. Model Development:")
print("- Implement fairness constraints in the model")
print("- Use stratified sampling for train/test splits")
print("- Consider separate models or calibration for different demographic groups")

print("\n3. Clinical Practice:")
print("- Review treatment decision protocols for potential bias")
print("- Implement regular fairness audits")
print("- Document and monitor disparities over time") 

Loading diabetes data...

Calculating fairness metrics by race...

Calculating fairness metrics by gender...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_data['risk_score'] = group_data[risk_factors].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_data['risk_score'] = group_data[risk_factors].mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_data['risk_score'] = group_data[risk_factors].mean(axis=1)
A value is


Analyzing treatment equity...

Fairness Metrics by Race:

Caucasian:
  group_size: 76099.000
  representation: 0.748
  readmission_rate: 0.469
  statistical_parity_diff: 0.008
  disparate_impact: 1.018
  false_positive_rate: 0.461
  false_negative_rate: 0.486
  true_positive_rate: 0.514
  positive_predictive_value: 0.497

AfricanAmerican:
  group_size: 19210.000
  representation: 0.189
  readmission_rate: 0.458
  statistical_parity_diff: -0.003
  disparate_impact: 0.993
  false_positive_rate: 0.459
  false_negative_rate: 0.490
  true_positive_rate: 0.510
  positive_predictive_value: 0.484

?:
  group_size: 2273.000
  representation: 0.022
  readmission_rate: 0.319
  statistical_parity_diff: -0.141
  disparate_impact: 0.693
  false_positive_rate: 0.460
  false_negative_rate: 0.435
  true_positive_rate: 0.565
  positive_predictive_value: 0.366

Other:
  group_size: 1506.000
  representation: 0.015
  readmission_rate: 0.392
  statistical_parity_diff: -0.068
  disparate_impact: 0.851
  fa

### output
Calculated “fairness” metrics—group size, representation share, readmission rate, statistical‐parity difference, disparate‐impact ratio, false‐positive/negative rates, true‐positive rate and positive‐predictive value for each race and gender. A simple “risk score” (the mean of three numeric features) was thresholded at its median to produce predicted labels, which fed the confusion‐matrix–based rates. The results show that Caucasians and African Americans have readmission rates around 46–47% while smaller groups (e.g. “?” and “Other”) fall below, yielding a 15% readmission disparity by race and an even larger 47% gap by gender. Treatment equity checks revealed zero usage for metformin and insulin (likely a coding or data‐entry issue) but 74–82% use of “diabetesMed” across groups. Finally, the code flagged both race and gender readmission gaps as “concerning” and offered data-collection, modeling, and clinical recommendations. A recurring SettingWithCopyWarning arose when computing group‐level risk scores on slices of the DataFrame, indicating you should use .loc to avoid unintentionally modifying views rather than copies.

# LLM provided code snippets

-Training models with custom loss functions or sample weights.
-Building fairness-aware cross-validation loops or SMOTE+CV pipelines.
-Calibrating model outputs differently by group at inference.
-Monitoring fairness metrics on live predictions or treatment decisions.

They’re out of scope because my thesis focuses on identifying and correcting bias at the data and code‐analysis stages, not on building, training, or monitoring deployed predictive models.

In [19]:
#llm code 1
#data collection and preprocessing
def implement_balanced_sampling(df):
    # Calculate sampling weights
    weights = {
        'Asian': 10.0,  # Increase Asian representation
        'Hispanic': 5.0,  # Increase Hispanic representation
        'Other': 5.0,    # Increase Other representation
        'Caucasian': 1.0 # Reference group
    }
    
    # Apply weights in sampling
    df['sample_weight'] = df['race'].map(weights)
    return df

In [9]:
#llm code 2
#model development
def create_fair_model(X, y, sensitive_features):
    # Implement fairness constraints
    fairness_constraints = {
        'statistical_parity_difference': 0.05,  # Max 5% difference
        'equal_opportunity_difference': 0.05
    }
    
    # Create balanced folds for cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    cv.split(X, y, sensitive_features)
    
    return model


In [10]:
#llm code 3
#monitoring and validation
def monitor_fairness_metrics(predictions, actual, sensitive_features):
    metrics = {
        'statistical_parity': calculate_statistical_parity(predictions, sensitive_features),
        'equal_opportunity': calculate_equal_opportunity(predictions, actual, sensitive_features),
        'disparate_impact': calculate_disparate_impact(predictions, sensitive_features)
    }
    
    # Set alert thresholds
    ALERT_THRESHOLDS = {
        'statistical_parity': 0.05,
        'equal_opportunity': 0.05,
        'disparate_impact': 0.8
    }
    
    return check_thresholds(metrics, ALERT_THRESHOLDS)


In [11]:
#llm code 4
#clinical implementation
def calibrate_risk_scores(scores, race):
    # Adjust thresholds by race to equalize false positive/negative rates
    thresholds = {
        'Asian': 0.45,  # Lower threshold due to underrepresentation
        'Hispanic': 0.47,
        'Caucasian': 0.50,
        'AfricanAmerican': 0.50
    }
    return scores > thresholds[race]


In [12]:
#llm code 5
#treatment equity
def monitor_treatment_equity(df):
    # Calculate treatment rates by demographic group
    treatment_rates = df.groupby(['race', 'gender'])['diabetesMed'].mean()
    
    # Check for significant disparities
    disparity_threshold = 0.05  # 5% difference
    return identify_treatment_disparities(treatment_rates, disparity_threshold)


In [None]:
#New code for all 5 code blocks:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from collections import Counter

# 1) Data collection and preprocessing
def implement_balanced_sampling(df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach a sample_weight column that up-weights underrepresented races.
    """
    weights = {
        'Asian':      10.0,
        'Hispanic':    5.0,
        'Other':       5.0,
        'Caucasian':   1.0,
        'AfricanAmerican': 2.0  # in case that category appears
    }
    # Default to 1.0 for any race not in the map
    df['sample_weight'] = df['race'].map(weights).fillna(1.0)
    return df

# 2) Model development
def create_fair_model(X: np.ndarray, y: np.ndarray, sensitive: np.ndarray):
    """
    Returns a StratifiedKFold object and a placeholder model.
    In practice you would wrap this CV with a fairness-aware learner.
    """
    # Fairness constraints dictionary (not used directly here)
    fairness_constraints = {
        'statistical_parity_difference': 0.05,
        'equal_opportunity_difference':  0.05
    }

    # Create balanced folds for cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Example of iterating through folds (no model training here)
    for train_idx, val_idx in cv.split(X, y, sensitive):
        # Normally train your model on X[train_idx], y[train_idx]
        # and validate on X[val_idx], y[val_idx]
        pass

    # Return the CV instance and a dummy placeholder for the model
    # Replace this with your actual fairness-aware estimator
    return cv, None

# 3) Monitoring and validation
# Stub implementations for the metric calculators
def calculate_statistical_parity(preds, sensitive):
    # difference in positive rates between groups 0 and 1
    rates = {}
    for g in np.unique(sensitive):
        rates[g] = np.mean(preds[sensitive == g])
    return abs(rates.get(0, 0) - rates.get(1, 0))

def calculate_equal_opportunity(preds, actual, sensitive):
    # difference in true-positive rates
    tprs = {}
    for g in np.unique(sensitive):
        mask = (sensitive == g)
        true_pos = np.sum((preds[mask] == 1) & (actual[mask] == 1))
        positives = np.sum(actual[mask] == 1)
        tprs[g] = true_pos / positives if positives > 0 else 0
    return abs(tprs.get(0, 0) - tprs.get(1, 0))

def calculate_disparate_impact(preds, sensitive):
    # ratio of positive rates
    rates = {}
    for g in np.unique(sensitive):
        rates[g] = np.mean(preds[sensitive == g])
    low, high = min(rates.values()), max(rates.values())
    return low / high if high > 0 else np.nan

def check_thresholds(metrics: dict, thresholds: dict) -> dict:
    alerts = {}
    for name, value in metrics.items():
        alerts[name] = value > thresholds.get(name, np.inf)
    return alerts

def monitor_fairness_metrics(predictions: np.ndarray,
                             actual:      np.ndarray,
                             sensitive:  np.ndarray) -> dict:
    """
    Compute common fairness metrics and check against alert thresholds.
    """
    metrics = {
        'statistical_parity': calculate_statistical_parity(predictions, sensitive),
        'equal_opportunity': calculate_equal_opportunity(predictions, actual, sensitive),
        'disparate_impact':  calculate_disparate_impact(predictions, sensitive)
    }
    ALERT_THRESHOLDS = {
        'statistical_parity':            0.05,
        'equal_opportunity':             0.05,
        'disparate_impact':              0.8
    }
    return check_thresholds(metrics, ALERT_THRESHOLDS)


# 4) Clinical implementation
def calibrate_risk_scores(scores: np.ndarray,
                          races:  np.ndarray) -> np.ndarray:
    """
    Apply group-specific thresholds to continuous risk scores,
    returning a boolean array of who exceeds their threshold.
    """
    thresholds = {
        'Asian':           0.45,
        'Hispanic':        0.47,
        'Caucasian':       0.50,
        'AfricanAmerican': 0.50,
        'Other':           0.48
    }
    # Vectorized comparison
    output = np.zeros_like(scores, dtype=bool)
    for grp, thr in thresholds.items():
        mask = (races == grp)
        output[mask] = (scores[mask] > thr)
    return output

# 5) Treatment equity
def identify_treatment_disparities(treatment_rates: pd.Series,
                                  threshold:       float) -> pd.DataFrame:
    """
    Return a DataFrame of any subgroups whose treatment rate
    differs by more than `threshold` from the overall mean.
    """
    overall = treatment_rates.mean()
    diff     = (treatment_rates - overall).abs()
    return diff[diff > threshold].to_frame(name='difference')

def monitor_treatment_equity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute diabetesMed usage rates by race×gender and flag
    any subgroup that deviates by more than 5 percentage points.
    """
    # Convert to 0/1
    df['diabetesMed_flag'] = (df['diabetesMed'] == 'Yes').astype(int)
    treatment_rates = df.groupby(['race', 'gender'])['diabetesMed_flag'].mean()
    return identify_treatment_disparities(treatment_rates, threshold=0.05)


# Example usage
if __name__ == "__main__":
    # Load data
    df = pd.read_csv("../diabetesdatasets/diabetic_data.csv")
    
    # Create binary outcome column 
    df["readmitted_binary"] = df["readmitted"].map({"NO": 0, ">30": 1, "<30": 1})

    # 1) Preprocessing weights
    df = implement_balanced_sampling(df)
    print("Sample weights attached:", df["sample_weight"].head(), "\n")

    # 2) Model development 
    X = df.drop(columns=["sample_weight", "readmitted", "readmitted_binary"])
    y = df["readmitted_binary"]
    sensitive = (df["race"] == "Black").astype(int).values
    cv, dummy_model = create_fair_model(X.values, y.values, sensitive)
    print("Created StratifiedKFold:", cv, "\n")

    # 3) Monitoring example
    preds = np.random.randint(0, 2, size=len(df))
    alerts = monitor_fairness_metrics(preds, y.values, sensitive)
    print("Fairness alerts:", alerts, "\n")

    # 4) Calibration example
    risk_scores = np.random.rand(len(df))
    decisions   = calibrate_risk_scores(risk_scores, df["race"].values)
    print("Calibrated decisions sample:", decisions[:5], "\n")

    # 5) Treatment equity
    disparities = monitor_treatment_equity(df)
    print("Treatment disparities:\n", disparities)


## Output
This code implements a modular pipeline for assessing fairness and equity in clinical machine learning workflows using the diabetic patient dataset. It begins by assigning sampling weights to underrepresented racial groups in order to promote balanced data representation. A binary readmission label is derived and a stratified cross-validation object is created, preserving demographic balance across folds. Random predictions are then evaluated using fairness metrics, including statistical parity, equal opportunity, and disparate impact. All three metrics exceed predefined alert thresholds, indicating potential bias. Risk scores are calibrated using group-specific thresholds to simulate differential clinical decision rules. Finally, treatment equity is assessed by examining diabetes medication prescription rates across race and gender subgroups. The output reveals disparities in treatment for certain male and unknown-gender patients, especially those with missing or ambiguous race categories, suggesting areas where care delivery may be uneven. 

### Prompt 3

In [26]:
#llm code
import pandas as pd
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

def load_and_preprocess_data():
    # Load the diabetes dataset
    df = pd.read_csv('../diabetesdatasets/diabetic_data.csv')
    
    # Convert readmitted to binary (30 days vs. >30/NO)
    df['readmitted_binary'] = (df['readmitted'] == '<30').astype(int)
    
    # Handle missing values
    df['race'] = df['race'].replace('?', 'Unknown')
    df['gender'] = df['gender'].replace('Unknown/Invalid', 'Unknown')
    
    return df

def analyze_imbalance(df):
    """Analyze demographic imbalances in the dataset"""
    imbalance_metrics = {}
    
    # Analyze race distribution
    race_dist = df['race'].value_counts()
    gender_dist = df['gender'].value_counts()
    
    # Calculate imbalance ratios
    race_imbalance = race_dist.max() / race_dist.min()
    gender_imbalance = gender_dist.max() / gender_dist.min()
    
    # Calculate readmission rates by demographic groups
    race_readmission = df.groupby('race')['readmitted_binary'].mean()
    gender_readmission = df.groupby('gender')['readmitted_binary'].mean()
    
    return {
        'race_distribution': race_dist,
        'gender_distribution': gender_dist,
        'race_imbalance_ratio': race_imbalance,
        'gender_imbalance_ratio': gender_imbalance,
        'race_readmission_rates': race_readmission,
        'gender_readmission_rates': gender_readmission
    }

def apply_rebalancing_techniques(df):
    """Apply different rebalancing techniques and evaluate results"""
    # Prepare features and target
    features = ['race', 'gender', 'age', 'time_in_hospital', 'num_lab_procedures', 
                'num_procedures', 'num_medications', 'number_diagnoses']
    
    # Encode categorical variables
    le_dict = {}
    X = df[features].copy()
    for col in ['race', 'gender', 'age']:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col])
        le_dict[col] = le
    
    y = df['readmitted_binary']
    
    # Apply different rebalancing techniques
    techniques = {
        'SMOTE': SMOTE(random_state=42),
        'Random Undersampling': RandomUnderSampler(random_state=42),
        'SMOTEENN': SMOTEENN(random_state=42)
    }
    
    results = {}
    for name, technique in techniques.items():
        X_resampled, y_resampled = technique.fit_resample(X, y)
        
        # Calculate class distribution after rebalancing
        class_dist = pd.Series(y_resampled).value_counts(normalize=True)
        
        # Calculate demographic distribution after rebalancing
        race_dist = pd.Series(le_dict['race'].inverse_transform(X_resampled['race'])).value_counts(normalize=True)
        gender_dist = pd.Series(le_dict['gender'].inverse_transform(X_resampled['gender'])).value_counts(normalize=True)
        
        results[name] = {
            'class_distribution': class_dist,
            'race_distribution': race_dist,
            'gender_distribution': gender_dist,
            'sample_size': len(y_resampled)
        }
    
    return results

def plot_distributions(original_metrics, rebalancing_results):
    """Create visualizations of the distributions before and after rebalancing"""
    # Plot original distributions
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    
    # Original race distribution
    sns.barplot(x=original_metrics['race_distribution'].index, 
                y=original_metrics['race_distribution'].values,
                ax=axes[0,0])
    axes[0,0].set_title('Original Race Distribution')
    axes[0,0].tick_params(axis='x', rotation=45)
    
    # Original gender distribution
    sns.barplot(x=original_metrics['gender_distribution'].index,
                y=original_metrics['gender_distribution'].values,
                ax=axes[0,1])
    axes[0,1].set_title('Original Gender Distribution')
    
    # Plot rebalancing results
    technique_names = list(rebalancing_results.keys())
    
    # Compare race distributions after rebalancing
    race_data = pd.DataFrame({name: results['race_distribution'] 
                             for name, results in rebalancing_results.items()})
    race_data.plot(kind='bar', ax=axes[1,0])
    axes[1,0].set_title('Race Distribution After Rebalancing')
    axes[1,0].tick_params(axis='x', rotation=45)
    
    # Compare gender distributions after rebalancing
    gender_data = pd.DataFrame({name: results['gender_distribution']
                               for name, results in rebalancing_results.items()})
    gender_data.plot(kind='bar', ax=axes[1,1])
    axes[1,1].set_title('Gender Distribution After Rebalancing')
    
    plt.tight_layout()
    plt.savefig('rebalancing_analysis.png')
    plt.close()

def main():
    # Load and preprocess data
    print("Loading and preprocessing data...")
    df = load_and_preprocess_data()
    
    # Analyze imbalances
    print("\nAnalyzing demographic imbalances...")
    imbalance_metrics = analyze_imbalance(df)
    
    # Print initial analysis
    print("\nInitial Imbalance Analysis:")
    print(f"Race Imbalance Ratio: {imbalance_metrics['race_imbalance_ratio']:.2f}")
    print(f"Gender Imbalance Ratio: {imbalance_metrics['gender_imbalance_ratio']:.2f}")
    print("\nRace Distribution:")
    print(imbalance_metrics['race_distribution'])
    print("\nGender Distribution:")
    print(imbalance_metrics['gender_distribution'])
    
    # Apply rebalancing techniques
    print("\nApplying rebalancing techniques...")
    rebalancing_results = apply_rebalancing_techniques(df)
    
    # Plot results
    print("\nGenerating visualizations...")
    plot_distributions(imbalance_metrics, rebalancing_results)
    
    # Print rebalancing results
    print("\nRebalancing Results:")
    for technique, results in rebalancing_results.items():
        print(f"\n{technique}:")
        print(f"Sample size: {results['sample_size']}")
        print("Class distribution:")
        print(results['class_distribution'])

if __name__ == "__main__":
    main() 

Loading and preprocessing data...

Analyzing demographic imbalances...

Initial Imbalance Analysis:
Race Imbalance Ratio: 118.72
Gender Imbalance Ratio: 18236.00

Race Distribution:
race
Caucasian          76099
AfricanAmerican    19210
Unknown             2273
Hispanic            2037
Other               1506
Asian                641
Name: count, dtype: int64

Gender Distribution:
gender
Female     54708
Male       47055
Unknown        3
Name: count, dtype: int64

Applying rebalancing techniques...

Generating visualizations...

Rebalancing Results:

SMOTE:
Sample size: 180818
Class distribution:
readmitted_binary
0    0.5
1    0.5
Name: proportion, dtype: float64

Random Undersampling:
Sample size: 22714
Class distribution:
readmitted_binary
0    0.5
1    0.5
Name: proportion, dtype: float64

SMOTEENN:
Sample size: 116594
Class distribution:
readmitted_binary
1    0.601566
0    0.398434
Name: proportion, dtype: float64


### output
cleaning the diabetic readmission data, binarizing the “readmitted” field, and collapsing rare or missing race/gender labels into “Unknown.” It then measures how skewed the dataset is across race (a 118× gap between the largest and smallest groups) and gender (a 18 236× gap, driven by just three “Unknown” entries). Next, it applies three rebalancing strategies—SMOTE oversampling, random undersampling, and the hybrid SMOTE+ENN—and reports how each changes the positive/negative class mix and overall sample size. SMOTE expands the dataset to 180 818 examples with a perfect 50/50 split, undersampling shrinks it to 22 714 also at 50/50, and SMOTEENN yields 116 594 samples with about 60% positives and 40% negatives (removing noisy majority points). These outputs show how each technique can be used to trade off dataset size against class balance—SMOTE for maximum data, undersampling for simplicity, and SMOTEENN for a middle‐ground that also cleans borderline samples.

In [27]:
#3. **Recommendations for Implementation**:

#1. **Primary Approach**: Use SMOTE with these specific considerations:
#   ```python
smote = SMOTE(
    random_state=42,
    sampling_strategy='auto',
    k_neighbors=5
   )


In [28]:
#new code, some reused from previous code provided by LLM
from imblearn.over_sampling import SMOTE

# Load and preprocess data
df = load_and_preprocess_data()

# Select features and encode categorical variables
features = ['race', 'gender', 'age', 'time_in_hospital', 'num_lab_procedures', 
            'num_procedures', 'num_medications', 'number_diagnoses']
le_dict = {}
X = df[features].copy()
for col in ['race', 'gender', 'age']:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    le_dict[col] = le

y = df['readmitted_binary']

# Split into train and test sets BEFORE applying SMOTE (prevents data leakage)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Create and apply SMOTE with the specified parameters
# This is the recommended usage based on LLM's code snippet:
smote = SMOTE(random_state=42, sampling_strategy='auto', k_neighbors=5)

# Fit SMOTE only on the training set!
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Optional: Check the new class distribution
from collections import Counter
print("After SMOTE, class distribution:", Counter(y_train_smote))

# Continue with model training using the rebalanced training data (X_train_smote, y_train_smote)
# For example:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)

# Step 6: Evaluate on the (untouched) test set
y_pred = rf.predict(X_test)


After SMOTE, class distribution: Counter({0: 63286, 1: 63286})


# Output
Even class distribution after SMOTE