# load packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col, count, lit, substring

In [4]:
# create a spark session
spark = SparkSession.builder.appName("HospitalReadmissionEDA").getOrCreate()

# load data

In [5]:
# Load the Data into a Spark DataFrame
data_path = '../data/diabetic_data.csv'
df_spark = spark.read.csv(data_path, header=True)

# Initial Inspection with PySpark
print(f"Number of rows: {df_spark.count()}")
print(f"Number of columns: {len(df_spark.columns)}")

# Show the first 5 rows
df_spark.show(5)

Number of rows: 101766
Number of columns: 50
+------------+-----------+---------------+------+-------+------+-----------------+------------------------+-------------------+----------------+----------+--------------------+------------------+--------------+---------------+-----------------+----------------+----------------+------+------+------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+-----------+----------+
|encounter_id|patient_nbr|           race|gender|    age|weight|admission_type_id|discharge_disposition_id|admission_source_id|time_in_hospital|payer_code|   medical_specialty|num_lab_procedures|num_procedures|num_medications|number_outpatient|number_emergency|number_

In [6]:
df_spark.printSchema()

root
 |-- encounter_id: string (nullable = true)
 |-- patient_nbr: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- admission_type_id: string (nullable = true)
 |-- discharge_disposition_id: string (nullable = true)
 |-- admission_source_id: string (nullable = true)
 |-- time_in_hospital: string (nullable = true)
 |-- payer_code: string (nullable = true)
 |-- medical_specialty: string (nullable = true)
 |-- num_lab_procedures: string (nullable = true)
 |-- num_procedures: string (nullable = true)
 |-- num_medications: string (nullable = true)
 |-- number_outpatient: string (nullable = true)
 |-- number_emergency: string (nullable = true)
 |-- number_inpatient: string (nullable = true)
 |-- diag_1: string (nullable = true)
 |-- diag_2: string (nullable = true)
 |-- diag_3: string (nullable = true)
 |-- number_diagnoses: string (nullable = true)
 |-- max_glu_ser

# clean data

In [None]:
# create a list of all column names
all_columns = df_spark.columns

In [None]:
# loop through each column to replace? with null
df_cleaned = df_spark 
for column in all_columns:
    df_cleaned = df_cleaned.withColumn(column, 
                                      when(col(column) == "?", None).otherwise(col(column))
                                      )
print("Replaced '?' with null values.")

Replaced '?' with null values.


In [None]:
df_cleaned.show()

+------------+-----------+---------------+------+--------+------+-----------------+------------------------+-------------------+----------------+----------+--------------------+------------------+--------------+---------------+-----------------+----------------+----------------+------+------+------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+-----------+----------+
|encounter_id|patient_nbr|           race|gender|     age|weight|admission_type_id|discharge_disposition_id|admission_source_id|time_in_hospital|payer_code|   medical_specialty|num_lab_procedures|num_procedures|num_medications|number_outpatient|number_emergency|number_inpatient|diag_1|diag_2|diag_3|number_diagn

In [None]:
# get total number of rows
total_rows = df_cleaned.count()
total_rows

101766

In [None]:
# calculate missing values count and percentage for each column
missing = df_cleaned.select([(count(when(col(c).isNull(), c))/total_rows*100).alias(c) for c in all_columns])
missing.toPandas()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,0.0,0.0,2.233555,0.0,0.0,96.858479,0.0,0.0,0.0,0.0,39.557416,49.082208,0.0,0.0,0.0,0.0,0.0,0.0,0.020636,0.351787,1.398306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# List of columns to drop
cols_to_drop = [
    'weight',              # 96.9% missing
    'payer_code',          # 39.6% missing
    'medical_specialty',   # 49.1% missing
    'encounter_id',        # Identifier
    'patient_nbr',         # Identifier
]

# Drop the columns
df_reduced = df_cleaned.drop(*cols_to_drop)

# Verify the change
print(f"Original number of columns: {len(df_cleaned.columns)}")
print(f"Number of columns after dropping: {len(df_reduced.columns)}")

Original number of columns: 50
Number of columns after dropping: 45


# check target variable

In [None]:
df_reduced.select('readmitted').distinct().show()

+----------+
|readmitted|
+----------+
|       >30|
|        NO|
|       <30|
+----------+



In [None]:
# We map three strings to three numbers.
df_reduced_with_target = df_reduced.withColumn('label',
    when(col('readmitted') == 'NO', 0.0)
    .when(col('readmitted') == '<30', 1.0)
    .when(col('readmitted') == '>30', 1.0)
    .otherwise(None) # Should not happen, but good practice
).na.drop(subset=["label"]).drop('readmitted')

df_reduced_with_target = df_reduced_with_target.withColumn("label", col("label").cast("double"))

# categorize diag_1

In [None]:
df_reduced_with_target.groupby(col('diag_1')).count().show(),
df_reduced_with_target.groupby(col('diag_2')).count().show(),
df_reduced_with_target.groupby(col('diag_3')).count().show()

+------+-----+
|diag_1|count|
+------+-----+
|   296|  896|
|   451|   40|
|   853|   18|
|   800|    6|
|250.01|   61|
|   447|   63|
|   591|   19|
|     7|    2|
|   574|  965|
|   475|   14|
|   718|   17|
|   307|   25|
|   577| 1057|
|   581|   19|
|   205|   27|
|   747|    7|
|   334|    2|
|   462|   11|
|   711|   66|
|   647|    5|
+------+-----+
only showing top 20 rows
+------+-----+
|diag_2|count|
+------+-----+
|   296|  165|
|   691|    3|
|   451|   32|
|   V72|   13|
|   919|    3|
|250.01| 1523|
|   447|   26|
|   591|  233|
|   574|  346|
|   718|   12|
|   577|  401|
|   581|   80|
|   205|   58|
|  E858|    4|
|   462|   16|
|   272|  420|
|   711|   43|
|   470|    3|
|   647|    5|
|  E870|    3|
+------+-----+
only showing top 20 rows
+------+-----+
|diag_3|count|
+------+-----+
|   296|  214|
|   451|    9|
|   919|    3|
|250.01|  915|
|   591|  140|
|   447|   39|
|   574|  136|
|   307|   16|
|   718|    9|
|   581|   97|
|   577|  149|
|   334|    5|
|  E8

In [None]:
diag_cols = ['diag_1', 'diag_2', 'diag_3']
df_with_groups = df_reduced_with_target

print("Creating granular diagnosis group features...")

# Loop through each diagnosis column to apply the same logic
for diag_col in diag_cols:
    group_col_name = f"{diag_col}_group"
    df_with_groups = df_with_groups.withColumn(
        group_col_name,
        when(col(diag_col).rlike("^V"), "V_Code")   # Use rlike for "starts with"
        .when(col(diag_col).rlike("^E"), "E_Code")   # Same for E codes
        .when(col(diag_col).isNotNull(), substring(col(diag_col), 1, 3)) # For all other non-null codes
        .otherwise(None) # Keep nulls as null for now
    )

# Now, handle the nulls in our new group columns by filling them with 'UNK'
fill_dict = {f"{c}_group": "UNK" for c in diag_cols}
df_engineered = df_with_groups.fillna(fill_dict)

# Finally, drop the original, noisy diagnosis columns
df_engineered = df_engineered.drop(*diag_cols)

print("New diagnosis group features created:")
df_engineered.select('diag_1_group', 'diag_2_group', 'diag_3_group').show(20)

Creating granular diagnosis group features...
New diagnosis group features created:
+------------+------------+------------+
|diag_1_group|diag_2_group|diag_3_group|
+------------+------------+------------+
|         250|         UNK|         UNK|
|         276|         250|         255|
|         648|         250|      V_Code|
|           8|         250|         403|
|         197|         157|         250|
|         414|         411|         250|
|         414|         411|      V_Code|
|         428|         492|         250|
|         398|         427|          38|
|         434|         198|         486|
|         250|         403|         996|
|         157|         288|         197|
|         428|         250|         250|
|         428|         411|         427|
|         518|         998|         627|
|         999|         507|         996|
|         410|         411|         414|
|         682|         174|         250|
|         402|         425|         416|
|         737|

In [None]:
df_engineered.show()

+---------------+------+--------+-----------------+------------------------+-------------------+----------------+------------------+--------------+---------------+-----------------+----------------+----------------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+-----------+-----+------------+------------+------------+
|           race|gender|     age|admission_type_id|discharge_disposition_id|admission_source_id|time_in_hospital|num_lab_procedures|num_procedures|num_medications|number_outpatient|number_emergency|number_inpatient|number_diagnoses|max_glu_serum|A1Cresult|metformin|repaglinide|nateglinide|chlorpropamide|glimepiride|acetohexamide|glipizide|glyburide|tolbutamide|

In [None]:
missing = df_engineered.select([(count(when(col(c).isNull(), c))/total_rows*100).alias(c) for c in df_engineered.columns])
missing.toPandas()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,label,diag_1_group,diag_2_group,diag_3_group
0,2.233555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# handle missing values

In [None]:
# Find the most frequent value (the mode) for the 'race' column
mode_race = df_engineered.groupBy('race').count().orderBy('count', ascending=False).first()[0]
print(f"The most frequent race is: '{mode_race}'. We will use this to fill missing values.")

# Fill the nulls with the mode
df_engineered = df_engineered.fillna({'race': mode_race})

The most frequent race is: 'Caucasian'. We will use this to fill missing values.


# more feature engineering

In [None]:
major_disease_groups = [
    '250', # Diabetes
    '428', # Heart Failure
    '414', # Ischemic Heart Disease
    '401', # Hypertension
    '585', # Chronic Kidney Disease
    '486', # Pneumonia
    '493', # Asthma/COPD
    '272'  # Disorders of lipid metabolism
]

# Create a new column, initializing the count to 0
df_engineered = df_engineered.withColumn('comorbidity_count', col('time_in_hospital') * 0) # A trick to create a column of 0s

# Loop through the list of major diseases and add 1 to the count if the patient has it
for group in major_disease_groups:
    df_engineered = df_engineered.withColumn(
        'comorbidity_count',
        col('comorbidity_count') + 
        when((col('diag_1_group') == group) | 
             (col('diag_2_group') == group) | 
             (col('diag_3_group') == group), 1).otherwise(0)
    )

In [None]:
numerical_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses'
]

df_engineered = df_engineered
for c in numerical_cols:
    df_engineered = df_engineered.withColumn(c, col(c).cast('integer'))

df_engineered = df_engineered.withColumn(
    'labs_per_day', col('num_lab_procedures') / col('time_in_hospital')
).withColumn(
    'meds_per_day', col('num_medications') / col('time_in_hospital')
).withColumn(
    'procs_per_day', col('num_procedures') / col('time_in_hospital')
)

In [None]:
# List of the 23 diabetes medication columns
med_cols = [
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'
]

# Let's use the df_with_ratios from the previous step
df_final = df_engineered.withColumn('med_change_score', col('time_in_hospital') * 0) # Init with 0s

# Loop through the med columns and add 1 if there was a change
for med in med_cols:
    df_final = df_final.withColumn(
        'med_change_score',
        col('med_change_score') +
        when((col(med) == 'Up') | (col(med) == 'Down'), 1).otherwise(0)
    )

df_final.printSchema()

root
 |-- race: string (nullable = false)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- admission_type_id: string (nullable = true)
 |-- discharge_disposition_id: string (nullable = true)
 |-- admission_source_id: string (nullable = true)
 |-- time_in_hospital: integer (nullable = true)
 |-- num_lab_procedures: integer (nullable = true)
 |-- num_procedures: integer (nullable = true)
 |-- num_medications: integer (nullable = true)
 |-- number_outpatient: integer (nullable = true)
 |-- number_emergency: integer (nullable = true)
 |-- number_inpatient: integer (nullable = true)
 |-- number_diagnoses: integer (nullable = true)
 |-- max_glu_serum: string (nullable = true)
 |-- A1Cresult: string (nullable = true)
 |-- metformin: string (nullable = true)
 |-- repaglinide: string (nullable = true)
 |-- nateglinide: string (nullable = true)
 |-- chlorpropamide: string (nullable = true)
 |-- glimepiride: string (nullable = true)
 |-- acetohexamide: string (nullabl

# establish pipline

In [None]:
# Separate columns into categorical and numerical lists
# The target variable 'readmitted_binary' is separate
categorical_cols = [
    'race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id',
    'admission_source_id', 'max_glu_serum', 'A1Cresult', 'metformin',
    'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
    'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
    'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
    'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone',
    'change', 'diabetesMed', 'diag_1_group', 'diag_2_group', 'diag_3_group'
]

numerical_cols = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures',
    'num_medications', 'number_outpatient', 'number_emergency',
    'number_inpatient', 'number_diagnoses', 'comorbidity_count', 
    'labs_per_day', 'meds_per_day', 'procs_per_day', 'med_change_score'
]

target_col = 'label'

print("\nFinal number of columns before modeling:", len(df_final.columns))
df_final.printSchema()


Final number of columns before modeling: 50
root
 |-- race: string (nullable = false)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- admission_type_id: string (nullable = true)
 |-- discharge_disposition_id: string (nullable = true)
 |-- admission_source_id: string (nullable = true)
 |-- time_in_hospital: integer (nullable = true)
 |-- num_lab_procedures: integer (nullable = true)
 |-- num_procedures: integer (nullable = true)
 |-- num_medications: integer (nullable = true)
 |-- number_outpatient: integer (nullable = true)
 |-- number_emergency: integer (nullable = true)
 |-- number_inpatient: integer (nullable = true)
 |-- number_diagnoses: integer (nullable = true)
 |-- max_glu_serum: string (nullable = true)
 |-- A1Cresult: string (nullable = true)
 |-- metformin: string (nullable = true)
 |-- repaglinide: string (nullable = true)
 |-- nateglinide: string (nullable = true)
 |-- chlorpropamide: string (nullable = true)
 |-- glimepiride: string (nullabl

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# --- STAGE 1 & 2: String Indexing and One-Hot Encoding for Categorical Columns ---
# We create a list to hold all the stages of our pipeline
stages = []

# Loop through each categorical column
for col_name in categorical_cols:
    # 1. StringIndexer, Rule: StringIndexer orders categories by descending frequency in the dataset.
    # That means:The most frequent label gets index 0. The next most frequent gets 1, and so on.

    string_indexer = StringIndexer(inputCol=col_name, outputCol=col_name + "_index", handleInvalid="keep")
    # 2. OneHotEncoder
    one_hot_encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()], outputCols=[col_name + "_vec"])
    # Add these two stages to our pipeline
    stages += [string_indexer, one_hot_encoder]

# --- STAGE 3: Vector Assembly ---
# Create a list of all feature columns to be assembled
# This includes the one-hot encoded vectors and the original numerical columns
assembler_inputs = [c + "_vec" for c in categorical_cols] + numerical_cols

# Create the VectorAssembler stage
vector_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

# Add the assembler to our stages
stages += [vector_assembler]

# --- Create and Run the Pipeline ---
# Define the pipeline with all the stages
preprocessing_pipeline = Pipeline(stages=stages)

# Fit the pipeline to our data to "learn" the transformations
pipeline_model = preprocessing_pipeline.fit(df_final)

# Transform the data to apply the transformations
df_model_ready = pipeline_model.transform(df_final)

print("Pipeline created and data transformed successfully.")

# --- Inspect the Result ---
# Let's look at the final DataFrame. You'll see the new columns,
# especially the 'features' vector and our target column.
df_model_ready.select(target_col, 'features').show(5, truncate=False)

Pipeline created and data transformed successfully.
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                                                                                                                                                                                                                        |
+-----+-----------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
(trainingData, testData) = df_model_ready.randomSplit([0.8, 0.2], seed=42)

print(f"Number of training samples: {trainingData.count()}")
print(f"Number of testing samples: {testData.count()}")

Number of training samples: 81565
Number of testing samples: 20201


# Test_1 logistic regression

In [None]:
from pyspark.ml.classification import LogisticRegression

# Instantiate the Logistic Regression model
# It will use our 'features' and 'label' columns
lr_1 = LogisticRegression(featuresCol='features', labelCol='label')

# Train the model by fitting it to the training data
print("Training the Logistic Regression model...")
lr_model_1 = lr_1.fit(trainingData)
print("Model training complete.")

Training the Logistic Regression model...
Model training complete.


In [None]:
# Make predictions on the test data
predictions = lr_model_1.transform(testData)

# Let's look at the predictions
# The 'prediction' column is what the model guessed.
print("Sample predictions:")
predictions.select('label', 'prediction', 'probability').show(10, truncate=False)

Sample predictions:
+-----+----------+----------------------------------------+
|label|prediction|probability                             |
+-----+----------+----------------------------------------+
|1.0  |0.0       |[0.9173741735148928,0.08262582648510719]|
|0.0  |0.0       |[0.6668425230420639,0.33315747695793607]|
|0.0  |0.0       |[0.7494673110869254,0.25053268891307456]|
|1.0  |0.0       |[0.6873527198893145,0.31264728011068554]|
|1.0  |1.0       |[0.34954529648602983,0.6504547035139702]|
|0.0  |0.0       |[0.7586286787548977,0.2413713212451023] |
|0.0  |0.0       |[0.6298967434161109,0.3701032565838891] |
|1.0  |0.0       |[0.7155634611430897,0.28443653885691034]|
|1.0  |0.0       |[0.6895710724536535,0.31042892754634654]|
|1.0  |0.0       |[0.5931447346649693,0.4068552653350307] |
+-----+----------+----------------------------------------+
only showing top 10 rows


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

# AUC
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)
print(f"Area Under ROC (AUC) = {auc:.4f}")

# Recall
evaluator_recall_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                              metricName="recallByLabel", metricLabel=1.0)
recall = evaluator_recall_positive.evaluate(predictions)
print(f"Recall (for Readmitted Class 1.0) = {recall:.2%}")

# Precision
evaluator_precision_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                                  metricName="precisionByLabel", metricLabel=1.0)
precision = evaluator_precision_positive.evaluate(predictions)
print(f"Precision (for Readmitted Class 1.0) = {precision:.2%}")

# F1 score
evaluator_f1_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                          metricName="f1", metricLabel=1.0)
f1_score = evaluator_f1_positive.evaluate(predictions)
print(f"F1 Score (for Readmitted Class 1.0) = {f1_score:.4f}")

# --- Detailed View: Confusion Matrix ---
# Shows how many times the model predicted each class vs. the actual class.
print("\nConfusion Matrix:")
# Convert to Pandas for a nice crosstab view
preds_and_labels = predictions.select(['prediction', 'label']).toPandas()
confusion_matrix = pd.crosstab(preds_and_labels['label'], preds_and_labels['prediction'])
print(confusion_matrix)

Area Under ROC (AUC) = 0.6893
Recall (for Readmitted Class 1.0) = 53.48%
Precision (for Readmitted Class 1.0) = 62.92%
F1 Score (for Readmitted Class 1.0) = 0.6365

Confusion Matrix:
prediction   0.0   1.0
label                 
0.0         7952  2935
1.0         4333  4981


# Test_2 logistic regression with weighted column

In [None]:
# Calculate the balance ratio
num_positives = df_model_ready.filter(col('label') == 1.0).count()
num_negatives = df_model_ready.filter(col('label') == 0.0).count()

# The weight for the majority class (0.0) should be smaller
weight_for_negatives = num_positives / (num_positives + num_negatives)

# The weight for the minority class (1.0) should be larger
weight_for_positives = num_negatives / (num_positives + num_negatives)

print(f"Weight for class 0: {weight_for_negatives:.2f}")
print(f"Weight for class 1: {weight_for_positives:.2f}")

# Add a weight column to the DataFrame
df_weighted = df_model_ready.withColumn('classWeight',
    when(col('label') == 0.0, weight_for_negatives)
    .otherwise(weight_for_positives)
)

# Split this NEW df_weighted DataFrame
(trainingData, testData) = df_weighted.randomSplit([0.8, 0.2], seed=42)

# Now, when you instantiate your model, tell it to use this column
# If you are using LogisticRegression:
lr_2 = LogisticRegression(featuresCol='features', labelCol='label', weightCol='classWeight')
lr_model_2 = lr_2.fit(trainingData)
predictions = lr_model_2.transform(testData)

Weight for class 0: 0.46
Weight for class 1: 0.54


In [None]:
# AUC
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions)
print(f"Area Under ROC (AUC) = {auc:.4f}")

# Recall
evaluator_recall_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                              metricName="recallByLabel", metricLabel=1.0)
recall = evaluator_recall_positive.evaluate(predictions)
print(f"Recall (for Readmitted Class 1.0) = {recall:.2%}")

# Precision
evaluator_precision_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                                  metricName="precisionByLabel", metricLabel=1.0)
precision = evaluator_precision_positive.evaluate(predictions)
print(f"Precision (for Readmitted Class 1.0) = {precision:.2%}")

# F1 score
evaluator_f1_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                          metricName="f1", metricLabel=1.0)
f1_score = evaluator_f1_positive.evaluate(predictions)
print(f"F1 Score (for Readmitted Class 1.0) = {f1_score:.4f}")

# --- Detailed View: Confusion Matrix ---
# Shows how many times the model predicted each class vs. the actual class.
print("\nConfusion Matrix:")
# Convert to Pandas for a nice crosstab view
preds_and_labels = predictions.select(['prediction', 'label']).toPandas()
confusion_matrix = pd.crosstab(preds_and_labels['label'], preds_and_labels['prediction'])
print(confusion_matrix)


Area Under ROC (AUC) = 0.6894
Recall (for Readmitted Class 1.0) = 62.70%
Precision (for Readmitted Class 1.0) = 60.09%
F1 Score (for Readmitted Class 1.0) = 0.6364

Confusion Matrix:
prediction   0.0   1.0
label                 
0.0         7008  3879
1.0         3474  5840


# Test_3 random forest with weighted column

In [None]:
from pyspark.ml.classification import RandomForestClassifier

# 1. Instantiate the RandomForest model, making sure to use the weightCol!
rf_1 = RandomForestClassifier(featuresCol='features', 
                            labelCol='label', 
                            weightCol='classWeight',  # Crucial to keep this
                            seed=42)                 # For reproducibility

# 2. Train the model on your weighted training data
print("Training the weighted RandomForest model...")
rf_model_1 = rf_1.fit(trainingData)
print("Model training complete.")

# 3. Make predictions on the UNTOUCHED test data
predictions_rf = rf_model_1.transform(testData)

# 4. Evaluate the new model using your corrected evaluation code
print("\n--- Evaluation of RandomForest Model ---")

# --- AUC ---
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc = evaluator_auc.evaluate(predictions_rf)
print(f"Area Under ROC (AUC) = {auc:.4f}")

# --- Recall for Positive Class ---
evaluator_recall_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="recallByLabel", metricLabel=1.0)
recall = evaluator_recall_positive.evaluate(predictions_rf)
print(f"Recall (for Readmitted Class 1.0) = {recall:.2%}")

# --- Precision for Positive Class ---
evaluator_precision_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="precisionByLabel", metricLabel=1.0)
precision = evaluator_precision_positive.evaluate(predictions_rf)
print(f"Precision (for Readmitted Class 1.0) = {precision:.2%}")

# --- F1 Score for Positive Class ---
evaluator_f1_positive = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1", metricLabel=1.0)
f1_score = evaluator_f1_positive.evaluate(predictions_rf)
print(f"F1 Score (for Readmitted Class 1.0) = {f1_score:.4f}")

# --- Confusion Matrix ---
print("\nConfusion Matrix (RandomForest):")
preds_and_labels_rf = predictions_rf.select(['prediction', 'label']).toPandas()
confusion_matrix_rf = pd.crosstab(preds_and_labels_rf['label'], preds_and_labels_rf['prediction'], rownames=['Actual'], colnames=['Predicted'])
print(confusion_matrix_rf)

Training the weighted RandomForest model...
Model training complete.

--- Evaluation of RandomForest Model ---
Area Under ROC (AUC) = 0.6653
Recall (for Readmitted Class 1.0) = 64.99%
Precision (for Readmitted Class 1.0) = 58.08%
F1 Score (for Readmitted Class 1.0) = 0.6228

Confusion Matrix (RandomForest):
Predicted   0.0   1.0
Actual               
0.0        6519  4368
1.0        3261  6053
