In [18]:
import os
import subprocess

java_home = subprocess.check_output(["/usr/libexec/java_home", "-v", "17"]).strip().decode('utf-8')

# Set JAVA_HOME and PATH
os.environ["JAVA_HOME"] = java_home
os.environ["PATH"] = os.path.join(java_home, "bin") + ":" + os.environ["PATH"]
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local pyspark-shell"

# Verify JAVA_HOME and Java version
print("JAVA_HOME:", os.environ['JAVA_HOME'])
!java -version

JAVA_HOME: /opt/homebrew/Cellar/openjdk@17/17.0.13/libexec/openjdk.jdk/Contents/Home
openjdk version "17.0.13" 2024-10-15
OpenJDK Runtime Environment Homebrew (build 17.0.13+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.13+0, mixed mode, sharing)


In [19]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("CBRFSS") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.cores", "4") \
    .config("spark.executor.instances", "4") \
    .getOrCreate()

file_path = "data.parquet/ "
df = spark.read.parquet("data.parquet")
df.printSchema()
df.show(5)

root
 |-- _PSU: string (nullable = true)
 |-- SEXVAR: string (nullable = true)
 |-- GENHLTH: string (nullable = true)
 |-- PHYSHLTH: string (nullable = true)
 |-- MENTHLTH: string (nullable = true)
 |-- CHECKUP1: string (nullable = true)
 |-- EXRACT12: string (nullable = true)
 |-- EXEROFT: string (nullable = true)
 |-- EXERHMM: string (nullable = true)
 |-- BPHIGH6: string (nullable = true)
 |-- BPMEDS: string (nullable = true)
 |-- TOLDHI3: string (nullable = true)
 |-- CHOLMED3: string (nullable = true)
 |-- CVDINFR4: string (nullable = true)
 |-- CVDCRHD4: string (nullable = true)
 |-- CVDSTRK3: string (nullable = true)
 |-- ASTHMA3: string (nullable = true)
 |-- CHCSCNC1: string (nullable = true)
 |-- CHCOCNC1: string (nullable = true)
 |-- CHCCOPD3: string (nullable = true)
 |-- ADDEPEV3: string (nullable = true)
 |-- CHCKDNY2: string (nullable = true)
 |-- HAVARTH4: string (nullable = true)
 |-- DIABETE4: string (nullable = true)
 |-- MARITAL: string (nullable = true)
 |-- EDUCA

In [20]:
df.printSchema()

root
 |-- _PSU: string (nullable = true)
 |-- SEXVAR: string (nullable = true)
 |-- GENHLTH: string (nullable = true)
 |-- PHYSHLTH: string (nullable = true)
 |-- MENTHLTH: string (nullable = true)
 |-- CHECKUP1: string (nullable = true)
 |-- EXRACT12: string (nullable = true)
 |-- EXEROFT: string (nullable = true)
 |-- EXERHMM: string (nullable = true)
 |-- BPHIGH6: string (nullable = true)
 |-- BPMEDS: string (nullable = true)
 |-- TOLDHI3: string (nullable = true)
 |-- CHOLMED3: string (nullable = true)
 |-- CVDINFR4: string (nullable = true)
 |-- CVDCRHD4: string (nullable = true)
 |-- CVDSTRK3: string (nullable = true)
 |-- ASTHMA3: string (nullable = true)
 |-- CHCSCNC1: string (nullable = true)
 |-- CHCOCNC1: string (nullable = true)
 |-- CHCCOPD3: string (nullable = true)
 |-- ADDEPEV3: string (nullable = true)
 |-- CHCKDNY2: string (nullable = true)
 |-- HAVARTH4: string (nullable = true)
 |-- DIABETE4: string (nullable = true)
 |-- MARITAL: string (nullable = true)
 |-- EDUCA

In [None]:
# features = ["_BMI5CAT", "DIABETE4", "DIABAGE4"]

In [21]:
from pyspark.sql.types import DoubleType
#casted to double format for easier correlation calculations and prediction later on
for col in df.columns:
    df = df.withColumn(col, df[col].cast(DoubleType()))

In [22]:
df.printSchema()

# Show the first few rows of the DataFrame
df.show(5)

root
 |-- _PSU: double (nullable = true)
 |-- SEXVAR: double (nullable = true)
 |-- GENHLTH: double (nullable = true)
 |-- PHYSHLTH: double (nullable = true)
 |-- MENTHLTH: double (nullable = true)
 |-- CHECKUP1: double (nullable = true)
 |-- EXRACT12: double (nullable = true)
 |-- EXEROFT: double (nullable = true)
 |-- EXERHMM: double (nullable = true)
 |-- BPHIGH6: double (nullable = true)
 |-- BPMEDS: double (nullable = true)
 |-- TOLDHI3: double (nullable = true)
 |-- CHOLMED3: double (nullable = true)
 |-- CVDINFR4: double (nullable = true)
 |-- CVDCRHD4: double (nullable = true)
 |-- CVDSTRK3: double (nullable = true)
 |-- ASTHMA3: double (nullable = true)
 |-- CHCSCNC1: double (nullable = true)
 |-- CHCOCNC1: double (nullable = true)
 |-- CHCCOPD3: double (nullable = true)
 |-- ADDEPEV3: double (nullable = true)
 |-- CHCKDNY2: double (nullable = true)
 |-- HAVARTH4: double (nullable = true)
 |-- DIABETE4: double (nullable = true)
 |-- MARITAL: double (nullable = true)
 |-- EDUCA

In [23]:
num_columns = len(df.columns)

num_rows = df.count()
print(f"Dimensions of DataFrame: {num_rows} rows, {num_columns} columns")

Dimensions of DataFrame: 433323 rows, 139 columns


In [24]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def add_column_y_based_on_x(df: DataFrame, column_x: str) -> DataFrame:
    """
    Adds a new column 'y' to the DataFrame based on the values in column 'x'.

    Parameters:
    df (DataFrame): The input PySpark DataFrame.
    column_x (str): The name of the column to evaluate for conditions.

    Returns:
    DataFrame: A new DataFrame with the additional column 'y'.
    """
    # Create a new column 'y' based on conditions applied to column 'x'
    df = df.withColumn(
        "y",
        F.when(df[column_x].isin(2, 3, 4), 0)  # Assign 0 if x is 2, 3, or 4
        .when(df[column_x] == 1, 1)  # Assign 1 if x is 1
        .otherwise(None),  # Assign None for other values
    )

    return df


# Example usage
# Assuming 'df' is your original DataFrame and you want to evaluate column 'x'
df_with_flag = add_column_y_based_on_x(df, "DIABETE4")

# Show the updated DataFrame
df_with_flag.show()

+-------+------+-------+--------+--------+--------+--------+-------+-------+-------+------+-------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+-------+-----+-------+-------+--------+-------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+-------+--------+--------+--------+--------+--------+--------+------+--------+-------+--------+--------+--------+--------+--------+-------+--------+--------+-------+--------

In [25]:
df = df_with_flag.replace({9: None, 99: None, 999: None})

In [26]:
df.show()

+-------+------+-------+--------+--------+--------+--------+-------+-------+-------+------+-------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+-------+-----+-------+-------+--------+-------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+--------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+-------+--------+--------+--------+--------+--------+--------+------+--------+-------+--------+--------+--------+--------+--------+-------+--------+--------+-------+--------

In [27]:
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def calculate_correlations_with_y(df: DataFrame, target_column: str) -> DataFrame:
    """
    Calculate the correlation between each column in the DataFrame and a specified target column,
    handling null values by dropping rows with any nulls in the columns being compared.

    Parameters:
    df (DataFrame): The input PySpark DataFrame.
    target_column (str): The name of the target column to calculate correlations with.

    Returns:
    DataFrame: A new DataFrame containing pairs of columns and their correlation coefficients.
    """

    # Prepare a list to hold correlation results
    correlations = []

    # Calculate correlations between each column and the target column
    for col in df.columns:
        if col == target_column:
            continue  # Skip the target column itself
        correlation_value = df.stat.corr(col, target_column)
        correlations.append((col, target_column, correlation_value))

    # Create a DataFrame from the correlations list
    correlations_df = df.sparkSession.createDataFrame(
        [result for result in correlations if result[2] is not None],
        ["Column", "Target", "Correlation"],
    )

    return correlations_df


# Example usage
# Assuming 'spark' is your SparkSession and 'df_with_flag' is your original DataFrame with numeric types already casted and 'y' is the target column
correlation_results = calculate_correlations_with_y(df_with_flag, "y")

# Show the correlation results
correlation_results.show()



                                                                                

+--------+------+--------------------+
|  Column|Target|         Correlation|
+--------+------+--------------------+
|    _PSU|     y|-0.03476310477591971|
|  SEXVAR|     y|-0.01835857765920...|
| GENHLTH|     y| 0.24776447914348637|
|PHYSHLTH|     y|-0.07914371693121511|
|MENTHLTH|     y| 0.02670181130559514|
|CHECKUP1|     y|-0.10494091045237935|
|EXRACT12|     y|-0.06052741697272239|
| EXEROFT|     y|-0.06447990615671116|
| EXERHMM|     y|-0.05052216441365793|
| BPHIGH6|     y| -0.2471159110963859|
|  BPMEDS|     y| 0.18774166723378258|
| TOLDHI3|     y|-0.03695206303912695|
|CHOLMED3|     y|-0.08009754666103465|
|CVDINFR4|     y|-0.05543264225305641|
|CVDCRHD4|     y|-0.02345067645981...|
|CVDSTRK3|     y|-0.05334410396006527|
| ASTHMA3|     y|-0.03436481717255345|
|CHCSCNC1|     y|-0.00490675275942705|
|CHCOCNC1|     y|-0.03760772282256236|
|CHCCOPD3|     y|-0.05106953089874152|
+--------+------+--------------------+
only showing top 20 rows



In [28]:
top_correlations_df = (
    correlation_results.withColumn(
        "AbsCorrelation", F.abs(correlation_results["Correlation"])
    )
    .orderBy(F.desc("AbsCorrelation"))
    .drop("AbsCorrelation")
)

top_correlations_df.show(100)




+--------+------+--------------------+
|  Column|Target|         Correlation|
+--------+------+--------------------+
|INDORTAN|     y|                 NaN|
| NUMBURN|     y|                 NaN|
|_IMPRACE|     y|                 NaN|
|DIABETE4|     y| -0.9264783789919822|
|FEETSORE|     y|  0.5920567148889624|
|INSULIN1|     y|  0.5710825026238983|
|DIABEDU1|     y|  0.5592829728928967|
|DIABTYPE|     y|   0.509762705240806|
|EYEEXAM1|     y|   0.506132843727578|
|DIABEYE1|     y| 0.49393632918931357|
|PREDIAB2|     y| -0.2960771669241459|
|CHKHEMO3|     y| 0.25233950884563205|
| GENHLTH|     y| 0.24776447914348637|
| BPHIGH6|     y| -0.2471159110963859|
|_AGEG5YR|     y| 0.20367327606214386|
|PDIABTS1|     y| -0.1947705289931606|
|  BPMEDS|     y| 0.18774166723378258|
|_RFHYPE6|     y| 0.18399994785910842|
| EMPLOY1|     y|  0.1783402473732575|
|  _MICHD|     y|-0.15532978822184057|
|_DRDXAR2|     y| -0.1547562367716296|
| _RFHLTH|     y| 0.15283068991106716|
|_HCVU653|     y| 0.14725

After calculating the correlation of all features, we extracted the top most features that may be related to diabetes by looking up the questions in the codebook as well. TODO: discuss for which features the correlations are less useful than other ones

In our case, these are: 
FEETSORE (feet soreness)
EYEEXAM1 (last eye exam)
GENHLTH (general health)
_AGEG5YR (age cat)
_RFHYPE6 (high BP)
EMPLOY1 (employment status)
_MICHD (heart disease)
_DRDXAR2 (arithris)
_RFHLTH (calculated health val)
_HCVU653 (has/has not insurance)
_RFCHOL3 (high cholesterol)
METVL12_ (activety met value)
ALCDAY4 (alocol consumption last 30 days)
HAVARTH4 (arthritis, rheumatoid arthritis, gout, lupus, or fibromyalgia)
_BMI5CAT (BMI category)
_BMI5 (BMI)
PREGNANT (pregnant)
DIFFWALK (difficulty walking)
_TOTINDA (physical activity)
WTKG3 (weight)
PNEUVAC4 (flu vaccine/shot)
EDUCA (education)
CIMEMLO1 (memories issues)
_INCOMG1 (income)
PADUR1_ (minutes of activity)
CHCKDNY2 (kidney diease)
FALL12MN (falls last 12 months)
SMOKDAY2 (smoking)
CVDINFR4(heart attack)