In [2]:
import os
import subprocess

java_home = subprocess.check_output(["/usr/libexec/java_home", "-v", "17"]).strip().decode('utf-8')

# Set JAVA_HOME and PATH
os.environ["JAVA_HOME"] = java_home
os.environ["PATH"] = os.path.join(java_home, "bin") + ":" + os.environ["PATH"]
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local pyspark-shell"

# Verify JAVA_HOME and Java version
print("JAVA_HOME:", os.environ['JAVA_HOME'])
!java -version

JAVA_HOME: /opt/homebrew/Cellar/openjdk@17/17.0.13/libexec/openjdk.jdk/Contents/Home
openjdk version "17.0.13" 2024-10-15
OpenJDK Runtime Environment Homebrew (build 17.0.13+0)
OpenJDK 64-Bit Server VM Homebrew (build 17.0.13+0, mixed mode, sharing)


In [3]:
from pyspark.sql import SparkSession

# Create the notebook's Spark session, or reuse one that is already running.
# NOTE(review): PYSPARK_SUBMIT_ARGS is set to "--master local" earlier in this
# notebook; the spark.executor.* settings below may have no effect in local
# mode -- confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .appName("CBRFSS")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "4")
    .config("spark.executor.instances", "4")
    .getOrCreate()
)



24/11/26 21:56:17 WARN Utils: Your hostname, MacBook-Air-von-Linda.local resolves to a loopback address: 127.0.0.1; using 10.89.101.139 instead (on interface en0)
24/11/26 21:56:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/26 21:56:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/26 21:56:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/26 21:56:18 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
# Relative path of the pre-processed Parquet dataset.
file_path = "output/processedv2.parquet"

# Load the dataset and preview its first 30 rows.
df = spark.read.parquet(file_path)
df.show(n=30)

                                                                                

+-------+--------+--------+-------+------+--------+--------+--------+--------+-------+--------+--------+--------+-----+--------+--------+--------+--------+--------+---+
|GENHLTH|_AGEG5YR|_RFHYPE6|EMPLOY1|_MICHD|_DRDXAR2|_HCVU653|_RFCHOL3|METVL12_|ALCDAY4|_BMI5CAT|DIFFWALK|_TOTINDA|EDUCA|_INCOMG1|CHCKDNY2|FALL12MN|SMOKE100|CVDINFR4|  y|
+-------+--------+--------+-------+------+--------+--------+--------+--------+-------+--------+--------+--------+-----+--------+--------+--------+--------+--------+---+
|    4.0|    11.0|     2.0|    1.0|   2.0|     1.0|     9.0|     1.0|   103.0|  888.0|     2.0|     1.0|     1.0|  5.0|     4.0|     2.0|    88.0|     1.0|     2.0|  0|
|    2.0|     9.0|     1.0|    2.0|   2.0|     2.0|     1.0|     2.0|   106.0|  220.0|     2.0|     2.0|     1.0|  6.0|     5.0|     2.0|     2.0|     2.0|     2.0|  0|
|    3.0|     8.0|     1.0|    1.0|   2.0|     2.0|     1.0|     1.0|   104.0|  210.0|     4.0|     2.0|     1.0|  6.0|     5.0|     2.0|    88.0|     2.0|

In [5]:
# Print each column's name, data type and nullability for the loaded DataFrame.
df.printSchema()

root
 |-- GENHLTH: double (nullable = true)
 |-- _AGEG5YR: double (nullable = true)
 |-- _RFHYPE6: double (nullable = true)
 |-- EMPLOY1: double (nullable = true)
 |-- _MICHD: double (nullable = true)
 |-- _DRDXAR2: double (nullable = true)
 |-- _HCVU653: double (nullable = true)
 |-- _RFCHOL3: double (nullable = true)
 |-- METVL12_: double (nullable = true)
 |-- ALCDAY4: double (nullable = true)
 |-- _BMI5CAT: double (nullable = true)
 |-- DIFFWALK: double (nullable = true)
 |-- _TOTINDA: double (nullable = true)
 |-- EDUCA: double (nullable = true)
 |-- _INCOMG1: double (nullable = true)
 |-- CHCKDNY2: double (nullable = true)
 |-- FALL12MN: double (nullable = true)
 |-- SMOKE100: double (nullable = true)
 |-- CVDINFR4: double (nullable = true)
 |-- y: integer (nullable = true)



In [6]:
# Row count triggers a Spark job; column count is read from the schema.
num_rows = df.count()
num_columns = len(df.columns)

# Report the DataFrame's shape.
print(f"Dimensions of DataFrame: {num_rows} rows, {num_columns} columns")

Dimensions of DataFrame: 251504 rows, 20 columns


We split the data into an 80/20 train/test partition and tune the model with cross-validation on the training set.

In [7]:
# Replace missing values with 0, then split 80/20 with a fixed seed.
df_filled = df.fillna(0)
train_df, test_df = df_filled.randomSplit([0.8, 0.2], seed=1)

# Report the size of each split (each count runs a Spark job).
train_count = train_df.count()
print(f"Training set count: {train_count}")
test_count = test_df.count()
print(f"Test set count: {test_count}")

                                                                                

Training set count: 201155
Test set count: 50349


In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Every column except the target variable 'y' is used as a feature.
label_col = "y"
feature_columns = [name for name in df.columns if name != label_col]

# Step 1: Assemble features into a single vector for training data
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
train_data = assembler.transform(train_df)

# Step 2: Set up Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol=label_col)

# Step 3: Set up Cross-Validation
# Grid of 2 x 2 = 4 candidate (regParam, elasticNetParam) combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.1, 0.01])
    .addGrid(lr.elasticNetParam, [0.0, 0.5])
    .build()
)

# One RMSE evaluator is shared by cross-validation and the final test-set
# evaluation so both are scored with the same metric configuration.
evaluator = RegressionEvaluator(
    labelCol=label_col, predictionCol="prediction", metricName="rmse"
)

crossval = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,  # Use 5+ folds for cross-validation
    seed=1,  # fixed seed so fold assignment is reproducible across runs
)

# Train the model using Cross-Validation on the training data
cv_model = crossval.fit(train_data)

# Step 4: Evaluate on Test Set
test_data = assembler.transform(test_df)
test_predictions = cv_model.transform(test_data)

# Evaluate the model performance on test data
test_rmse = evaluator.evaluate(test_predictions)
print(f"Root Mean Squared Error (RMSE) on test data: {test_rmse}")

# Optional: Show best model parameters from cross-validation.
# Read them through the public param getters rather than the private
# `_java_obj` handle.
best_model = cv_model.bestModel
print(f"Best Regularization Parameter: {best_model.getRegParam()}")
print(f"Best Elastic Net Parameter: {best_model.getElasticNetParam()}")

                                                                                

Root Mean Squared Error (RMSE) on test data: 0.362475507154418
Best Regularization Parameter: 0.01
Best Elastic Net Parameter: 0.0
