In [None]:
# The Spark DataFrame `df` must be loaded/defined before running the next cell;
# it needs every column listed in `feature_columns` plus the label column 'y'.

In [None]:
# preparing data

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml import Pipeline


# Model inputs, listed in the order they will occupy inside the assembled
# feature vector. The label column 'y' is intentionally not part of this list.
feature_columns = [
    "GENHLTH",
    "_AGEG5YR",
    "_RFHYPE6",
    "EMPLOY1",
    "_MICHD",
    "_DRDXAR2",
    "_HCVU653",
    "_RFCHOL3",
    "METVL12_",
    "ALCDAY4",
    "_BMI5CAT",
    "DIFFWALK",
    "_TOTINDA",
    "EDUCA",
    "_INCOMG1",
    "CHCKDNY2",
    "FALL12MN",
    "SMOKE100",
    "CVDINFR4",
]

# Pack the individual input columns into a single vector column "features"
vector_assembler = (
    VectorAssembler()
    .setInputCols(feature_columns)
    .setOutputCol("features")
)
data_with_features = vector_assembler.transform(df)

# Keep only what the model consumes: the feature vector and the label
data_for_model = data_with_features.select(["features", "y"])

# Seeded 80% / 20% split: the larger part is used for training + validation,
# the smaller part is held out for testing
(train_validation_data, test_data) = data_for_model.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)




In [None]:
# Decision tree

# Train the Decision Tree with Hyperparameter Tuning
# Base estimator: reads the assembled "features" vector and the label column "y".
decision_tree = DecisionTreeClassifier(labelCol="y", featuresCol="features")

# Create a ParamGridBuilder to tune hyperparameters
# 4 values of maxDepth x 3 values of minInstancesPerNode = 12 candidate settings
param_grid = (ParamGridBuilder()
              .addGrid(decision_tree.maxDepth, [5, 6, 7, 8])  # Hyperparameters to tune
              .addGrid(decision_tree.minInstancesPerNode, [1, 2, 3])  # Tuning another hyperparameter
              .build())

# Create the evaluator
# The same accuracy evaluator is used for model selection inside cross-validation
# and for the final score on the held-out test data.
evaluator = MulticlassClassificationEvaluator(labelCol="y", predictionCol="prediction", metricName="accuracy")

# Set up cross-validation
# Fold assignment is randomized, so the seed is pinned (same value as the
# train/test split) to make the selected hyperparameters reproducible across runs.
cross_validator = CrossValidator(estimator=decision_tree,
                                 estimatorParamMaps=param_grid,
                                 evaluator=evaluator,
                                 numFolds=3,  # 3-fold cross-validation
                                 seed=42)

# Fit the model using Cross-Validation on the combined Training + Validation Data
cv_model = cross_validator.fit(train_validation_data)

# Make Predictions on Test Data (uses the best model found by cross-validation)
predictions = cv_model.transform(test_data)

# Evaluate the Model on Test Data
accuracy = evaluator.evaluate(predictions)
print(f"Test Accuracy: {accuracy:.2f}")

# Display the decision tree structure
print(cv_model.bestModel.toDebugString)

# Group predictions by the predicted class
# NOTE(review): if 'y' is imbalanced, accuracy alone can look good while one class
# is rarely predicted - compare these counts with the label distribution to confirm.
predictions.groupBy("prediction").count().show()

In [None]:
# Analysis of decision tree

# Importance vector of the best model selected by cross-validation
importances = cv_model.bestModel.featureImportances

# Pair every feature name with the importance stored at the same position
feature_importance = list(zip(feature_columns, importances))

# Order a copy of the pairs from most to least important
sorted_features = list(feature_importance)
sorted_features.sort(key=lambda pair: pair[1], reverse=True)

# Report one "name: importance" line per feature, highest importance first
print("Feature Importances:")
for feature, importance in sorted_features:
    print(f"{feature}: {importance}")
