# Infor722 Iteration 4 

In [1]:
import findspark
findspark.init('/home/ubuntu/spark-3.2.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, avg, monotonically_increasing_id, row_number, rand, udf
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, NaiveBayes, LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.sql.types import IntegerType, DoubleType, FloatType
from pyspark.ml.linalg import Vectors

In [2]:
spark = SparkSession.builder.master("local[1]").appName("SparkApp")\
    .config("spark.executor.memory", "4g")\
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/23 15:39:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## 2. Data Understand

### 2.1. Load Dataset

In [3]:
df_main = spark.read.csv('Datasets/diabetes_012_health_indicators_BRFSS2021.csv', header=True)
df_income = spark.read.csv('Datasets/diabetes_Income.csv', header=True)

                                                                                

### 2.3. Explore Data

In [4]:
df_desc = df_main.describe()
df_desc.show()

                                                                                

+-------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+------------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------------+-----------------+-----------------+------------------+------+-----------------+------------------+
|summary|       Diabetes_012|            HighBP|           HighChol|          CholCheck|               BMI|             Smoker|             Stroke|HeartDiseaseorAttack|      PhysActivity|             Fruits|            Veggies|  HvyAlcoholConsump|      AnyHealthcare|        NoDocbcCost|           GenHlth|         MentHlth|         PhysHlth|          DiffWalk|   Sex|              Age|         Education|
+-------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+--------------------+------------------+-

In [5]:
df_main.printSchema()

root
 |-- Diabetes_012: string (nullable = true)
 |-- HighBP: string (nullable = true)
 |-- HighChol: string (nullable = true)
 |-- CholCheck: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- Smoker: string (nullable = true)
 |-- Stroke: string (nullable = true)
 |-- HeartDiseaseorAttack: string (nullable = true)
 |-- PhysActivity: string (nullable = true)
 |-- Fruits: string (nullable = true)
 |-- Veggies: string (nullable = true)
 |-- HvyAlcoholConsump: string (nullable = true)
 |-- AnyHealthcare: string (nullable = true)
 |-- NoDocbcCost: string (nullable = true)
 |-- GenHlth: string (nullable = true)
 |-- MentHlth: string (nullable = true)
 |-- PhysHlth: string (nullable = true)
 |-- DiffWalk: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Education: string (nullable = true)



In [6]:
diabetes_counts = df_main.groupBy("Diabetes_012").count()
diabetes_counts.show()

[Stage 5:>                                                          (0 + 1) / 1]

+------------+------+
|Diabetes_012| count|
+------------+------+
|           0|197191|
|           1|  5619|
|           2| 33568|
+------------+------+



                                                                                

### 2.4. Verify the data quality

In [7]:
df_main.printSchema()

root
 |-- Diabetes_012: string (nullable = true)
 |-- HighBP: string (nullable = true)
 |-- HighChol: string (nullable = true)
 |-- CholCheck: string (nullable = true)
 |-- BMI: string (nullable = true)
 |-- Smoker: string (nullable = true)
 |-- Stroke: string (nullable = true)
 |-- HeartDiseaseorAttack: string (nullable = true)
 |-- PhysActivity: string (nullable = true)
 |-- Fruits: string (nullable = true)
 |-- Veggies: string (nullable = true)
 |-- HvyAlcoholConsump: string (nullable = true)
 |-- AnyHealthcare: string (nullable = true)
 |-- NoDocbcCost: string (nullable = true)
 |-- GenHlth: string (nullable = true)
 |-- MentHlth: string (nullable = true)
 |-- PhysHlth: string (nullable = true)
 |-- DiffWalk: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Education: string (nullable = true)



In [8]:
# Count the number of missing values per column
missing_values = df_main.select([count(when(col(c).isNull(), c)).alias(c) for c in df_main.columns])
missing_values.show()

[Stage 8:>                                                          (0 + 1) / 1]

+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+--------+---+---+---------+
|Diabetes_012|HighBP|HighChol|CholCheck|BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|MentHlth|PhysHlth|DiffWalk|Sex|Age|Education|
+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+--------+---+---+---------+
|           0|     0|       1|        1| 14|     9|     7|                  10|          12|     9|      7|                9|            7|          4|      5|       3|       6|       4|  0|  1|        2|
+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---

                                                                                

In [9]:
# Percentage of missing values in the dataset
total_rows = df_main.count()
missing_percentage = df_main.select([(count(when(col(c).isNull(), c)) / total_rows * 100).alias(c) for c in df_main.columns])
missing_percentage.show()

[Stage 14:>                                                         (0 + 1) / 1]

+------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---+--------------------+--------------------+
|Diabetes_012|HighBP|            HighChol|           CholCheck|                 BMI|              Smoker|              Stroke|HeartDiseaseorAttack|        PhysActivity|              Fruits|             Veggies|   HvyAlcoholConsump|       AnyHealthcare|         NoDocbcCost|             GenHlth|            MentHlth|            PhysHlth|            DiffWalk|Sex|                 Age|           Education|
+------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+---

                                                                                

In [10]:
# Value counts for the 'Sex' column
sex_distribution = df_main.groupBy("Sex").count()
sex_distribution.show()

[Stage 17:>                                                         (0 + 1) / 1]

+------+------+
|   Sex| count|
+------+------+
|     F|     3|
|Female|123426|
|female|     2|
|     M|     3|
|  feee|     1|
|  Male|112943|
+------+------+



                                                                                

## 3. Data preparation

### 3.1. Selecte data

In [11]:
main_df = df_main.drop('Education', 'MentHlth')

# Show the DataFrame to confirm columns are dropped
main_df.show()

# Display the first few rows of the DataFrame
main_df.show(5)

+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+------+---+
|Diabetes_012|HighBP|HighChol|CholCheck|BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|PhysHlth|DiffWalk|   Sex|Age|
+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+------+---+
|           0|     0|       1|        1| 15|     1|     0|                   0|           0|     1|      1|                0|            1|          0|      5|      20|       0|Female| 11|
|           2|     1|       0|        1| 28|     0|     0|                   1|           0|     1|      0|                0|            1|          0|      2|       0|       0|Female| 11|
|           2|     1|       1|        1| 33|     0|    

In [12]:
num_columns = len(main_df.columns)
print(f"The number of columns in the DataFrame is: {num_columns}")

The number of columns in the DataFrame is: 19


### 3.2. Clean Data

In [13]:
# Clean 'main_df' DataFrame
main_df = main_df.withColumn("Age", when((col("Age") < 1) | (col("Age") > 13), None).otherwise(col("Age")))
main_df = main_df.withColumn("Diabetes_012", when(~col("Diabetes_012").isin([0, 1, 2]), None).otherwise(col("Diabetes_012")))
main_df = main_df.withColumn("HighBP", when(~col("HighBP").isin([0, 1]), None).otherwise(col("HighBP")))
main_df = main_df.withColumn("HighChol", when(~col("HighChol").isin([0, 1]), None).otherwise(col("HighChol")))
main_df = main_df.withColumn("CholCheck", when(~col("CholCheck").isin([0, 1]), None).otherwise(col("CholCheck")))
main_df = main_df.withColumn("BMI", when((col("BMI") < 12) | (col("BMI") > 99), None).otherwise(col("BMI")))
main_df = main_df.withColumn("Smoker", when(~col("Smoker").isin([0, 1]), None).otherwise(col("Smoker")))
main_df = main_df.withColumn("Stroke", when(~col("Stroke").isin([0, 1]), None).otherwise(col("Stroke")))
main_df = main_df.withColumn("HeartDiseaseorAttack", when(~col("HeartDiseaseorAttack").isin([0, 1]), None).otherwise(col("HeartDiseaseorAttack")))
main_df = main_df.withColumn("PhysActivity", when(~col("PhysActivity").isin([0, 1]), None).otherwise(col("PhysActivity")))
main_df = main_df.withColumn("Fruits", when(~col("Fruits").isin([0, 1]), None).otherwise(col("Fruits")))
main_df = main_df.withColumn("Veggies", when(~col("Veggies").isin([0, 1]), None).otherwise(col("Veggies")))
main_df = main_df.withColumn("HvyAlcoholConsump", when(~col("HvyAlcoholConsump").isin([0, 1]), None).otherwise(col("HvyAlcoholConsump")))
main_df = main_df.withColumn("AnyHealthcare", when(~col("AnyHealthcare").isin([0, 1]), None).otherwise(col("AnyHealthcare")))
main_df = main_df.withColumn("NoDocbcCost", when(~col("NoDocbcCost").isin([0, 1]), None).otherwise(col("NoDocbcCost")))
main_df = main_df.withColumn("GenHlth", when(~col("GenHlth").isin([1, 2, 3, 4, 5]), None).otherwise(col("GenHlth")))
main_df = main_df.withColumn("PhysHlth", when((col("PhysHlth") < 0) | (col("PhysHlth") > 30), None).otherwise(col("PhysHlth")))
main_df = main_df.withColumn("DiffWalk", when(~col("DiffWalk").isin([0, 1]), None).otherwise(col("DiffWalk")))

# Clean 'second_df' DataFrame
second_df = df_income.withColumn("Income", when((col("Income") < 1) | (col("Income") > 11), None).otherwise(col("Income")))

In [14]:
columns_of_interest = [
    'Age', 'Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'PhysHlth',
    'DiffWalk'
]

# Calculate missing values for each field in the main DataFrame
missing_values = main_df.select([count(when(col(c).isNull(), c)).alias(c) for c in columns_of_interest])
missing_values.show()

[Stage 22:>                                                         (0 + 1) / 1]

+---+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+
|Age|Diabetes_012|HighBP|HighChol|CholCheck|BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|PhysHlth|DiffWalk|
+---+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+
|  2|           0|     0|       1|        1| 29|     9|     7|                  10|          12|     9|      7|                9|            7|          4|      5|       9|       4|
+---+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+



                                                                                

In [15]:
# Calculate missing values for the 'Income' field in the second DataFrame
missing_values_second = second_df.select([count(when(col('Income').isNull(), 'Income')).alias('Income')])
missing_values_second.show()

+------+
|Income|
+------+
|     0|
+------+



In [16]:
binary_columns = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare',
    'NoDocbcCost', 'DiffWalk'
]

# Replace NaN in binary columns with 1
for col_name in binary_columns:
    main_df = main_df.na.fill({col_name: 1})

In [17]:
# Define non-binary columns
non_binary_columns = [
    'Age', 'Diabetes_012', 'BMI', 'GenHlth', 'PhysHlth'
]

# Replace NaN in non-binary columns with the column's mean
for col_name in non_binary_columns:
    mean_value = main_df.select(avg(col_name)).first()[0]
    main_df = main_df.na.fill({col_name: mean_value})

                                                                                

In [18]:
columns_of_interest = [
    'Age', 'Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'PhysHlth',
    'DiffWalk'
]

# Calculate missing values for each field in the main DataFrame
missing_values = main_df.select([count(when(col(c).isNull(), c)).alias(c) for c in columns_of_interest])
missing_values.show()

+---+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+
|Age|Diabetes_012|HighBP|HighChol|CholCheck|BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|PhysHlth|DiffWalk|
+---+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+
|  0|           0|     0|       0|        0|  0|     0|     0|                   0|           0|     0|      0|                0|            0|          0|      0|       0|       0|
+---+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+



In [19]:
main_df.select("Sex").distinct().show()

+------+
|   Sex|
+------+
|     F|
|Female|
|female|
|     M|
|  feee|
|  Male|
+------+



[Stage 46:>                                                         (0 + 1) / 1]                                                                                

In [20]:
sex_mapping = {
    'female': 0, 'F': 0, 'M': 1, 'feee': 0, 'Female': 0, 'Male': 1
}

# Applying the mapping using when().otherwise() for each condition
# Note: PySpark does not have direct replacement using dictionary, thus using multiple when().otherwise()
main_df = main_df.withColumn("Sex", 
    when(col("Sex") == "female", 0)
    .when(col("Sex") == "F", 0)
    .when(col("Sex") == "M", 1)
    .when(col("Sex") == "feee", 0)
    .when(col("Sex") == "Female", 0)
    .when(col("Sex") == "Male", 1)
    .otherwise(col("Sex")))

# Display the unique values in the 'Sex' column to confirm changes
main_df.select("Sex").distinct().show()

+---+
|Sex|
+---+
|  0|
|  1|
+---+



[Stage 49:>                                                         (0 + 1) / 1]                                                                                

In [21]:
# Count the values in the 'Sex' column
main_df.groupBy("Sex").count().show()

[Stage 52:>                                                         (0 + 1) / 1]

+---+------+
|Sex| count|
+---+------+
|  0|123432|
|  1|112946|
+---+------+



                                                                                

### 3.3. Construct the data

In [22]:
# Define bins and corresponding labels
bins = [-1, 0, 10, 20, float('inf')]
labels = [0, 1, 2, 3]

# Creating a new column 'physical_health' based on bins
main_df = main_df.withColumn("physical_health", 
    when((col("PhysHlth") > bins[0]) & (col("PhysHlth") <= bins[1]), labels[0])
    .when((col("PhysHlth") > bins[1]) & (col("PhysHlth") <= bins[2]), labels[1])
    .when((col("PhysHlth") > bins[2]) & (col("PhysHlth") <= bins[3]), labels[2])
    .when(col("PhysHlth") > bins[3], labels[3])
    .otherwise(None))

# Count the values in the 'physical_health' column
physical_health_distribution = main_df.groupBy("physical_health").count()
physical_health_distribution.show()

[Stage 55:>                                                         (0 + 1) / 1]

+---------------+------+
|physical_health| count|
+---------------+------+
|              1| 51500|
|              3| 18113|
|              2|  9987|
|              0|156778|
+---------------+------+



                                                                                

In [23]:
# Create new column 'Cardiovascular Disease' based on conditions
main_df = main_df.withColumn("Cardiovascular Disease", 
    when((col("HeartDiseaseorAttack") == 1) | (col("Stroke") == 1), 1).otherwise(0))

# Create new column 'Diseases of Metabolic Syndrome' based on conditions
main_df = main_df.withColumn("Diseases of Metabolic Syndrome", 
    when((col("HighBP") == 1) | (col("HighChol") == 1), 1).otherwise(0))

# Show the updated DataFrame to see the changes (optional)
main_df.show()

+------------+------+--------+---------+-----------------+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+
|Diabetes_012|HighBP|HighChol|CholCheck|              BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|PhysHlth|DiffWalk|Sex|Age|physical_health|Cardiovascular Disease|Diseases of Metabolic Syndrome|
+------------+------+--------+---------+-----------------+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+
|           0|     0|       1|        1|               15|     1|     0|                   0|           0|     1|      1|                0|            1|          0|      5|      20|       0

In [24]:
# Count the values in the 'Cardiovascular Disease' column
cardiovascular_disease_counts = main_df.groupBy("Cardiovascular Disease").count()
cardiovascular_disease_counts.show()

# Count the values in the 'Diseases of Metabolic Syndrome' column
metabolic_syndrome_counts = main_df.groupBy("Diseases of Metabolic Syndrome").count()
metabolic_syndrome_counts.show()

                                                                                

+----------------------+------+
|Cardiovascular Disease| count|
+----------------------+------+
|                     1| 26589|
|                     0|209789|
+----------------------+------+

+------------------------------+------+
|Diseases of Metabolic Syndrome| count|
+------------------------------+------+
|                             1|138001|
|                             0| 98377|
+------------------------------+------+



### 3.4. Integrate various data sources

In [25]:
main_df = main_df.withColumn("index", monotonically_increasing_id())
second_df = second_df.withColumn("index", monotonically_increasing_id())

# Perform the inner join on the index column
final_df = main_df.join(second_df, main_df.index == second_df.index, how='inner')
final_df.show()

+------------+------+--------+---------+-----------------+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+-----+------+-----+
|Diabetes_012|HighBP|HighChol|CholCheck|              BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|PhysHlth|DiffWalk|Sex|Age|physical_health|Cardiovascular Disease|Diseases of Metabolic Syndrome|index|Income|index|
+------------+------+--------+---------+-----------------+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+-----+------+-----+
|           0|     0|       1|        1|               15|     1|     0|                   0|           0|     1|      1|            

In [26]:
num_columns = len(final_df.columns)
print(f"The number of columns in final_df: {num_columns}")
num_rows = final_df.count()
print(f"The number of rows in final_df: {num_rows}")

The number of columns in final_df: 25
The number of rows in final_df: 236378


In [27]:
final_df = final_df.drop('index')
num_columns = len(final_df.columns)
print(f"The number of columns in final_df: {num_columns}")

The number of columns in final_df: 23


### 3.5. Format the data as required

In [28]:
# Calculate the number of missing values per column
final_missing_value = final_df.select([count(when(col(c).isNull(), c)).alias(c) for c in final_df.columns])
final_missing_value.show()

+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+------+
|Diabetes_012|HighBP|HighChol|CholCheck|BMI|Smoker|Stroke|HeartDiseaseorAttack|PhysActivity|Fruits|Veggies|HvyAlcoholConsump|AnyHealthcare|NoDocbcCost|GenHlth|PhysHlth|DiffWalk|Sex|Age|physical_health|Cardiovascular Disease|Diseases of Metabolic Syndrome|Income|
+------------+------+--------+---------+---+------+------+--------------------+------------+------+-------+-----------------+-------------+-----------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+------+
|           0|     0|       0|        0|  0|     0|     0|                   0|           0|     0|      0|                0|            0|          0|      0|       0|       0|  0|  0|              0|          

## 4. 

In [29]:
value_counts = final_df.groupBy("Diabetes_012").count()
value_counts.show()
min_size = value_counts.agg({"count": "min"}).collect()[0][0]
print(f"Minimum class size: {min_size}")
windowSpec = Window.partitionBy("Diabetes_012").orderBy(rand())
final_df = final_df.withColumn("row_number", row_number().over(windowSpec))
balanced_data = final_df.filter(col("row_number") <= min_size).drop("row_number")
balanced_data.groupBy("Diabetes_012").count().show()

+------------+------+
|Diabetes_012| count|
+------------+------+
|           0|197191|
|           1|  5619|
|           2| 33568|
+------------+------+



                                                                                

Minimum class size: 5619


[Stage 89:>                                                         (0 + 1) / 1]

+------------+-----+
|Diabetes_012|count|
+------------+-----+
|           0| 5619|
|           1| 5619|
|           2| 5619|
+------------+-----+



                                                                                

In [30]:
columns_to_convert = [
    'Age', 'Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'PhysHlth',
    'DiffWalk', 'physical_health', 'Cardiovascular Disease', 'Diseases of Metabolic Syndrome', 
    'Income', 'Sex']
for column_name in columns_to_convert:
    balanced_data = balanced_data.withColumn(column_name, col(column_name).cast(IntegerType()))


feature_columns = [c for c in balanced_data.columns if c != 'Diabetes_012']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
rf = RandomForestClassifier(labelCol="Diabetes_012", featuresCol="features", numTrees=100, seed=42)
pipeline = Pipeline(stages=[assembler, rf])
(trainingData, testData) = balanced_data.randomSplit([0.7, 0.3], seed=42)
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
evaluator = MulticlassClassificationEvaluator(
    labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))
rfModel = model.stages[-1]
importances = rfModel.featureImportances
importances_list = importances.toArray().tolist()
feature_importance_dict = dict(zip(feature_columns, importances_list))
sorted_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)
for feature, importance in sorted_importance:
    print(f"{feature}: {importance}")


[Stage 128:>                                                        (0 + 1) / 1]

Test set accuracy = 0.5032245062474808
GenHlth: 0.21333884621943663
Diseases of Metabolic Syndrome: 0.19496958646936466
HighBP: 0.18383625136286294
Age: 0.09554931804062501
BMI: 0.08886205909906654
HighChol: 0.0462627899520565
DiffWalk: 0.046003911828705585
PhysActivity: 0.03308384630674443
Cardiovascular Disease: 0.023057131908188137
Income: 0.021300218878944753
HeartDiseaseorAttack: 0.014856096267459635
PhysHlth: 0.011855347010821896
physical_health: 0.005720250467666654
Sex: 0.004285689086399554
CholCheck: 0.004067369742956981
Stroke: 0.0028522518931771907
HvyAlcoholConsump: 0.0026732314032923993
AnyHealthcare: 0.0021121233025785187
NoDocbcCost: 0.0020558735335636535
Veggies: 0.0012781230309743034
Fruits: 0.0010141572644015574
Smoker: 0.0009655269307126209


                                                                                

In [31]:
balanced_data = balanced_data.drop('Fruits', 'AnyHealthcare', 'NoDocbcCost', 'Smoker', 'Veggies')
balanced_data.show()

[Stage 131:>                                                        (0 + 1) / 1]

+------------+------+--------+---------+---+------+--------------------+------------+-----------------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+------+
|Diabetes_012|HighBP|HighChol|CholCheck|BMI|Stroke|HeartDiseaseorAttack|PhysActivity|HvyAlcoholConsump|GenHlth|PhysHlth|DiffWalk|Sex|Age|physical_health|Cardiovascular Disease|Diseases of Metabolic Syndrome|Income|
+------------+------+--------+---------+---+------+--------------------+------------+-----------------+-------+--------+--------+---+---+---------------+----------------------+------------------------------+------+
|           0|     0|       0|        1| 30|     0|                   0|           1|                0|      2|       0|       0|  0|  4|              0|                     0|                             0|    11|
|           0|     0|       0|        1| 40|     0|                   0|           1|                0|      1|       1|       0|  0|  6|   

[Stage 133:>                                                        (0 + 1) / 1]                                                                                

In [32]:
balanced_data = balanced_data.drop('HeartDiseaseorAttack', 'Stroke', 'HighBP', 'HighChol', 'PhysHlth')
balanced_data.show()

[Stage 137:>                                                        (0 + 1) / 1]

+------------+---------+---+------------+-----------------+-------+--------+---+---+---------------+----------------------+------------------------------+------+
|Diabetes_012|CholCheck|BMI|PhysActivity|HvyAlcoholConsump|GenHlth|DiffWalk|Sex|Age|physical_health|Cardiovascular Disease|Diseases of Metabolic Syndrome|Income|
+------------+---------+---+------------+-----------------+-------+--------+---+---+---------------+----------------------+------------------------------+------+
|           0|        1| 30|           1|                0|      2|       0|  0|  4|              0|                     0|                             0|    11|
|           0|        1| 40|           1|                0|      1|       0|  0|  6|              1|                     0|                             0|    10|
|           0|        1| 28|           0|                0|      4|       0|  0|  2|              1|                     0|                             1|     5|
|           0|        1| 34|

                                                                                

In [33]:
balanced_data = balanced_data.withColumn("AgeGroup",
    when(col("Age") < 4, 0)
    .when(col("Age") < 8, 1)
    .otherwise(2)
)

# Drop the 'Age' column
balanced_data = balanced_data.drop('Age')

# Assigning the result to another DataFrame (if necessary)
resulte = balanced_data

# Display DataFrame structure using printSchema() to mimic pandas info()
resulte.printSchema()

# Calculate and print the number of each data type
print("DataFrame information:")
for dtype in resulte.dtypes:
    count = resulte.filter(resulte[dtype[0]].cast(dtype[1]) == resulte[dtype[0]]).count()
    print(f"Column: {dtype[0]}, Type: {dtype[1]}, Non-null Count: {count}")

root
 |-- Diabetes_012: integer (nullable = true)
 |-- CholCheck: integer (nullable = true)
 |-- BMI: integer (nullable = true)
 |-- PhysActivity: integer (nullable = true)
 |-- HvyAlcoholConsump: integer (nullable = true)
 |-- GenHlth: integer (nullable = true)
 |-- DiffWalk: integer (nullable = true)
 |-- Sex: integer (nullable = true)
 |-- physical_health: integer (nullable = true)
 |-- Cardiovascular Disease: integer (nullable = false)
 |-- Diseases of Metabolic Syndrome: integer (nullable = false)
 |-- Income: integer (nullable = true)
 |-- AgeGroup: integer (nullable = false)

DataFrame information:


                                                                                

Column: Diabetes_012, Type: int, Non-null Count: 16857


                                                                                

Column: CholCheck, Type: int, Non-null Count: 16857


                                                                                

Column: BMI, Type: int, Non-null Count: 16857


                                                                                

Column: PhysActivity, Type: int, Non-null Count: 16857


                                                                                

Column: HvyAlcoholConsump, Type: int, Non-null Count: 16857


                                                                                

Column: GenHlth, Type: int, Non-null Count: 16857


                                                                                

Column: DiffWalk, Type: int, Non-null Count: 16857


                                                                                

Column: Sex, Type: int, Non-null Count: 16857


                                                                                

Column: physical_health, Type: int, Non-null Count: 16857


                                                                                

Column: Cardiovascular Disease, Type: int, Non-null Count: 16857


                                                                                

Column: Diseases of Metabolic Syndrome, Type: int, Non-null Count: 16857
Column: Income, Type: int, Non-null Count: 16857


[Stage 223:>                                                        (0 + 1) / 1]

Column: AgeGroup, Type: int, Non-null Count: 16857


[Stage 225:>                                                        (0 + 1) / 1]                                                                                

## 6.

In [34]:
# Specify the input columns (features) and the output column (features vector)
feature_columns = [c for c in resulte.columns if c != 'Diabetes_012']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Transform data to have feature vectors
data = assembler.transform(resulte)
# Split the data into training and testing sets
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

In [35]:
# Create the Naive Bayes model
nb = NaiveBayes(labelCol="Diabetes_012", featuresCol="features", modelType="gaussian")

# Train the model
nb_model = nb.fit(train_data)

# Predictions
predictions = nb_model.transform(test_data)

# Evaluate the model
evaluator = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")
train_accuracy = evaluator.evaluate(nb_model.transform(train_data))
test_accuracy = evaluator.evaluate(predictions)

# Print the accuracies
print(f'Naive Bayes training accuracy: {train_accuracy}')
print(f'Naive Bayes testing accuracy: {test_accuracy}')

[Stage 246:>                                                        (0 + 1) / 1]

Naive Bayes training accuracy: 0.49701555275325765
Naive Bayes testing accuracy: 0.4909310761789601


                                                                                

In [None]:
train_data_list = train_data.collect()
train_data_broadcast = spark.sparkContext.broadcast(train_data_list)

def knn_predict(features):
    k = 3  
    train_data = train_data_broadcast.value
    distances = []
    for row in train_data:
        dist = float(Vectors.squared_distance(features, Vectors.dense(row['features'])))
        distances.append((dist, row['Diabetes_012']))
    distances.sort(key=lambda x: x[0])
    nearest_labels = [dist[1] for dist in distances[:k]]
    prediction = max(set(nearest_labels), key=nearest_labels.count)
    return prediction

knn_predict_udf = udf(knn_predict, IntegerType())

knn_train_data = train_data.withColumn("prediction", knn_predict_udf(col("features"))).withColumn("prediction", col("prediction").cast(DoubleType()))
knn_test_data = test_data.withColumn("prediction", knn_predict_udf(col("features"))).withColumn("prediction", col("prediction").cast(DoubleType()))

evaluator = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")

test_accuracy = evaluator.evaluate(knn_test_data)
train_accuracy = evaluator.evaluate(knn_train_data)

print(f'KNN training accuracy: {train_accuracy}')
print(f'KNN testing accuracy: {test_accuracy}')

In [36]:
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Initialize One Vs Rest with the base classifier
ovr = OneVsRest(classifier=lsvc, labelCol="Diabetes_012", featuresCol="features")

# Train the multiclass model
ovrModel = ovr.fit(train_data)

train_predictions_ovr = ovrModel.transform(train_data)
test_predictions_ovr = ovrModel.transform(test_data)

evaluator_ovr = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")

train_accuracy_ovr = evaluator_ovr.evaluate(train_predictions_ovr)
print(f'One-vs-Rest SVC training accuracy: {train_accuracy_ovr}')

test_accuracy_ovr = evaluator_ovr.evaluate(test_predictions_ovr)
print(f'One-vs-Rest SVC test accuracy: {test_accuracy_ovr}')

24/05/23 15:42:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/05/23 15:42:21 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
[Stage 444:>                                                        (0 + 1) / 1]

One-vs-Rest SVC training accuracy: 0.48675914249684743


[Stage 449:>                                                        (0 + 1) / 1]

One-vs-Rest SVC test accuracy: 0.4758162031438936


                                                                                

In [43]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lsvc = LinearSVC(maxIter=10, regParam=0.1)
ovr = OneVsRest(classifier=lsvc, labelCol="Diabetes_012", featuresCol="features")

paramGrid = ParamGridBuilder() \
    .addGrid(lsvc.regParam, [0.1, 1.0, 10.0]) \
    .addGrid(lsvc.maxIter, [10, 100, 1000]) \
    .build()

evaluator = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")

crossval = CrossValidator(estimator=ovr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)  # Use 5-fold cross validation

cvModel = crossval.fit(train_data)

bestModel = cvModel.bestModel
bestLsvcModel = bestModel.getClassifier().extractParamMap()
print("Best model parameters:", bestLsvcModel)

Best model parameters: [0.1, 10]


## 7.

In [44]:
from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Initialize One Vs Rest with the base classifier
ovr = OneVsRest(classifier=lsvc, labelCol="Diabetes_012", featuresCol="features")

# Train the multiclass model
ovrModel = ovr.fit(train_data)

train_predictions_ovr = ovrModel.transform(train_data)
test_predictions_ovr = ovrModel.transform(test_data)

evaluator_ovr = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")

train_accuracy_ovr = evaluator_ovr.evaluate(train_predictions_ovr)
print(f'One-vs-Rest SVC training accuracy: {train_accuracy_ovr}')

test_accuracy_ovr = evaluator_ovr.evaluate(test_predictions_ovr)
print(f'One-vs-Rest SVC test accuracy: {test_accuracy_ovr}')

                                                                                

One-vs-Rest SVC training accuracy: 0.4861674658797492


[Stage 3173:>                                                       (0 + 1) / 1]

One-vs-Rest SVC test accuracy: 0.4788007268322229


                                                                                

In [45]:
predictions = ovrModel.transform(test_data)

# Adding a new column that converts prediction scores to binary class labels
predictions = predictions.withColumn("predictedLabel", predictions.prediction.cast("integer"))

# Calculate positive and negative cases
positive_cases = predictions.filter(col("predictedLabel") == 1).count()
negative_cases = predictions.filter(col("predictedLabel") == 0).count()

# Print results
print(f"1: {positive_cases}")
print(f"0: {negative_cases}")

[Stage 3185:>                                                       (0 + 1) / 1]

1: 160
0: 1759


                                                                                

## 8.

In [46]:
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Initialize One Vs Rest with the base classifier
ovr = OneVsRest(classifier=lsvc, labelCol="Diabetes_012", featuresCol="features")

# Train the multiclass model
ovrModel = ovr.fit(train_data)

train_predictions_ovr = ovrModel.transform(train_data)
test_predictions_ovr = ovrModel.transform(test_data)

evaluator_ovr = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")

train_accuracy_ovr = evaluator_ovr.evaluate(train_predictions_ovr)
print(f'One-vs-Rest SVC training accuracy: {train_accuracy_ovr}')

test_accuracy_ovr = evaluator_ovr.evaluate(test_predictions_ovr)
print(f'One-vs-Rest SVC test accuracy: {test_accuracy_ovr}')

                                                                                

One-vs-Rest SVC training accuracy: 0.48675914249684743


[Stage 3390:>                                                       (0 + 1) / 1]

One-vs-Rest SVC test accuracy: 0.4758162031438936


                                                                                

In [47]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)

from pyspark.ml.classification import OneVsRest
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1)

# Initialize One Vs Rest with the base classifier
ovr = OneVsRest(classifier=lsvc, labelCol="Diabetes_012", featuresCol="features")

# Train the multiclass model
ovrModel = ovr.fit(train_data)

train_predictions_ovr = ovrModel.transform(train_data)
test_predictions_ovr = ovrModel.transform(test_data)

evaluator_ovr = MulticlassClassificationEvaluator(labelCol="Diabetes_012", predictionCol="prediction", metricName="accuracy")

train_accuracy_ovr = evaluator_ovr.evaluate(train_predictions_ovr)
print(f'One-vs-Rest SVC training accuracy: {train_accuracy_ovr}')

test_accuracy_ovr = evaluator_ovr.evaluate(test_predictions_ovr)
print(f'One-vs-Rest SVC test accuracy: {test_accuracy_ovr}')

                                                                                

One-vs-Rest SVC training accuracy: 0.4861674658797492


[Stage 3593:>                                                       (0 + 1) / 1]

One-vs-Rest SVC test accuracy: 0.4788007268322229


                                                                                