In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
spark = SparkSession.builder.getOrCreate()



In [3]:
# Import Data
df_train = spark.read.csv('./Classification/diabetes_prediction_training.csv', header=True, inferSchema=True)
df_test = spark.read.csv('./Classification/diabetes_prediction_testing.csv', header=True, inferSchema=True)

# Selecting Features & Label Col
df_train = df_train.select('age', 'smoking_history', "bmi", "diabetes", "HbA1c_level")
df_test = df_test.select('age', 'smoking_history', "bmi", "diabetes", "HbA1c_level")

# Filter NA data
df_train.show()

+----+---------------+-----+--------+-----------+
| age|smoking_history|  bmi|diabetes|HbA1c_level|
+----+---------------+-----+--------+-----------+
|80.0|          never|25.19|       0|        6.6|
|54.0|        No Info|27.32|       0|        6.6|
|28.0|          never|27.32|       0|        5.7|
|36.0|        current|23.45|       0|        5.0|
|76.0|        current|20.14|       0|        4.8|
|20.0|          never|27.32|       0|        6.6|
|44.0|          never|19.31|       1|        6.5|
|79.0|        No Info|23.86|       0|        5.7|
|42.0|          never|33.64|       0|        4.8|
|32.0|          never|27.32|       0|        5.0|
|53.0|          never|27.32|       0|        6.1|
|54.0|         former| 54.7|       0|        6.0|
|78.0|         former|36.05|       0|        5.0|
|67.0|          never|25.69|       0|        5.8|
|76.0|        No Info|27.32|       0|        5.0|
|78.0|        No Info|27.32|       0|        6.6|
|15.0|          never|30.36|       0|        6.1|


- never = 0
- ever = 1
- No Info = 2
- not current = 3
- former = 4
- current = 5

In [4]:
df_train = df_train.withColumn("smoking_history", 
                                   when(df_train["smoking_history"] == "never", 0)
                                   .when(df_train["smoking_history"] == "ever", 1)
                                   .when(df_train["smoking_history"] == "No Info", 2)
                                   .when(df_train["smoking_history"] == "not current", 3)
                                   .when(df_train["smoking_history"] == "former", 4)
                                   .when(df_train["smoking_history"] == "current", 5))

df_test = df_test.withColumn("smoking_history", 
                                   when(df_test["smoking_history"] == "never", 0)
                                   .when(df_test["smoking_history"] == "ever", 1)
                                   .when(df_test["smoking_history"] == "No Info", 2)
                                   .when(df_test["smoking_history"] == "not current", 3)
                                   .when(df_test["smoking_history"] == "former", 4)
                                   .when(df_test["smoking_history"] == "current", 5))


df_train.show()

+----+---------------+-----+--------+-----------+
| age|smoking_history|  bmi|diabetes|HbA1c_level|
+----+---------------+-----+--------+-----------+
|80.0|              0|25.19|       0|        6.6|
|54.0|              2|27.32|       0|        6.6|
|28.0|              0|27.32|       0|        5.7|
|36.0|              5|23.45|       0|        5.0|
|76.0|              5|20.14|       0|        4.8|
|20.0|              0|27.32|       0|        6.6|
|44.0|              0|19.31|       1|        6.5|
|79.0|              2|23.86|       0|        5.7|
|42.0|              0|33.64|       0|        4.8|
|32.0|              0|27.32|       0|        5.0|
|53.0|              0|27.32|       0|        6.1|
|54.0|              4| 54.7|       0|        6.0|
|78.0|              4|36.05|       0|        5.0|
|67.0|              0|25.69|       0|        5.8|
|76.0|              2|27.32|       0|        5.0|
|78.0|              2|27.32|       0|        6.6|
|15.0|              0|30.36|       0|        6.1|


In [5]:
# Normalization
df_train = df_train.na.drop()
df_test = df_test.na.drop()

cols = df_train.columns
cols.remove("diabetes")

df_train = VectorAssembler(inputCols=cols, outputCol="VectorOut").transform(df_train)
df_train = StandardScaler(inputCol="VectorOut", outputCol="features").fit(df_train)\
                                                                        .transform(df_train)

df_test = VectorAssembler(inputCols=cols, outputCol="VectorOut").transform(df_test)
df_test = StandardScaler(inputCol="VectorOut", outputCol="features").fit(df_test)\
                                                                        .transform(df_test)

# Model
model = LogisticRegression(featuresCol="features", labelCol="diabetes", maxIter=100).fit(df_train)

# Evaluate
evaluator = BinaryClassificationEvaluator(labelCol="diabetes")
print(f"Result: {evaluator.evaluate(model.transform(df_test)) * 100}%")


Result: 92.61648285427995%
