In [23]:
!pip install pyspark
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit




In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import Imputer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when # Import the when function

# Initialize Spark Session
spark = SparkSession.builder.appName("DiabetesPipeline").getOrCreate()

# Define Schema
my_schema = StructType([
    StructField(name='Pregnancies', dataType=IntegerType(), nullable=True),
    StructField(name='Glucose', dataType=IntegerType(), nullable=True),
    StructField(name='BloodPressure', dataType=IntegerType(), nullable=True),
    StructField(name='SkinThickness', dataType=IntegerType(), nullable=True),
    StructField(name='Insulin', dataType=IntegerType(), nullable=True),
    StructField(name='BMI', dataType=DoubleType(), nullable=True),
    StructField(name='DiabetesPedigreeFunction', dataType=DoubleType(), nullable=True),
    StructField(name='Age', dataType=IntegerType(), nullable=True),
    StructField(name='Outcome', dataType=IntegerType(), nullable=True)
])

# Read Data
my_data = spark.read.csv('diabetes.csv', schema=my_schema, header=True)
# Replace 0 with Null in specified columns
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in columns_to_replace:
    my_data = my_data.withColumn(col, when(my_data[col] == 0, None).otherwise(my_data[col]))

In [34]:
# Define Pipeline Stages
imputer = Imputer(
    inputCols=my_data.columns,
    outputCols=["{}_imputed".format(c) for c in my_data.columns]
).setStrategy("median")


# Fit Imputer
imputerModel = imputer.fit(my_data)

# Transform Data
my_data = imputerModel.transform(my_data)
my_data.head(25)


[Row(Pregnancies=6, Glucose=148, BloodPressure=72, SkinThickness=35, Insulin=None, BMI=33.6, DiabetesPedigreeFunction=0.627, Age=50, Outcome=1, Pregnancies_imputed=6, Glucose_imputed=148, BloodPressure_imputed=72, SkinThickness_imputed=35, Insulin_imputed=125, BMI_imputed=33.6, DiabetesPedigreeFunction_imputed=0.627, Age_imputed=50, Outcome_imputed=1, Pregnancies_imputed_imputed=6, Glucose_imputed_imputed=148, BloodPressure_imputed_imputed=72, SkinThickness_imputed_imputed=35, Insulin_imputed_imputed=125, BMI_imputed_imputed=33.6, DiabetesPedigreeFunction_imputed_imputed=0.627, Age_imputed_imputed=50, Outcome_imputed_imputed=1, Pregnancies_imputed_imputed_imputed=6, Glucose_imputed_imputed_imputed=148, BloodPressure_imputed_imputed_imputed=72, SkinThickness_imputed_imputed_imputed=35, Insulin_imputed_imputed_imputed=125, BMI_imputed_imputed_imputed=33.6, DiabetesPedigreeFunction_imputed_imputed_imputed=0.627, Age_imputed_imputed_imputed=50, Outcome_imputed_imputed_imputed=1, Pregnancie

In [38]:
#my_data.head(20)

assembler = VectorAssembler(
    inputCols=['Pregnancies_imputed', 'Glucose_imputed', 'BloodPressure_imputed', 'SkinThickness_imputed', 'Insulin_imputed', 'BMI_imputed', 'DiabetesPedigreeFunction_imputed', 'Age_imputed'],
    outputCol='features'
)

lr = LogisticRegression(featuresCol='features', labelCol='Outcome_imputed', maxIter=10)

# Create Pipeline
pipeline = Pipeline(stages=[imputer, assembler, lr])

# Split Data
xtrain, xtest = my_data.randomSplit([0.8, 0.2])

# Train Model using Pipeline
pipelineModel = pipeline.fit(xtrain)
predictions = pipelineModel.transform(xtest)

# Evaluate Model
evaluator = MulticlassClassificationEvaluator()
evaluator.setLabelCol("Outcome_imputed")
evaluator.setPredictionCol("prediction")
accuracy = evaluator.evaluate(predictions)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 0.7290776192133658
