In [2]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler, Imputer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.functions import col, when

# Download CSV
!wget https://raw.githubusercontent.com/neelamdoshi/Spark_neelam/main/diabetes.csv

# Create (or reuse) the Spark session used by every step below
spark = (
    SparkSession.builder
    .appName("DiabetesLogisticRegressionWithoutPipeline")
    .getOrCreate()
)

# Read the CSV (first row is the header, column types are inferred),
# then print the schema and preview the first five rows
data = spark.read.options(header=True, inferSchema=True).csv("diabetes.csv")
data.printSchema()
data.show(n=5)

# Step 1: Turn zeros into nulls in the columns where a zero is not a valid
# measurement, so they can be treated as missing values later on
columns_with_zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Rebuild the DataFrame in one projection: listed columns get their zeros
# replaced by null, every other column is passed through unchanged.
# Column order and names are preserved.
data = data.select(
    *[
        when(col(name) == 0, None).otherwise(col(name)).alias(name)
        if name in columns_with_zero_as_missing
        else col(name)
        for name in data.columns
    ]
)

# Preview the data after the replacement
data.show(5)

# Step 2: Prepare label column
# Create the label column from Outcome (cast it to double type)
labeled_data = data.withColumn("label", col("Outcome").cast("double"))

# Step 3: Split data into training and test sets
# The split happens BEFORE any estimator is fitted: the Imputer medians and the
# StandardScaler statistics must be learned from training rows only. Fitting
# them on the full dataset leaks test-set information into the preprocessing
# and makes the evaluation on the test set optimistic.
train_raw, test_raw = labeled_data.randomSplit([0.8, 0.2], seed=42)

# Step 4: Impute missing (null) values using the median strategy
# List of all feature columns excluding the label source (Outcome).
# The loop variable is deliberately not called `col`, to avoid shadowing the
# imported pyspark.sql.functions.col.
feature_cols = [name for name in data.columns if name != "Outcome"]
imputed_feature_cols = [f"{name}_imputed" for name in feature_cols]

# Create an Imputer instance to fill missing values with the median
imputer = Imputer(inputCols=feature_cols, outputCols=imputed_feature_cols)\
    .setStrategy("median")  # Set strategy to median

# Learn the medians on the training rows, then apply them to both splits
imputer_model = imputer.fit(train_raw)
imputed_data = imputer_model.transform(train_raw)
imputed_test_data = imputer_model.transform(test_raw)
imputed_data.show(5)  # Inspect imputed training data

# Step 5: Assemble the features (manually without pipeline)
# Assemble the imputed feature columns into a single vector column.
# VectorAssembler is a plain transformer (nothing is fitted), so the same
# instance is applied to both splits.
assembler = VectorAssembler(inputCols=imputed_feature_cols, outputCol="features")
assembled_data = assembler.transform(imputed_data)
assembled_test_data = assembler.transform(imputed_test_data)
assembled_data.select("features").show(5)

# Step 6: Standardize the feature vectors (scaling manually)
# Scaling statistics come from the training rows only and are reused for the
# test rows.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(assembled_data)
train_data = scaler_model.transform(assembled_data)
test_data = scaler_model.transform(assembled_test_data)
train_data.select("scaledFeatures").show(5)

# Step 7: Fit a logistic regression on the training split, reading the
# standardized feature vector and the double-typed label column
lr = (
    LogisticRegression()
    .setFeaturesCol("scaledFeatures")
    .setLabelCol("label")
)
lr_model = lr.fit(train_data)

# Step 8: Score the held-out test split and preview the true label next to
# the predicted class and the class probabilities
predictions = lr_model.transform(test_data)
preview_columns = ["label", "prediction", "probability"]
predictions.select(*preview_columns).show(n=5)

# Step 9: Evaluate the model
# BinaryClassificationEvaluator computes the area under the ROC curve by
# default, which is NOT accuracy; request the metric explicitly and report it
# under its real name.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="rawPrediction",
    labelCol="label",
    metricName="areaUnderROC",
)
area_under_roc = evaluator.evaluate(predictions)
print(f"Area under ROC: {area_under_roc}")

# Accuracy is the fraction of test rows whose predicted class equals the label.
total_rows = predictions.count()
correct_rows = predictions.filter(col("prediction") == col("label")).count()
# Guard against an empty test split to avoid a ZeroDivisionError
accuracy = correct_rows / total_rows if total_rows else float("nan")
print(f"Accuracy: {accuracy}")

# Stop Spark session
# NOTE: this is the last statement of the cell; DataFrames and models created
# from `spark` above presumably can no longer be evaluated once the session
# is stopped, so keep any further Spark work before this line.
spark.stop()


--2024-10-18 16:45:13--  https://raw.githubusercontent.com/neelamdoshi/Spark_neelam/main/diabetes.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23875 (23K) [text/plain]
Saving to: ‘diabetes.csv’


2024-10-18 16:45:13 (16.4 MB/s) - ‘diabetes.csv’ saved [23875/23875]

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregn