In [1]:
!pip install pyspark



In [20]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler

# Obtain the notebook's Spark session (created on first call, reused after)
# under the application name "BDA Q1".
spark = (
    SparkSession.builder
    .appName("BDA Q1")
    .getOrCreate()
)

In [21]:
# Register the remote CSV with the Spark context, then read the copy that
# Spark fetched, treating the first row as the header and letting Spark
# infer the column types.
url = "https://raw.githubusercontent.com/pkmklong/Breast-Cancer-Wisconsin-Diagnostic-DataSet/master/data.csv"
spark.sparkContext.addFile(url)

local_csv_path = SparkFiles.get("data.csv")
df = spark.read.csv(local_csv_path, header=True, inferSchema=True)

In [22]:
# Peek at a single row to check the header names and parsed values.
df.show(n=1)

+------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+----+
|    id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave points_worst|symmetry_worst|fractal_dimension_worst|_c32|
+------+---------+-----------+------------

In [23]:
# Fixed seed so the random train/test split below -- and therefore the fitted
# coefficients and every reported metric -- is repeatable across runs.
SPLIT_SEED = 42

# Replace the original headers with positional names: id, diagnosis, then
# feature_1 .. feature_31. toDF() requires one name per column, so this
# expects the loaded frame to have exactly 33 columns.
columns = ['id', 'diagnosis'] + [f'feature_{i}' for i in range(1, 32)]
data = df.toDF(*columns)

# Map 'M' (malignant) to 1 and 'B' (benign) to 0
data = data.withColumn("label", (data["diagnosis"] == "M").cast("integer")).drop("diagnosis")

# Only feature_1 .. feature_24 are assembled into the model's input vector.
# NOTE(review): feature_25 .. feature_30 are left out of the model (and
# feature_31 is presumably the trailing, unnamed `_c32` column) -- confirm
# that excluding the last six measurement columns is intentional.
feature_columns = [f'feature_{i}' for i in range(1, 25)]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

data = assembler.transform(data)

# 80/20 train/test split, seeded for reproducibility.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [5]:
# Fit a logistic regression on the training split. Only the feature and label
# column names are specified; every other hyperparameter keeps its default.
logistic_regression = LogisticRegression(
    labelCol="label",
    featuresCol="features",
)
model = logistic_regression.fit(train_data)

In [16]:
# Report the fitted parameters of the logistic regression model.
coefficients = model.coefficients
intercept = model.intercept

# Join the rounded coefficients ourselves instead of unpacking them into
# print(..., sep=", "): with `sep`, the separator was also inserted right
# after the label, producing "Coefficients: , -4.008, ...".
formatted_coefficients = ", ".join(str(round(coefficient, 4)) for coefficient in coefficients)
print("Coefficients:", formatted_coefficients)
print("Intercept: {:.2f}".format(intercept))

Coefficients: , -4.008, 0.0045, -0.4551, 0.0187, 90.2372, -156.5367, 119.7819, 95.9678, 32.9891, 658.6888, 17.5873, -3.2028, -4.3962, 0.088, -153.4554, 104.2943, -165.8414, 738.2135, 209.7404, -1656.5333, 1.0116, 0.6981, 0.446, 0.0288
Intercept: -69.28


In [17]:
# Score the held-out split with the fitted model.
predictions = model.transform(test_data)

# AUC-ROC, computed from the raw prediction column.
evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
auc = evaluator.evaluate(predictions)

# Accuracy plus the "weighted" precision and recall variants, each obtained by
# overriding the evaluator's metricName for a single evaluate() call.
# NOTE(review): these are the weighted metrics, not the precision/recall of
# the malignant class alone -- confirm that is what should be reported.
multi_evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="label")
accuracy, precision, recall = (
    multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
)

print(
    f"AUC-ROC: {auc:.4f}",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    sep="\n",
)

AUC-ROC: 0.9775
Accuracy: 0.9683
Precision: 0.9685
Recall: 0.9683
