In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

#  SparkSession 생성
spark = SparkSession.builder \
    .appName("adam") \
    .getOrCreate()

# 데이터를 원격 URL에서 다운로드하기 위해 addFile()을 사용하여 데이터 파일을 Spark 클러스터 노드로 복사
url = "https://raw.githubusercontent.com/pkmklong/Breast-Cancer-Wisconsin-Diagnostic-DataSet/master/data.csv"
spark.sparkContext.addFile(url)

# SparkFiles.get()를 사용하여 복사된 데이터 파일을 읽고, DataFrame으로 로드 
# 첫 2개의 행 출력
df = spark.read.csv(SparkFiles.get("data.csv"), header=True, inferSchema=True)
df.show(2)

# DataFrame의 컬럼명 변경
#id' 및 'diagnosis' 컬럼 외 'feature_1', 'feature_2', ... 형식으로 변경
columns = ['id', 'diagnosis'] + [f'feature_{i}' for i in range(1, 32)]
data = df.toDF(*columns)

# 'diagnosis' 컬럼의 'M'을 1로, 'B' (양성)을 0으로 정수 매핑 후, 'diagnosis' 컬럼은 제거
data = data.withColumn("label", (data["diagnosis"] == "M").cast("integer")).drop("diagnosis")

# 'feature_1'부터 'feature_24'까지의 컬럼을 선택하고, 이를 하나의 피처 벡터 컬럼으로 변환 후 'features' 컬럼에 저장
feature_columns = [f'feature_{i}' for i in range(1, 25)]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

# 학습, 테스트셋 분리
train_data, test_data = data.randomSplit([0.8, 0.2], seed=20230921)

+------+---------+-----------+------------+--------------+---------+---------------+----------------+--------------+-------------------+-------------+----------------------+---------+----------+------------+-------+-------------+--------------+------------+-----------------+-----------+--------------------+------------+-------------+---------------+----------+----------------+-----------------+---------------+--------------------+--------------+-----------------------+----+
|    id|diagnosis|radius_mean|texture_mean|perimeter_mean|area_mean|smoothness_mean|compactness_mean|concavity_mean|concave points_mean|symmetry_mean|fractal_dimension_mean|radius_se|texture_se|perimeter_se|area_se|smoothness_se|compactness_se|concavity_se|concave points_se|symmetry_se|fractal_dimension_se|radius_worst|texture_worst|perimeter_worst|area_worst|smoothness_worst|compactness_worst|concavity_worst|concave points_worst|symmetry_worst|fractal_dimension_worst|_c32|
+------+---------+-----------+------------

In [2]:
logistic_regression = LogisticRegression(featuresCol="features", labelCol="label")
print(logistic_regression.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [3]:
model = logistic_regression.fit(train_data)

In [4]:
# 절편 및 계수
print("coeff:", model.coefficients)
print("intercept:",model.intercept)

coeff: [-18.745836156912354,-37.61798013875791,-3.234477875921946,-0.16877842822122097,7445.113403326788,-83.2724078803222,3683.412854717869,5960.411307151161,1424.9497556518256,1004.8882626760784,1009.8576143004756,-558.7181204528758,14.155740791097887,4.138568010177129,12019.785085927231,2431.1566273670355,3088.167480690681,-4682.901772453213,-16922.129391524082,-98788.23438764102,60.668927195695005,88.88712346023758,6.351016407981151,0.44238213516090985]
intercept: -3954.2536865501384


In [5]:
# 요약
summary = model.summary
print("AUROC:",summary.areaUnderROC)
print("ACC:",summary.accuracy)

AUROC: 1.0
ACC: 1.0


In [6]:
# prediction 진행
predictions = model.transform(test_data)

# AUC-ROC는 BinaryClassificationEvaluator 에서 측정 가능
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
auc = evaluator.evaluate(predictions)

# Accuracy, Precision, and Recall는 MulticlassClassificationEvaluator에서 측정 가능
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
accuracy = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "accuracy"})
precision = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedPrecision"})
recall = multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: "weightedRecall"})

print(f"AUC-ROC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

AUC-ROC: 0.9849
Accuracy: 0.9426
Precision: 0.9426
Recall: 0.9426
