In [23]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

#  SparkSession 생성
spark = SparkSession \
    .builder \
    .appName("adam") \
    .getOrCreate()

# 외부 csv 데이터 불러온 후, 스키마 출력(inferSchema는 컬럼의 데이터 타입을 자동으로 추론)
data = spark.read.csv('./boston.csv', header=True, inferSchema=True)
data.printSchema()

# label인 medv를 제외한 컬럼을 가져오고 이를 하나의 피처 벡터 컬럼으로 변환 후 'features' 컬럼에 저장
feature_columns = data.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

# 학습, 테스트셋 분리
train_data, test_data = data.randomSplit([0.8, 0.2], seed=20230921)

root
 |-- _c0: integer (nullable = true)
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- black: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [16]:
lr = LinearRegression(featuresCol="features", labelCol="medv").setMaxIter(10).setElasticNetParam(0.8)
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber (default: 1.35)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: medv)
loss: The loss function to be optimized. Supported options: squaredError, huber. (default: squaredError)
maxBlockSizeInMB: maximum memory in MB for stacking input data into blocks. Data is stacked within partitions. If more than remaining data size in a partition then it is adjusted to the data size. Default 0.0 represents choosing optimal value, depends on specific algorithm. Must be >= 0. (default: 0.0)
maxIter: max nu

In [17]:
model = lr.fit(train_data)

In [18]:
# 절편 및 계수
print("coeff:", model.coefficients)
print("intercept:",model.intercept)

coeff: [-0.0011384965525373981,-0.11382731736982217,0.04345857667392836,-0.009590641423899147,2.7773521264664107,-16.706034331615328,3.6604896948821906,-0.004348133394135391,-1.4170988376642684,0.3340328560716146,-0.014958676488581353,-1.0041442068694604,0.007751474062520427,-0.44705086421922635]
intercept: 38.78168582742949


In [20]:
# 요약
summary = model.summary
summary.residuals.show()
print("total_iter:",summary.totalIterations)
print("RMSE:",summary.rootMeanSquaredError)
print("R2:",summary.r2)

+--------------------+
|           residuals|
+--------------------+
|   -5.93347367272095|
| -3.5602435194523423|
|    4.41400038252246|
|   4.824187881803212|
|     8.1714686396922|
|  3.2801858110954747|
| -0.6796235279944582|
|   6.617283915340149|
|   3.108602362798429|
| -0.9351824618219347|
|   -5.05181059652363|
| -3.2792364653870543|
|-0.19139056616458205|
|  0.8945962195035371|
|  0.6022852708869735|
| 0.26032333762736926|
|-0.33629364672044204|
|  0.2914959305512621|
| -1.1621981427636854|
| 0.05606782582302117|
+--------------------+
only showing top 20 rows

total_iter: 0
RMSE: 4.489037353393413
R2: 0.75226768239584




In [30]:
# prediction 진행
predictions = model.transform(test_data)

evaluator = RegressionEvaluator(labelCol="medv", predictionCol='prediction')
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"rmse: {rmse:.4f}")
print(f"mae: {mae:.4f}")
print(f"r2: {r2:.4f}")

rmse: 5.4116
mae: 3.3250
r2: 0.6783
