In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

#  SparkSession 생성
spark = SparkSession \
    .builder \
    .appName("adam") \
    .getOrCreate()

# 외부 csv 데이터 불러온 후, 스키마 출력(inferSchema는 컬럼의 데이터 타입을 자동으로 추론)
data = spark.read.csv('./boston.csv', header=True, inferSchema=True)
data.printSchema()

# label인 medv를 제외한 컬럼을 가져오고 이를 하나의 피처 벡터 컬럼으로 변환 후 'features' 컬럼에 저장
feature_columns = data.columns[:-1]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

# 학습, 테스트셋 분리
train_data, test_data = data.randomSplit([0.8, 0.2], seed=20230921)

root
 |-- _c0: integer (nullable = true)
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- black: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [2]:
glr = GeneralizedLinearRegression(featuresCol="features", labelCol="medv") \
     .setFamily("gaussian") \
     .setLink("identity") \
     .setMaxIter(10)
print(glr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
family: The name of family which is a description of the error distribution to be used in the model. Supported options: gaussian (default), binomial, poisson, gamma and tweedie. (default: gaussian, current: gaussian)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: medv)
link: The name of link function which provides the relationship between the linear predictor and the mean of the distribution function. Supported options: identity, log, inverse, logit, probit, cloglog and sqrt. (current: identity)
linkPower: The index in the power link function. Only applicable to the Tweedie family. (undefined)
linkPredictionCol: link prediction (linear predictor) column name (undefined)
maxIter: max number of iterations (>= 0). (default: 25, current: 10)
offsetCol: The offset column nam

In [3]:
model = glr.fit(train_data)

In [4]:
# 절편 및 계수
print("coeff:", model.coefficients)
print("intercept:",model.intercept)

coeff: [-0.0011384965525373981,-0.11382731736982217,0.04345857667392836,-0.009590641423899147,2.7773521264664107,-16.706034331615328,3.6604896948821906,-0.004348133394135391,-1.4170988376642684,0.3340328560716146,-0.014958676488581353,-1.0041442068694604,0.007751474062520427,-0.44705086421922635]
intercept: 38.78168582742949


In [12]:
# prediction 진행
predictions = model.transform(test_data)

evaluator = RegressionEvaluator(labelCol="medv", predictionCol='prediction')
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print(f"rmse: {rmse:.4f}")
print(f"mae: {mae:.4f}")
print(f"r2: {r2:.4f}")

rmse: 5.4116
mae: 3.3250
r2: 0.6783
