/
v1-numeric-features.scala
59 lines (39 loc) · 2.52 KB
/
v1-numeric-features.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/**
2021-09-18
https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
*/
import org.apache.spark.sql.DataFrame
// val rootPath: String = _
// Location of the Kaggle "House Prices" data set on the local machine.
val rootPath: String = "/Users/wulei/Raymond/git/spark-intro2/22 从房价预测开始/data/house-prices-advanced-regression-techniques"
val filePath: String = s"$rootPath/train.csv"

// Read the raw training CSV. Without schema inference every column is
// loaded as a string; the numeric cast happens in a later step.
val trainDF: DataFrame = spark.read
  .option("header", true)
  .csv(filePath)

// All numeric feature columns of the data set.
val numericFields: Array[String] = Array("LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea")

// The regression target.
val labelFields: Array[String] = Array("SalePrice")

// Project down to the numeric features plus the label column.
val selectedFields: DataFrame = trainDF.selectExpr((numericFields ++ labelFields): _*)
import org.apache.spark.sql.types.IntegerType
// FIX: `col` was used below without being in scope — spark-shell only
// auto-imports `spark.implicits._`, not `org.apache.spark.sql.functions._`.
import org.apache.spark.sql.functions.col

// Cast every selected column from string to integer: each column <name>
// is replaced by <name>Int (cells that fail the cast become null; they are
// later discarded by the assembler's "skip" policy). A foldLeft over the
// column names replaces the original `var` + for-loop mutation.
val typedFields: DataFrame = (numericFields ++ labelFields).foldLeft(selectedFields) {
  (df, field) => df.withColumn(s"${field}Int", col(field).cast(IntegerType)).drop(field)
}
import org.apache.spark.ml.feature.VectorAssembler

// Names of the casted feature columns (label excluded).
val features: Array[String] = numericFields.map(name => name + "Int").toArray

// Pack all numeric feature columns into a single vector column "features".
// "skip" drops any row that still contains an invalid (null) value.
val assembler = new VectorAssembler()
  .setInputCols(features)
  .setOutputCol("features")
  .setHandleInvalid("skip")

// Once assembled, the individual *Int feature columns are redundant;
// a single varargs drop replaces the original per-column drop loop.
val featuresAdded: DataFrame = assembler.transform(typedFields).drop(features: _*)
// FIX: seed the split so the train/test partition — and therefore the
// reported RMSE — is reproducible across runs (it was previously random
// on every execution, making the recorded RMSE below impossible to verify).
val Array(trainSet, testSet) = featuresAdded.randomSplit(Array(0.7, 0.3), seed = 42L)

import org.apache.spark.ml.regression.LinearRegression

// Plain (unregularized) linear regression over the assembled numeric features.
val lr = new LinearRegression()
  .setLabelCol("SalePriceInt")
  .setFeaturesCol("features")
  .setMaxIter(1000)
val lrModel = lr.fit(trainSet)

// In-sample fit quality.
val trainingSummary = lrModel.summary
println(s"Root Mean Squared Error (RMSE) on train data: ${trainingSummary.rootMeanSquaredError}")
// RMSE: 38288.77947156114

import org.apache.spark.ml.evaluation.RegressionEvaluator

// Out-of-sample evaluation on the held-out ~30%.
val predictions: DataFrame = lrModel.transform(testSet).select("SalePriceInt", "prediction")
val evaluator = new RegressionEvaluator()
  .setLabelCol("SalePriceInt")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
val rmse = evaluator.evaluate(predictions)
println("Root Mean Squared Error (RMSE) on test data = " + rmse)