In [ ]:
// Directory holding the linear-regression data sets, located under the JVM temp dir.
val dataDir = {
  val tmpRoot = sys.props("java.io.tmpdir")
  s"$tmpRoot/data/linear_regression"
}

In [ ]:
// Load the djia.parquet file from dataDir as a DataFrame.
val data = sparkSession.read.parquet(s"$dataDir/djia.parquet")

In [ ]:
import org.apache.spark.ml.regression.LinearRegression

In [ ]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

In [ ]:
// Working frame: the raw data without the DWDP and MMM columns.
val frame = data.drop("DWDP", "MMM")

// Feature columns: everything except the regression target (JPM) and the date column.
val features = {
  val excluded = Set("JPM", "date")
  frame.columns.filterNot(excluded.contains)
}

In [ ]:
// Packs the selected feature columns into a single vector column named "features".
val assembler =
  new VectorAssembler()
    .setOutputCol("features")
    .setInputCols(features)

In [ ]:
// Base estimator; regParam and elasticNetParam are filled in later by the parameter grid.
val lr = new LinearRegression().setMaxIter(10)

In [ ]:
// Chronological split: rows dated before the cutoff train the model, the rest test it.
val (training, test) = {
  val cutoff = "2018-01-01"
  (frame.where($"date" < cutoff), frame.where($"date" >= cutoff))
}

In [ ]:
// Random 4:1 (80% / 20%) split; randomSplit normalises the weights.
// A fixed seed makes the split, and therefore every metric computed later, reproducible.
// NOTE: this rebinds `training` / `test` from the date-based split cell; run one or the other.
val Array(training, test) = frame.randomSplit(Array(4.0, 1.0), seed = 42L)

In [ ]:
// Row counts of the two splits, as (training, test).
(training.count(), test.count())

In [ ]:
// Apply the same preparation to both splits: assemble the feature vector,
// keep only date / target / features, and rename the target JPM to "label".
val Seq(mlTrain, mlTest) = Seq(training, test).map { split =>
  assembler
    .transform(split)
    .select("date", "JPM", "features")
    .withColumnRenamed("JPM", "label")
}

In [ ]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}

In [ ]:
// Hyper-parameter grid: 5 regularisation strengths x 4 elastic-net mixing values = 20 candidates.
val paramGrid = {
  val regStrengths = Array(10.0, 1.0, 0.1, 0.01, 0.001)
  val elasticNetMix = Array(0.0, 0.33, 0.66, 1.0)

  new ParamGridBuilder()
    .addGrid(lr.regParam, regStrengths)
    .addGrid(lr.elasticNetParam, elasticNetMix)
    .build()
}


In [ ]:
// Model selection over paramGrid with a single train/validation split of the input data.
val trainValidationSplit = new TrainValidationSplit()
  .setEstimator(lr)
  .setEvaluator(new RegressionEvaluator())
  .setEstimatorParamMaps(paramGrid)
  // 75% of the data will be used for training and the remaining 25% for validation.
  .setTrainRatio(0.75)

In [ ]:
// Run the train/validation search on the prepared training set; `model` wraps the best fit found.
val model = trainValidationSplit.fit(mlTrain)

In [ ]:
// Validation metric obtained for each candidate parameter map.
// NOTE(review): RegressionEvaluator is used with its defaults (RMSE per the Spark docs) — confirm.
model.validationMetrics

In [ ]:
// Parameters of the winning model. The cast relies on the estimator being `lr`, a LinearRegression.
model.bestModel.asInstanceOf[LinearRegressionModel].extractParamMap()

In [ ]:
// Score the held-out test set with the best model; the result is a regression summary.
val evalModel = {
  val best = model.bestModel.asInstanceOf[LinearRegressionModel]
  best.evaluate(mlTest)
}

In [ ]:
// Root-mean-squared error of the best model on the test set.
evalModel.rootMeanSquaredError

In [ ]:
// `evaluate` returns a summary object, not a DataFrame, so `select` is not available on it;
// the scored rows are exposed through `.predictions`.
evalModel.predictions
  .select("features", "label", "prediction")
  .show()

In [ ]:
// Pull the (label, prediction) pairs for the test set back to the driver.
evalModel.predictions
  .select("label", "prediction")
  .collect()

In [ ]:
// Pull the residuals exposed by the evaluation summary back to the driver.
evalModel.residuals.collect()

* Training & validation (random)
* Training & validation (2017/2018)
* Binning volume
* Use binned volume as categories
* New model with volume as feature