In [ ]:
// Directory holding the notebook's input files, located under the JVM temp directory.
val dataDir = {
  val tmpRoot = sys.props("java.io.tmpdir")
  s"$tmpRoot/data/linear_regression"
}

In [ ]:
// Load the djia.parquet dataset from the data directory into a DataFrame.
val data = {
  val parquetPath = s"$dataDir/djia.parquet"
  sparkSession.read.parquet(parquetPath)
}

In [ ]:
import org.apache.spark.ml.regression.LinearRegression

In [ ]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

In [ ]:
// Discard the DWDP and MMM columns, then treat every remaining column except
// the label ("JPM") and the "date" column as a model input.
val frame = data.drop("DWDP", "MMM")
val features = frame.columns.filterNot(name => name == "JPM" || name == "date")

In [ ]:
// Packs the columns listed in `features` into a single vector column named "features".
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(features)

In [ ]:
// Baseline estimator: predicts the "JPM" column with the L-BFGS solver,
// at most 10 iterations and a regularization parameter of 0.0.
val lr = new LinearRegression()
  .setLabelCol("JPM")
  .setSolver("l-bfgs")
  .setMaxIter(10)
  .setRegParam(0.0)

In [ ]:
// Chronological split: rows dated before 2018-01-01 are used for training,
// rows dated on or after it for testing.
val training = frame.where(frame("date") < "2018-01-01")
val test     = frame.where(frame("date") >= "2018-01-01")

In [ ]:
// Row counts of the two splits; each count() triggers a Spark job.
(training.count(), test.count())

In [ ]:
// Assemble the feature vector for each split and keep only the date, the label
// ("JPM") and the assembled "features" column.
val mlTrain = {
  val assembledTrain = assembler.transform(training)
  assembledTrain.select("date", "JPM", "features")
}
val mlTest = {
  val assembledTest = assembler.transform(test)
  assembledTest.select("date", "JPM", "features")
}

In [ ]:
// Fit the linear regression estimator on the training split.
val model = lr.fit(mlTrain)

In [ ]:
// Root mean squared error reported by the model's training summary.
model.summary.rootMeanSquaredError

In [ ]:
// Evaluate the fitted model on the held-out test split.
val evalModel = model.evaluate(mlTest)

In [ ]:
// Root mean squared error on the test split; compare with the training RMSE.
evalModel.rootMeanSquaredError

In [ ]:
// Bring the actual ("JPM") and predicted values for the test split to the driver.
evalModel.predictions.select("JPM", "prediction").collect()

In [ ]:
// Bring the residuals for the test split to the driver.
evalModel.residuals.collect()

* Training & validation (random)
* Training & validation (2017/2018)
* Binning volume
* Use binned volume as categories
* New model with volume as feature

In [ ]:
// Second estimator: same label, solver and iteration cap as the first one,
// but with the regularization parameter raised to 0.1.
val lr = new LinearRegression()
  .setLabelCol("JPM")
  .setSolver("l-bfgs")
  .setMaxIter(10)
  .setRegParam(0.1)

In [ ]:
// Refit on the training split using the `lr` estimator currently in scope.
val model = lr.fit(mlTrain)

In [ ]:
// Root mean squared error reported by the refitted model's training summary.
model.summary.rootMeanSquaredError

In [ ]:
// Evaluate the refitted model on the test split and report its RMSE.
val evalModel = model.evaluate(mlTest)
evalModel.rootMeanSquaredError