In [ ]:
// Directory holding this notebook's data files, located under the JVM temp directory.
val dataDir = {
  val tmpRoot = sys.props("java.io.tmpdir")
  s"$tmpRoot/data/linear_regression"
}

In [ ]:
// Load djia.parquet from the data directory into a DataFrame.
val data = {
  val djiaPath = s"$dataDir/djia.parquet"
  sparkSession.read.parquet(djiaPath)
}

In [ ]:
// Mean and variance of the AAPL column, computed in one aggregation over `data`.
data.agg(mean("AAPL"), variance("AAPL"))


In [ ]:
import org.apache.spark.ml.regression.LinearRegression

In [ ]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

In [ ]:
// Summary statistics for `data`; called with no column names, so Spark chooses the columns.
data.describe()

### 1 feature : AAPL

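We first try a model with just one feature, AAPL. Note that the next cell, as written, uses every remaining column as a feature; switch to the commented-out `Array("AAPL")` line to get the one-feature model.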

In [ ]:
// Remove the columns that are not used in this model.
val frame = data.drop("date", "DWDP", "MMM")
// Single-feature alternative:
//val features = Array("AAPL")
// Every remaining column except JPM is used as a feature.
val features = frame.columns.filterNot(_ == "JPM")

In [ ]:

// Assembler that packs the columns listed in `features` into one vector column named "features".
val assembler = new VectorAssembler().setInputCols(features).setOutputCol("features")

// Keep only the JPM column and the assembled feature vector.
val mlInput = {
  val assembled = assembler.transform(frame)
  assembled.select("JPM", "features")
}


In [ ]:
// Print the first rows of the model input (explicit parentheses: this call is side-effecting).
mlInput.show()

In [ ]:
// Linear regression estimator: JPM is the label column, the L-BFGS solver is
// capped at 10 iterations, and the regularisation parameter is zero.
val lr = new LinearRegression()
  .setLabelCol("JPM")
  .setSolver("l-bfgs")
  .setMaxIter(10)
  .setRegParam(0.0)
  // .setElasticNetParam(0.8)

In [ ]:
// Fit the estimator on `mlInput`, which was assembled from the whole `frame` (no hold-out set).
val model = lr.fit(mlInput)

In [ ]:
// Intercept of the fitted model.
model.intercept

In [ ]:
// Coefficients of the fitted model.
model.coefficients

In [ ]:
// Residuals from the training summary, collected to the driver.
model.summary.residuals.collect

In [ ]:
// Root mean squared error reported by the training summary.
model.summary.rootMeanSquaredError

* Training & validation (random split)
* Training & validation (2017/2018)
* Binning volume
* Use binned volume as categories
* New model with volume as a feature

### Random split for validation

In [ ]:
// "date" is kept in this frame (only DWDP and MMM are dropped) so that later
// cells can select and order by it.
val frame = data.drop("DWDP", "MMM")

// 70/30 random split. A fixed seed is passed so that the split, and every
// metric computed from it, is repeatable from one run to the next instead of
// changing on each execution.
val Array(training, validation) = frame.randomSplit(Array(0.7, 0.3), 42L)

// Feature columns for the split-based model.
// NOTE: an assembler created before this cell keeps the column list it was
// given; only an assembler built after this point sees this value.
val features = Array("AAPL")
// All-columns alternative. This frame still contains "date", so it has to be
// excluded together with the JPM label column.
//val features = frame.columns.filterNot(Set("JPM", "date"))

In [ ]:
// Build a fresh assembler from the current `features`. The `assembler` created
// earlier in the notebook was configured with the previous feature list, so
// reusing it would silently ignore the `features` defined for this section.
val splitAssembler = new VectorAssembler()
  .setInputCols(features)
  .setOutputCol("features")

// Keep the date, the JPM label column and the assembled feature vector for each split.
val mlTrain = splitAssembler.transform(training).select("date", "JPM", "features")
val mlValid = splitAssembler.transform(validation).select("date", "JPM", "features")

In [ ]:
// Refit the same estimator on the training split only; this rebinds `model`.
val model = lr.fit(mlTrain)

In [ ]:
// Residual (JPM minus prediction) for each validation row, ordered by date
// and collected to the driver.
model
  .transform(mlValid)
  .orderBy("date")
  .select($"date", ($"JPM" - $"prediction").as("residual"))
  .collect()

In [ ]:
// Evaluate the fitted model on the validation split; the returned summary is inspected in the cells below.
val evalModel = model.evaluate(mlValid)

In [ ]:
// Root mean squared error on the validation split.
evalModel.rootMeanSquaredError

In [ ]:
// Residuals on the validation split, collected to the driver.
evalModel.residuals.collect

In [ ]:
// Date, JPM value and prediction for each validation row.
evalModel.predictions.select("date","JPM","prediction").collect

In [ ]:
// Same rows as the previous cell, without the date column.
evalModel.predictions.select("JPM","prediction").collect