## Exemple de calcul de corrélation

In [1]:
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

// Five 4-dimensional sample vectors mixing the two storage formats:
//   dense  : every component of the vector is listed
//   sparse : only the non-zero entries are listed (vector size, then (index, value) pairs)
val data = Seq(
  Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
  Vectors.dense(4.0, 5.0, 0.0, 3.0),
  Vectors.dense(6.0, 7.0, 0.0, 8.0),
  Vectors.dense(6.0, 7.0, 1.0, 8.0),
  Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
)

// Wrap each vector in a one-element tuple so it becomes a single-column DataFrame.
val df = data.map(Tuple1(_)).toDF("features")

df.show()

// No method argument: Pearson correlation is used by default.
// The result is a one-row DataFrame whose only cell holds the correlation matrix.
val coeff1 = Correlation.corr(df, "features").head.getAs[Matrix](0)
println(s"Pearson correlation matrix:\n $coeff1")

// Alternative: request Spearman correlation explicitly.
val coeff2 = Correlation.corr(df, "features", "spearman").head.getAs[Matrix](0)
println(s"Spearman correlation matrix:\n $coeff2")

+--------------------+
|            features|
+--------------------+
|(4,[0,3],[1.0,-2.0])|
|   [4.0,5.0,0.0,3.0]|
|   [6.0,7.0,0.0,8.0]|
|   [6.0,7.0,1.0,8.0]|
| (4,[0,3],[9.0,1.0])|
+--------------------+

Pearson correlation matrix:
 1.0                  0.12367498475994064  0.15161960871578067  0.41287206669597387  
0.12367498475994064  1.0                  0.5019646441105268   0.9357441262904027   
0.15161960871578067  0.5019646441105268   1.0                  0.5598852584152163   
0.41287206669597387  0.9357441262904027   0.5598852584152163   1.0                  
Spearman correlation matrix:
 1.0                  0.16222142113076254  0.18136906252750293  0.36842105263157904  
0.16222142113076254  1.0                  0.5590169943749475   0.9733285267845753   
0.18136906252750293  0.5590169943749475   1.0                  0.5441071875825088   
0.36842105263157904  0.9733285267845753   0.5441071875825088   1.0                  


data = List((4,[0,3],[1.0,-2.0]), [4.0,5.0,0.0,3.0], [6.0,7.0,0.0,8.0], [6.0,7.0,1.0,8.0], (4,[0,3],[9.0,1.0]))
df = [features: vector]
coeff1 = 
coeff2 = 


1.0                  0.12367498475994064  0.15161960871578067  0.41287206669597387
0.12367498475994064  1.0                  0.5019646441105268   0.9357441262904027
0.15161960871578067  0.5019646441105268   1.0                  0.5598852584152163
0.41287206669597387  0.9357441262904027   0.5598852584152163   1.0
1.0                  0.16222142113076254  0.181369062527502...


## Exemple de test d'hypothèse

In [3]:
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest

// Caution: both the label and the features must be categorical (a finite number of values).
// Here the label has 2 classes; the first feature has 3 classes and the second has 4.

// The test is run for each feature against the label.
// The degrees of freedom are computed automatically.

val data = Seq(
  (0.0, Vectors.dense(0.5, 10.0)),
  (0.0, Vectors.dense(1.5, 20.0)),
  (1.0, Vectors.dense(1.5, 30.0)),
  (0.0, Vectors.dense(3.5, 30.0)),
  (0.0, Vectors.dense(3.5, 40.0)),
  (1.0, Vectors.dense(3.5, 40.0))
)

val df = data.toDF("label", "features")

// Single result row read by position: 0 = p-values, 1 = degrees of freedom, 2 = statistics.
val chi = ChiSquareTest.test(df, "features", "label").head

// Build the three report lines first, then print them in order.
Seq(
  s"pValues = ${chi.getAs[Vector](0)}",
  s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[", ",", "]")}",
  s"statistics ${chi.getAs[Vector](2)}"
).foreach(println)

pValues = [0.6872892787909721,0.6822703303362126]
degreesOfFreedom [2,3]
statistics [0.75,1.5]


data = List((0.0,[0.5,10.0]), (0.0,[1.5,20.0]), (1.0,[1.5,30.0]), (0.0,[3.5,30.0]), (0.0,[3.5,40.0]), (1.0,[3.5,40.0]))
df = [label: double, features: vector]
chi = [[0.6872892787909721,0.6822703303362126],WrappedArray(2, 3),[0.75,1.5]]


[[0.6872892787909721,0.6822703303362126],WrappedArray(2, 3),[0.75,1.5]]

## Résumé statistique

In [8]:
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.Summarizer

// Two sample rows of (features, weight).
val data = Seq(
  (Vectors.dense(2.0, 3.0, 5.0), 1.0),
  (Vectors.dense(4.0, 6.0, 7.0), 2.0)
)

val df = data.toDF("features", "weight")

// Available metrics: mean, variance, count, numNonZeros, max, min, normL1, normL2

// Weighted computation: the "weight" column is passed to the summarizer together with
// the features, and both requested metrics come back in one "summary" struct column.
val (meanVal, varianceVal) = df
  .select(
    Summarizer.metrics("mean", "variance")
      .summary($"features", $"weight").as("summary"))
  .select("summary.mean", "summary.variance")
  .as[(Vector, Vector)].first()

println(s"with weight: mean = ${meanVal}, variance = ${varianceVal}")

// Unweighted computation: the weight column is not passed to the summarizer.
val (meanVal2, varianceVal2) = df
  .select(Summarizer.mean($"features"), Summarizer.variance($"features"))
  .as[(Vector, Vector)].first()

// The second value is the variance (from Summarizer.variance); the label used to read "sum".
println(s"without weight: mean = ${meanVal2}, variance = ${varianceVal2}")

with weight: mean = [3.333333333333333,5.0,6.333333333333333], variance = [2.0,4.5,2.0]
without weight: mean = [3.0,4.5,6.0], sum = [2.0,4.5,2.0]


data = List(([2.0,3.0,5.0],1.0), ([4.0,6.0,7.0],2.0))
df = [features: vector, weight: double]
meanVal = [3.333333333333333,5.0,6.333333333333333]
varianceVal = [2.0,4.5,2.0]
meanVal2 = [3.0,4.5,6.0]
varianceVal2 = [2.0,4.5,2.0]


[2.0,4.5,2.0]

## Exemple d'Estimator

In [4]:
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Row

// Training set: a handful of (label, features) rows.
val training = spark
  .createDataFrame(Seq(
    (1.0, Vectors.dense(0.0, 1.1, 0.1)),
    (0.0, Vectors.dense(2.0, 1.0, -1.0)),
    (0.0, Vectors.dense(2.0, 1.3, 1.0)),
    (1.0, Vectors.dense(0.0, 1.2, -0.5))
  ))
  .toDF("label", "features")

// LogisticRegression is an Estimator: calling fit() on it yields a model.
val lr = new LogisticRegression()
// Dump every parameter with its documentation and default value (if any).
println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n")

// First way to configure the estimator: setter methods on the instance itself.
lr.setMaxIter(10).setRegParam(0.01)

// Fit using whatever parameters are currently stored in lr.
val model1 = lr.fit(training)
// model1 is a Model (a Transformer produced by an Estimator), so the parameters used by
// fit() can be inspected through its parent. They are printed as (name: value) pairs,
// where each name is prefixed by the unique ID of this LogisticRegression instance.
println(s"Model 1 was fit using parameters: ${model1.parent.extractParamMap}")

// Second way to configure: a ParamMap, which can be built up in several ways.
val paramMap = ParamMap(lr.maxIter -> 20)
  .put(lr.maxIter, 30)  // Single (param, value) form; replaces the maxIter given just above.
  .put(lr.regParam -> 0.1, lr.threshold -> 0.55)  // Several param pairs in one call.

// ParamMaps can be merged with ++.
val paramMap2 = ParamMap(lr.probabilityCol -> "myProbability")  // Rename the output column.
val paramMapCombined = paramMap ++ paramMap2

// Fit a second model with the merged map.
// Values in paramMapCombined take precedence over anything set earlier via lr.set* methods.
val model2 = lr.fit(training, paramMapCombined)
println(s"Model 2 was fit using parameters: ${model2.parent.extractParamMap}")

// Held-out rows used to try the fitted model.
val test = spark
  .createDataFrame(Seq(
    (1.0, Vectors.dense(-1.0, 1.5, 1.3)),
    (0.0, Vectors.dense(3.0, 2.0, -0.1)),
    (1.0, Vectors.dense(0.0, 2.2, -1.5))
  ))
  .toDF("label", "features")

// Predict with Transformer.transform(); LogisticRegression.transform only uses the
// 'features' column. Because lr.probabilityCol was renamed above, model2 writes its
// probabilities to 'myProbability' rather than the usual 'probability' column.
model2.transform(test)
  .select("features", "label", "myProbability", "prediction")
  .collect()
  .foreach { row =>
    // Columns are read by position, in the order given to select() above.
    val features = row.getAs[Vector](0)
    val label = row.getDouble(1)
    val prob = row.getAs[Vector](2)
    val prediction = row.getDouble(3)
    println(s"($features, $label) -> prob=$prob, prediction=$prediction")
  }

LogisticRegression parameters:
 aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term (default: true)
labelCol: label column name (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. (undefined)
maxIter: maximum number of iterations (>= 0) (default: 100)
predictionCol: prediction column name (default: prediction)
probabilityCol: Column name for predicted class c

training = [label: double, features: vector]
lr = logreg_8376c45f74b7
model1 = LogisticRegressionModel: uid = logreg_8376c45f74b7, numClasses = 2, numFeatures = 3
paramMap = 
paramMap2 = 


{
	logreg_8376c45f74b7-maxIter: 30,
	logreg_8376c45f74b7-regParam: 0.1,
	logreg_8376c45f74b7-threshold: 0.55
}
{
	logreg_8376c45f74b7-probabilityCol: myProbability
}
paramMapCombined: org.apache.spark.ml.p...


## Utilisation d'un pipeline

In [9]:
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row

// Labeled training documents as (id, text, label) rows.
val training = spark
  .createDataFrame(Seq(
    (0L, "a b c d e spark", 1.0),
    (1L, "b d", 0.0),
    (2L, "spark f g h", 1.0),
    (3L, "hadoop mapreduce", 0.0)
  ))
  .toDF("id", "text", "label")

// The pipeline has three stages, applied in order: tokenizer -> hashingTF -> lr.
val tokenizer = new Tokenizer()
  .setOutputCol("words")
  .setInputCol("text")
// hashingTF reads the column the tokenizer writes, so the two stay wired together.
val hashingTF = new HashingTF()
  .setInputCol(tokenizer.getOutputCol)
  .setOutputCol("features")
  .setNumFeatures(1000)
val lr = new LogisticRegression()
  .setRegParam(0.001)
  .setMaxIter(10)
val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

// Train the whole pipeline on the training documents.
val model = pipeline.fit(training)

// Optionally persist the fitted pipeline (overwriting any previous copy).
model.write.overwrite().save("/tmp/spark-logistic-regression-model")

// The unfitted pipeline definition can be persisted as well.
pipeline.write.overwrite().save("/tmp/unfit-lr-model")

// Reload the fitted pipeline, as one would do in production.
val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

// Unlabeled test documents as (id, text) rows.
val test = spark
  .createDataFrame(Seq(
    (4L, "spark i j k"),
    (5L, "l m n"),
    (6L, "spark hadoop spark"),
    (7L, "apache hadoop")
  ))
  .toDF("id", "text")

// Score the test documents with the in-memory fitted model and print one line per row.
model.transform(test)
  .select("id", "text", "probability", "prediction")
  .collect()
  .foreach { row =>
    // Columns are read by position, in the order given to select() above.
    val id = row.getLong(0)
    val text = row.getString(1)
    val prob = row.getAs[Vector](2)
    val prediction = row.getDouble(3)
    println(s"($id, $text) --> prob=$prob, prediction=$prediction")
  }

(4, spark i j k) --> prob=[0.15964077387874118,0.8403592261212589], prediction=1.0
(5, l m n) --> prob=[0.8378325685476612,0.16216743145233875], prediction=0.0
(6, spark hadoop spark) --> prob=[0.06926633132976273,0.9307336686702373], prediction=1.0
(7, apache hadoop) --> prob=[0.9821575333444208,0.01784246665557917], prediction=0.0


training = [id: bigint, text: string ... 1 more field]
tokenizer = tok_8c192e7394b8
hashingTF = hashingTF_38ffd2141876
lr = logreg_574e2c90ae6a
pipeline = pipeline_ecddaccc9a07
model = pipeline_ecddaccc9a07
sameModel = pipeline_ecddaccc9a07
test = [id: bi...


[id: bi...