In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, Row}
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.attribute.AttributeGroup


Intitializing Scala interpreter ...

Spark Web UI available at http://xavier-linux.home:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1575641417407)
SparkSession available as 'spark'


import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, Row}
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.attribute.AttributeGroup


## Configuration de SparkSession

In [2]:
val conf = new SparkConf().setAll(Map(
      "spark.scheduler.mode" -> "FIFO",
      "spark.speculation" -> "false",
      "spark.reducer.maxSizeInFlight" -> "48m",
      "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
      "spark.kryoserializer.buffer.max" -> "1g",
      "spark.shuffle.file.buffer" -> "32k",
      "spark.default.parallelism" -> "12",
      "spark.sql.shuffle.partitions" -> "12"
    ))

conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@4fa6acbb


In [3]:
 val spark = SparkSession
      .builder
      .config(conf)
      .appName("TP Spark : Preprocessor")
      .getOrCreate()

import spark.implicits._  // to use the symbol $

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@668f7271
import spark.implicits._


## Chargement du DataFrame

In [4]:
val df: DataFrame = spark
            .read
            .option("header", true)
            .option("inferSchema", "true")
            .parquet("../data/prepared_trainingset/")

println(s"Nombre de lignes: ${df.count}")
println(s"Nombre de colonnes: ${df.columns.length}")

Nombre de lignes: 107614
Nombre de colonnes: 14


df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]


In [5]:
df.show(5)

+--------------+--------------------+--------------------+------+--------------------+------------+--------+---------+-------------------+-------------------+-------------------+-------------+-----------+--------------------+
|    project_id|                name|                desc|  goal|            keywords|final_status|country2|currency2|          deadline2|        created_at2|       launched_at2|days_campaign|hours_prepa|                text|
+--------------+--------------------+--------------------+------+--------------------+------------+--------+---------+-------------------+-------------------+-------------------+-------------+-----------+--------------------+
| kkst471421639|american options ...|looking to create...|100000|american-options-...|           0|      US|      USD|2014-11-15 17:31:27|2014-10-10 21:23:58|2014-10-16 17:31:27|           30|    140.125|american options ...|
|kkst1098019088|iheadbones bone c...|wireless bluetoot...| 20000|iheadbones-bone-c...|          

## Utilisation des données textuelles

In [6]:
// Stage 1 : récupérer les mots des textes
val tokenizer = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")

//val words = tokenizer.transform(df)

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_b7a8bd700ae5


In [7]:
// Stage 2 : retirer les stop words (liste : StopWordsRemover.loadDefaultStopWords("english"))
val stopWordsRemover = new StopWordsRemover()
    .setInputCol("tokens")
    .setOutputCol("text_filtered")

//val words2 = stopWordsRemover.transform(words)

stopWordsRemover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_4f77f05faec9


In [8]:
// Stage 3 : computer la partie TF
val cvModel: CountVectorizer = new CountVectorizer()//Model(Array("a", "b", "c"))
    .setInputCol("text_filtered")
    .setOutputCol("cv_features")
    //.setVocabSize(3)
    //.setMinDF()

//val words3 = cvModel.fit(words2).transform(words2)

cvModel: org.apache.spark.ml.feature.CountVectorizer = cntVec_39174728fc19


In [9]:
// Stage 4 : computer la partie IDF
val idf = new IDF()
    .setInputCol("cv_features")
    .setOutputCol("tfidf")

//val df2: DataFrame = idf.fit(words3).transform(words3)

idf: org.apache.spark.ml.feature.IDF = idf_54e6d0bc5b9d


## Conversion des variables catégorielles en variables numériques

In [10]:
//Stage 5 : convertir country2 en quantités numériques
val indexer_country = new StringIndexer()
    .setInputCol("country2")
    .setOutputCol("country_indexed")
    .setHandleInvalid("keep")

//val df4: DataFrame = indexer.fit(df3).transform(df3)

indexer_country: org.apache.spark.ml.feature.StringIndexer = strIdx_0f55843547f7


In [11]:
//Stage 6 : convertir currency2 en quantités numériques
val indexer_currency = new StringIndexer()
    .setInputCol("currency2")
    .setOutputCol("currency_indexed")

//val df5: DataFrame = indexer.fit(df4).transform(df4)

indexer_currency: org.apache.spark.ml.feature.StringIndexer = strIdx_de6f211f8ae9


In [12]:
// Stages 7 et 8: One-Hot encoder ces deux catégories
val encoder = new OneHotEncoderEstimator()
    .setInputCols(Array("country_indexed", "currency_indexed"))
    .setOutputCols(Array("country_onehot", "currency_onehot"))

//val df7: DataFrame = encoder.fit(df6).transform(df6)

encoder: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_1571356236c6


## Mise des données sous une forme utilisable par Spark.ML

In [13]:
val assembler = new VectorAssembler()
  .setInputCols(Array("tfidf", "days_campaign", "hours_prepa", "goal", "country_onehot", "currency_onehot"))
  .setOutputCol("features")

//val df9 = assembler.transform(df8)

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_d71a17a416ac


## Création du modèle de classification

In [14]:
// Stage 10 : créer/instancier le modèle de classification
val lr = new LogisticRegression()
  .setElasticNetParam(0.0)
  .setFitIntercept(true)
  .setFeaturesCol("features")
  .setLabelCol("final_status")
  .setStandardization(true)
  .setPredictionCol("predictions")
  .setRawPredictionCol("raw_predictions")
  .setThresholds(Array(0.7, 0.3))
  .setTol(1.0e-6)
  .setMaxIter(20)

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_3aaaa95f5d35


## Création du pipeline

In [15]:
val pipeline = new Pipeline()
    .setStages(Array(tokenizer, stopWordsRemover, 
                     cvModel, idf, indexer_country, indexer_currency,
                     encoder, assembler, lr))

pipeline: org.apache.spark.ml.Pipeline = pipeline_49b093d47b52


## Entraînement, test, et sauvegarde du modèle

In [16]:
val Array(training, test) = df.randomSplit(Array(0.9, 0.1), seed=261)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]


In [17]:
val model = pipeline.fit(training)

model: org.apache.spark.ml.PipelineModel = pipeline_49b093d47b52


In [18]:
// On enregistre le modèle entraîné
model.write.overwrite().save("../data/model/spark-logistic-regression-model")

In [19]:
val dfWithSimplePredictions = model.transform(test)

dfWithSimplePredictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [20]:
dfWithSimplePredictions.groupBy("final_status", "predictions").count.show()

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1812|
|           0|        1.0| 2328|
|           1|        1.0| 1590|
|           0|        0.0| 5036|
+------------+-----------+-----+



In [21]:
val evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("final_status")
    .setPredictionCol("predictions")
    .setMetricName("f1")
val f1_score = evaluator.evaluate(dfWithSimplePredictions)

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_a5821d68849e
f1_score: Double = 0.6220287782615461


## Réglage des hyper-paramètres du modèle

Documentation : https://spark.apache.org/docs/2.2.0/ml-tuning.html

In [22]:
val paramGrid = new ParamGridBuilder()
    .addGrid(cvModel.minDF, Array(55.0, 75.0, 95.0))
    .addGrid(lr.elasticNetParam, Array(10e-8, 10e-6, 10e-4, 10e-2))
    .build()

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_3aaaa95f5d35-elasticNetParam: 1.0E-7,
	cntVec_39174728fc19-minDF: 55.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 1.0E-7,
	cntVec_39174728fc19-minDF: 75.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 1.0E-7,
	cntVec_39174728fc19-minDF: 95.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 1.0E-5,
	cntVec_39174728fc19-minDF: 55.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 1.0E-5,
	cntVec_39174728fc19-minDF: 75.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 1.0E-5,
	cntVec_39174728fc19-minDF: 95.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 0.001,
	cntVec_39174728fc19-minDF: 55.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 0.001,
	cntVec_39174728fc19-minDF: 75.0
}, {
	logreg_3aaaa95f5d35-elasticNetParam: 0.001,
	cntVec_39174728fc1...

In [23]:
val validation = new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.7)

validation: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_9a1a95e2e5b1


In [24]:
val model_improved = validation.fit(training)

model_improved: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_9a1a95e2e5b1


In [25]:
val dfWithPredictions = model_improved.transform(test)

dfWithPredictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [26]:
dfWithPredictions.groupBy("final_status", "predictions").count.show()

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1081|
|           0|        1.0| 2845|
|           1|        1.0| 2321|
|           0|        0.0| 4519|
+------------+-----------+-----+



In [27]:
val f1_score = evaluator.evaluate(dfWithPredictions)

f1_score: Double = 0.6480627333380333


In [28]:
// On enregistre le modèle entraîné
model_improved.write.overwrite().save("../data/model/spark-logistic-regression-model-improved")

## Test du modèle sur les données cleanées précédemment

In [29]:
val df_cleaned: DataFrame = spark
            .read
            .option("header", true)
            .option("inferSchema", "true")
            .parquet("../data/dataframe/")

println(s"Nombre de lignes: ${df_cleaned.count}")
println(s"Nombre de colonnes: ${df_cleaned.columns.length}")
val Array(training_cleaned, test_cleaned) = df_cleaned.randomSplit(Array(0.9, 0.1), seed=261)

Nombre de lignes: 108129
Nombre de colonnes: 10


df_cleaned: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 8 more fields]
training_cleaned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 8 more fields]
test_cleaned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 8 more fields]


In [30]:
//Stage 5 : convertir country en quantités numériques
val indexer_country_cleaned = new StringIndexer()
    .setInputCol("country")
    .setOutputCol("country_indexed")
    .setHandleInvalid("keep")

//Stage 6 : convertir currency en quantités numériques
val indexer_currency_cleaned = new StringIndexer()
    .setInputCol("currency")
    .setOutputCol("currency_indexed")

val pipeline_cleaned = new Pipeline()
    .setStages(Array(tokenizer, stopWordsRemover, 
                     cvModel, idf, indexer_country_cleaned, indexer_currency_cleaned,
                     encoder, assembler, lr))

val validation_cleaned = new TrainValidationSplit()
    .setEstimator(pipeline_cleaned)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.7)

indexer_country_cleaned: org.apache.spark.ml.feature.StringIndexer = strIdx_9c781dc4f477
indexer_currency_cleaned: org.apache.spark.ml.feature.StringIndexer = strIdx_9eecb1d2b73c
pipeline_cleaned: org.apache.spark.ml.Pipeline = pipeline_87623081799f
validation_cleaned: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_43dabc727c7e


In [31]:
val model_improved_training_cleaned = validation_cleaned.fit(training_cleaned)
val dfCleanedWithPredictions = model_improved_training_cleaned.transform(test_cleaned)

model_improved_training_cleaned: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_43dabc727c7e
dfCleanedWithPredictions: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 20 more fields]


In [32]:
dfCleanedWithPredictions.groupBy("final_status", "predictions").count.show()
print(s"f1-score du modèle cleané: ${evaluator.evaluate(dfCleanedWithPredictions)}")

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1060|
|           0|        1.0| 2849|
|           1|        1.0| 2445|
|           0|        0.0| 4459|
+------------+-----------+-----+

f1-score du modèle cleané: 0.6500313713545759

## Amélioration du modèle sur les données cleanées précédemment

In [60]:
val df2: DataFrame = spark
            .read
            .option("header", true)
            .option("inferSchema", "true")
            .parquet("../data/dataframe/")

println(s"Nombre de lignes: ${df2.count}")
println(s"Nombre de colonnes: ${df2.columns.length}")
val Array(training2, test2) = df2.randomSplit(Array(0.9, 0.1), seed=261)

Nombre de lignes: 108129
Nombre de colonnes: 10


df2: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 8 more fields]
training2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 8 more fields]
test2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 8 more fields]


In [61]:
// Stage 1 : récupérer les mots des textes
val tokenizer2 = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")

// Stage 2 : retirer les stop words (liste : StopWordsRemover.loadDefaultStopWords("english"))
val stopWordsRemover2 = new StopWordsRemover()
    .setInputCol("tokens")
    .setOutputCol("text_filtered")

// Stage 3 : computer la partie TF
val cvModel2: CountVectorizer = new CountVectorizer()//Model(Array("a", "b", "c"))
    .setInputCol("text_filtered")
    .setOutputCol("cv_features")
    //.setVocabSize(3)
    //.setMinDF()

// Stage 4 : computer la partie IDF
val idf2 = new IDF()
    .setInputCol("cv_features")
    .setOutputCol("tfidf")

//Stage 5 : convertir country2 en quantités numériques
val indexer_country2 = new StringIndexer()
    .setInputCol("country")
    .setOutputCol("country_indexed")
    .setHandleInvalid("skip")

//Stage 6 : convertir currency2 en quantités numériques
val indexer_currency2 = new StringIndexer()
    .setInputCol("currency")
    .setOutputCol("currency_indexed")
    .setHandleInvalid("skip")

// Stages 7 et 8: One-Hot encoder ces deux catégories
val encoder2 = new OneHotEncoderEstimator()
    .setInputCols(Array("country_indexed", "currency_indexed"))
    .setOutputCols(Array("country_onehot", "currency_onehot"))

val assembler2 = new VectorAssembler()
    .setInputCols(Array("tfidf", "days_campaign", "hours_prepa", "launched_month", "length_desc", "goal", "country_onehot", "currency_onehot"))
    .setOutputCol("features")
    .setHandleInvalid("skip")

// Stage 10 : créer/instancier le modèle de classification
val lr2 = new LogisticRegression()
    .setElasticNetParam(0.0)
    .setFitIntercept(true)
    .setFeaturesCol("features")
    .setLabelCol("final_status")
    .setStandardization(true)
    .setPredictionCol("predictions")
    .setRawPredictionCol("raw_predictions")
    .setThresholds(Array(0.7, 0.3))
    .setTol(1.0e-6)
    .setMaxIter(20)

val pipeline2 = new Pipeline()
    .setStages(Array(tokenizer2, stopWordsRemover2, 
                     cvModel2, idf2, indexer_country2, indexer_currency2,
                     encoder2, assembler2, lr2))

/*val paramGrid2 = new ParamGridBuilder()
    .addGrid(cvModel2.minDF, Array(5.0, 15.0, 25.0, 35.0, 45.0, 55.0, 75.0, 95.0))
    .addGrid(lr2.elasticNetParam, Array(10e-8, 10e-6, 10e-4, 10e-2))
    .build()*/

val paramGrid2 = new ParamGridBuilder()
    .addGrid(cvModel2.minDF, Array(5.0, 15.0))
    .addGrid(lr2.elasticNetParam, Array(10e-8, 10e-6, 10e-4, 10e-2))
    .build()

val evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("final_status")
    .setPredictionCol("predictions")
    .setMetricName("f1")

val validation2 = new TrainValidationSplit()
    .setEstimator(pipeline2)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid2)
    .setTrainRatio(0.7)

tokenizer2: org.apache.spark.ml.feature.RegexTokenizer = regexTok_4821ea770a36
stopWordsRemover2: org.apache.spark.ml.feature.StopWordsRemover = stopWords_e7b133ba8724
cvModel2: org.apache.spark.ml.feature.CountVectorizer = cntVec_ed60d64a5159
idf2: org.apache.spark.ml.feature.IDF = idf_c490c5b4cd48
indexer_country2: org.apache.spark.ml.feature.StringIndexer = strIdx_6b7f1799aaef
indexer_currency2: org.apache.spark.ml.feature.StringIndexer = strIdx_c928c9efa268
encoder2: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_9c369cfa159f
assembler2: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_f2b3fa3cf1f3
lr2: org.apache.spark.ml.classification.LogisticRegression = logreg_7d1c3e4f75cb
pipeline2: org.apache.spark.ml.Pipeline = pipeline_4363083abfab
paramGrid2: ...

In [62]:
val model2 = validation2.fit(training2)
val df2 = model2.transform(test2)

org.apache.spark.SparkException:  Exception thrown in awaitResult:

In [54]:
df2.count //108129

res11: Long = 10769


In [36]:
df2.groupBy("final_status", "predictions").count.show()
print(s"f1-score du modèle cleané: ${evaluator.evaluate(df2)}")

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1084|
|           0|        1.0| 2739|
|           1|        1.0| 2421|
|           0|        0.0| 4569|
+------------+-----------+-----+

f1-score du modèle cleané: 0.6576354660559056

In [59]:
df2.groupBy("final_status", "predictions").count.show()
print(s"f1-score du modèle cleané: ${evaluator.evaluate(df2)}")

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1203|
|           0|        1.0| 2684|
|           1|        1.0| 2252|
|           0|        0.0| 4630|
+------------+-----------+-----+

f1-score du modèle cleané: 0.6505795545699076

## Explication du modèle

In [51]:
val model_lr = model2.bestModel.asInstanceOf[PipelineModel].stages.last.asInstanceOf[LogisticRegressionModel]
// Extract the attributes of the input (features)
val schema = model2.transform(test2).schema
val featureAttrs = AttributeGroup.fromStructField(schema(model_lr.getFeaturesCol)).attributes.get
val features = featureAttrs.map(_.name.get)

// Add "(Intercept)" to list of feature names if the model was fit with an intercept
val featureNames: Array[String] = if (model_lr.getFitIntercept) {
  Array("(Intercept)") ++ features
} else {
  features
}

// Get array of coefficients
val lrModelCoeffs = model_lr.coefficients.toArray
val coeffs = if (model_lr.getFitIntercept) {
  lrModelCoeffs ++ Array(model_lr.intercept)
} else {
  lrModelCoeffs
}
val coeffs_abs = coeffs.map(num => Math.abs(num))

model_lr: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_86f4a0152866, numClasses = 2, numFeatures = 5397
schema: org.apache.spark.sql.types.StructType = StructType(StructField(project_id,StringType,true), StructField(goal,DoubleType,true), StructField(country,StringType,true), StructField(currency,StringType,true), StructField(final_status,IntegerType,true), StructField(days_campaign,IntegerType,true), StructField(hours_prepa,DoubleType,true), StructField(text,StringType,true), StructField(tokens,ArrayType(StringType,true),true), StructField(text_filtered,ArrayType(StringType,true),true), StructField(cv_features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true), StructField(tfidf,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true), ...

In [45]:
// Print feature names & coefficients together
println("Feature\tCoefficient")
featureNames.zip(coeffs_abs).sortBy(_._2)(Ordering[Double].reverse).foreach { case (feature, coeff) =>
  println(s"$feature\t$coeff")
}

Feature	Coefficient
tfidf_30	2.6016066995801466
country_onehot_IE	2.5861991418289367
currency_onehot_DKK	0.7166991107777904
tfidf_1279	0.6663550223107313
country_onehot_NO	0.4360256300103373
tfidf_4351	0.40268669110338245
tfidf_2213	0.394899623208894
country_onehot_AU	0.34485782425037703
tfidf_4233	0.2563368826778249
tfidf_901	0.25461390442432613
tfidf_826	0.2540310279523193
goal	0.25330930870286894
country_onehot_DE	0.25330930870286894
tfidf_2015	0.24827678855813068
currency_onehot_AUD	0.2427185680160838
tfidf_5179	0.2339054609081266
tfidf_4261	0.17494020521939166
tfidf_45	0.17371476056328491
tfidf_5062	0.17132211449480017
tfidf_3164	0.1697579464995335
tfidf_4917	0.16881299957759446
tfidf_5271	0.16264556234719132
tfidf_2711	0.16164719169181788
tfidf_5145	0.16038838052396662
tfidf_2458	0.16008817556345303
tfidf_4140	0.158497212125889
country_onehot_DK	0.15761100760838692
tfidf_3855	0.1569886455946628
tfidf_4680	0.154032397998834
tfidf_4662	0.15340016052599
tfidf_3526	0.1515209200176526

tfidf_691	0.07452972093941254
tfidf_5069	0.07451630391767314
tfidf_1995	0.07428940622628569
tfidf_5186	0.07425598209885736
tfidf_561	0.07424550257072218
tfidf_1316	0.07422960117967578
tfidf_4084	0.07418080554418768
tfidf_606	0.07414176210673913
tfidf_5302	0.07413159426386194
tfidf_3843	0.07390789310402807
tfidf_5365	0.07383160823509123
tfidf_2718	0.07374848004663477
tfidf_3374	0.07362992996423523
tfidf_1208	0.07353045809523441
tfidf_4290	0.07352804386430331
tfidf_4377	0.07343871589068059
tfidf_5066	0.07337632279996252
tfidf_4457	0.0733226496074391
tfidf_5001	0.07318684397017607
tfidf_2617	0.07308693067031138
tfidf_3971	0.0730796940284052
tfidf_3790	0.07303850582031479
tfidf_4512	0.07297856081946398
tfidf_960	0.07276810457537491
tfidf_3949	0.07275248604837771
tfidf_3777	0.07246419930097966
tfidf_4497	0.07242349797740244
tfidf_4922	0.07234936769323932
tfidf_5017	0.07229618825345802
tfidf_4366	0.07225167503427433
tfidf_3695	0.07220478217162733
tfidf_2786	0.0721060571821797
tfidf_3528	0.07

tfidf_4883	0.05512628834563081
tfidf_5245	0.055112578701122295
tfidf_4864	0.05509050699622655
tfidf_2777	0.05508356016712499
tfidf_1691	0.05502280309063045
tfidf_3525	0.05498082082725906
tfidf_378	0.05491962917330748
tfidf_4309	0.054884323587500124
tfidf_4940	0.054868645750710245
tfidf_5221	0.05485889940047081
tfidf_1008	0.05483424974916612
tfidf_4628	0.05483043547704597
tfidf_5287	0.05475855676469272
tfidf_2824	0.05474952757370177
tfidf_4800	0.05473366113065697
tfidf_5023	0.05467400609799139
tfidf_4231	0.054660890052187244
tfidf_5158	0.05458938664055491
tfidf_32	0.05457557508060139
tfidf_1942	0.05457310636881132
tfidf_4272	0.0545367485020701
tfidf_4787	0.05453588131135229
tfidf_3165	0.05451418749616385
tfidf_5368	0.054491815900867345
tfidf_3032	0.05445846630408007
tfidf_4881	0.05443670666097874
tfidf_3280	0.054419052517857565
tfidf_157	0.05440293977461797
tfidf_4813	0.05439400216740259
tfidf_1134	0.05438048263957129
tfidf_2827	0.05435135072466659
tfidf_1987	0.05432717159087449
tfidf_4

tfidf_2999	0.04272441330123277
tfidf_2308	0.04270692277357762
tfidf_2974	0.042676383061900476
tfidf_2464	0.042647961596611696
tfidf_3572	0.0426392265791406
tfidf_4499	0.042635372499459076
tfidf_3744	0.042628958355236654
tfidf_2027	0.042574365481352384
tfidf_4973	0.042548972504895644
tfidf_2658	0.04254862711370076
tfidf_3314	0.04252102930620449
tfidf_1948	0.04251480088079591
tfidf_139	0.0425085745787831
tfidf_4709	0.042496321572704734
tfidf_3512	0.042480162566845246
tfidf_3657	0.04247879563546547
tfidf_1986	0.04247053761114113
tfidf_4236	0.042466999047860464
tfidf_3634	0.04244886869855785
tfidf_3109	0.04243036287977797
tfidf_1651	0.042426860999803945
tfidf_5326	0.04242031557213737
tfidf_4444	0.042409972940380423
tfidf_911	0.0423954443314007
tfidf_4437	0.04235765295970233
tfidf_2052	0.04227736984214428
tfidf_2236	0.04225272648533292
tfidf_2351	0.042233337354611496
tfidf_462	0.04212995163532807
tfidf_249	0.04209176695754839
tfidf_2799	0.04208750203952964
tfidf_3866	0.04208105020392395
tfi

tfidf_1969	0.038003451437855024
tfidf_3604	0.03800094305351879
tfidf_3933	0.03798708277796728
tfidf_2572	0.03791361616846386
tfidf_4555	0.037896595644567724
tfidf_2347	0.0378455082579147
tfidf_1308	0.037831366661562636
tfidf_4529	0.03780895246062387
tfidf_4620	0.03779501134262016
tfidf_2319	0.03778427309161248
tfidf_3277	0.037774295296884274
tfidf_934	0.03776982128169057
tfidf_2158	0.03773056725258961
tfidf_899	0.037670241155332546
tfidf_3699	0.03766983712526211
tfidf_3207	0.03761238778291406
tfidf_2111	0.03761174870219136
tfidf_5332	0.03758406076310866
tfidf_4496	0.03756109232664657
tfidf_3017	0.03755951368000278
tfidf_4942	0.03754393130250191
tfidf_4071	0.037523896087901014
tfidf_3664	0.0375198914554621
tfidf_1513	0.03751700959361274
tfidf_3692	0.03750002720681715
tfidf_3630	0.03749344618246071
tfidf_1451	0.03748924179463796
tfidf_4954	0.037488406626553374
tfidf_3097	0.037486688298971275
tfidf_2153	0.03748179166365345
tfidf_3976	0.03746754864344695
tfidf_3946	0.037461159682765845
tfi

tfidf_1981	0.031974609312790496
tfidf_4354	0.031959528842516204
tfidf_1988	0.031958801892950035
tfidf_4307	0.03195565610376389
tfidf_590	0.031950604681667154
tfidf_3947	0.03190754774435198
tfidf_1841	0.03190526286783412
tfidf_4669	0.03187725188313575
tfidf_3292	0.03185111653280088
tfidf_1552	0.03184582999282515
tfidf_3576	0.03183785268299482
tfidf_2314	0.03182828714125903
tfidf_1683	0.031801126479811336
tfidf_3609	0.03177633055275771
tfidf_1231	0.03177068830754478
tfidf_2166	0.03176791231028066
tfidf_4183	0.03174553617244146
tfidf_4859	0.03174313557201575
tfidf_941	0.03174036318814229
tfidf_4423	0.03172059924463268
tfidf_2071	0.03171606909049127
tfidf_3704	0.03167834603196874
tfidf_2455	0.03167710772624582
tfidf_2774	0.03167496898554703
tfidf_38	0.03166984586480741
tfidf_4708	0.0316622192241741
tfidf_2841	0.031658511578880444
tfidf_3732	0.031648474956847646
tfidf_94	0.03162805278835692
tfidf_4079	0.03162377719842503
tfidf_2915	0.03158798339969087
tfidf_3958	0.03157796493663272
tfidf_44

tfidf_3080	0.027832998228429252
tfidf_2515	0.027828940474226245
tfidf_732	0.027828145994826015
tfidf_3265	0.027819872393125538
tfidf_599	0.027806766422021685
tfidf_4981	0.027798732883058307
tfidf_2216	0.027796123489488272
tfidf_4786	0.027794042801738753
tfidf_771	0.027786825760573543
tfidf_1362	0.027784437808070515
tfidf_4144	0.02775741702248489
tfidf_275	0.027734241876768796
tfidf_2148	0.02772567499240359
tfidf_3571	0.027719578679376747
tfidf_1265	0.027714955708993974
tfidf_1511	0.027706151866160007
tfidf_3518	0.0276923445152255
tfidf_2446	0.027687130340508178
tfidf_3263	0.02765644787499627
tfidf_2842	0.02764981330440274
tfidf_4878	0.027649287908169402
tfidf_2131	0.02759256671904924
tfidf_1173	0.027580180667683814
tfidf_1207	0.027572858301325345
tfidf_4346	0.027568462500047918
tfidf_2479	0.027557040619404043
tfidf_937	0.027551782057720204
tfidf_4932	0.02751309309549014
tfidf_591	0.027507196485685576
tfidf_2439	0.02749119855909912
tfidf_4910	0.027483219462473465
tfidf_2026	0.0274824610

tfidf_1170	0.0218684856422233
tfidf_4375	0.021864493004188735
tfidf_4125	0.02185851989151787
tfidf_843	0.021845270104466033
tfidf_1154	0.02183242994071866
tfidf_4256	0.021829882434395716
tfidf_969	0.021806512219645142
tfidf_3154	0.021793820813229513
tfidf_4616	0.02177218783691674
tfidf_2093	0.021759779769792133
tfidf_2353	0.02175383677782606
tfidf_1676	0.021737319328427634
tfidf_2666	0.021711839123977294
tfidf_3982	0.02170087457025119
tfidf_2322	0.021699814634091884
tfidf_3146	0.021699314779008216
tfidf_2792	0.021690027393164425
tfidf_2539	0.021680080213105135
tfidf_335	0.021663561285536904
tfidf_1874	0.021640150513490185
tfidf_4798	0.02162527995975353
tfidf_4415	0.021622616642766188
tfidf_5262	0.021577372833365225
tfidf_5224	0.021573983666814275
tfidf_489	0.021565218512975902
tfidf_4403	0.02155300663124581
tfidf_1619	0.021539644496494075
tfidf_2077	0.021538683139729998
tfidf_972	0.021535996508187365
tfidf_927	0.0215331835779569
tfidf_3033	0.021527423513893692
tfidf_3972	0.021517864798

tfidf_5235	0.019341309328108326
tfidf_5200	0.019308721516924247
tfidf_1175	0.01930870887573192
tfidf_1818	0.019306193233337328
tfidf_943	0.01929456951986309
tfidf_2629	0.019293362059178004
tfidf_2682	0.019286832951597218
tfidf_1084	0.019281987759209314
tfidf_3236	0.019280267206371948
tfidf_1396	0.019269037638257974
tfidf_2735	0.019268539816352088
tfidf_610	0.019262141184170564
tfidf_3549	0.019260167819436295
tfidf_3125	0.019258455997113528
tfidf_3879	0.019252379545045908
tfidf_1068	0.019251544449944723
tfidf_5162	0.019225622636038196
tfidf_3574	0.01922067937508451
tfidf_1326	0.01921238475844252
tfidf_3515	0.019197840519331395
tfidf_2997	0.019195776022839357
tfidf_56	0.019192413017587288
tfidf_2878	0.019161825733543754
tfidf_1964	0.019160014363272117
tfidf_3420	0.01915156237360856
tfidf_2335	0.01914319433180275
tfidf_296	0.019136029811516166
tfidf_238	0.01913449515214552
tfidf_4021	0.019117110943183996
tfidf_3282	0.01910745221739192
tfidf_2014	0.01909955676312189
tfidf_274	0.01909774086

tfidf_697	0.016927191994455464
tfidf_2745	0.016894277119379374
tfidf_1235	0.016878323909011957
tfidf_546	0.016873500886904525
tfidf_2318	0.01687321270720016
tfidf_4636	0.016847641502910585
tfidf_849	0.01683220202070667
tfidf_5237	0.016830309699485
tfidf_342	0.016828818313858466
tfidf_3252	0.016824204649366875
tfidf_3386	0.01681870199868071
tfidf_5269	0.016793597642101515
tfidf_4278	0.016773073027307055
tfidf_1328	0.016762630577503032
tfidf_5255	0.01675787984121805
tfidf_847	0.016746663280714284
tfidf_3767	0.01673714365870853
tfidf_2385	0.01673644646721157
tfidf_4418	0.016726909022849196
tfidf_4912	0.01672473310377427
tfidf_2781	0.016701086714928728
tfidf_3709	0.01669910391946942
tfidf_2763	0.016691258863800133
tfidf_902	0.016679573206652164
tfidf_1255	0.016675208617984983
tfidf_1222	0.016673887078289693
tfidf_1011	0.0166700929152292
tfidf_2068	0.01666797900212876
tfidf_3426	0.016655835873639568
tfidf_5299	0.016652067863653173
tfidf_4707	0.01664696807871014
tfidf_280	0.01663815406866434

tfidf_3251	0.01485993489146607
tfidf_4500	0.014849708251264651
tfidf_2605	0.014845074101216896
tfidf_1730	0.014836292116832838
tfidf_622	0.0148326967934542
tfidf_4486	0.014831883288911569
tfidf_3637	0.014827151511016563
tfidf_703	0.01482486093089633
tfidf_411	0.014814217351136267
tfidf_2270	0.014812007986410924
tfidf_2659	0.014779715954254752
tfidf_1733	0.014772978847811728
tfidf_2946	0.014766372898740143
tfidf_2885	0.01476380888751773
tfidf_2474	0.014716270563510924
tfidf_2037	0.014714738046779167
tfidf_4315	0.014706253759705818
tfidf_3023	0.014705485875285744
tfidf_552	0.014684858755250831
tfidf_4719	0.014676146938775496
tfidf_2896	0.014671734867948795
tfidf_87	0.01464979072973435
tfidf_2145	0.014638016932516089
tfidf_639	0.014629716894389055
tfidf_2883	0.014604629860862037
tfidf_187	0.014603764837259383
tfidf_5096	0.014602235403949844
tfidf_1001	0.014601612291317245
tfidf_3606	0.014599151927662918
tfidf_1916	0.014591387059945452
tfidf_349	0.01458196046112623
tfidf_658	0.014578909422

tfidf_1574	0.012752597386190816
tfidf_1618	0.012752406681081935
tfidf_4607	0.012739283041490437
tfidf_3990	0.012736507689533648
tfidf_2835	0.012726966373996097
tfidf_1079	0.012722890788305496
tfidf_438	0.012721649924218701
tfidf_4131	0.012719437823031227
tfidf_264	0.012717900812529749
tfidf_4895	0.01271788277725262
tfidf_3491	0.012712931751601349
tfidf_3106	0.01269158517392742
tfidf_4317	0.012687978062618122
tfidf_1440	0.012680739545193689
tfidf_3194	0.012669035558689205
tfidf_4397	0.012667296903648748
tfidf_447	0.012664581934137345
tfidf_180	0.01266411176176765
tfidf_4548	0.012662420182240374
tfidf_2313	0.012662418007710915
tfidf_60	0.012658783675126207
tfidf_4352	0.012658640830270764
tfidf_4743	0.012657765756298294
tfidf_4487	0.01265587208549506
tfidf_5068	0.01263197403816964
tfidf_3826	0.012607227180990435
tfidf_3449	0.012602805001268027
tfidf_1048	0.012601950140572667
tfidf_2287	0.0125975255069217
tfidf_4618	0.012574892139402336
tfidf_1338	0.012574874370737206
tfidf_5003	0.01256906

tfidf_4337	0.010110841843319047
tfidf_2672	0.010074889675613333
tfidf_3878	0.010072532588808872
tfidf_3432	0.010062908344433694
tfidf_2853	0.010061834095735415
tfidf_3177	0.010058138806025197
tfidf_250	0.010053814947738278
tfidf_1391	0.010047392311352937
tfidf_2575	0.010044126334500204
tfidf_3217	0.010034598873619233
tfidf_259	0.010034099939807892
tfidf_2514	0.010025946094191369
tfidf_2538	0.010019945210186149
tfidf_2662	0.010019626091161922
tfidf_1974	0.010015291105190978
tfidf_3315	0.010013809195059115
tfidf_2235	0.01000717011018108
tfidf_968	0.009994686692472722
tfidf_633	0.009991395449476774
tfidf_5040	0.009990090325207642
tfidf_302	0.009988674590458874
tfidf_4552	0.00996547483664568
tfidf_242	0.00994929782268963
tfidf_2227	0.009947769317925351
tfidf_5347	0.009942178646404629
tfidf_3663	0.009926067602303874
tfidf_3594	0.009915500577283182
tfidf_2749	0.009912948670305757
tfidf_2128	0.00990498368365006
tfidf_2522	0.00990365928225741
tfidf_918	0.009903502493918656
tfidf_5169	0.0098976

tfidf_486	0.006685902136587756
tfidf_517	0.006677006354693895
tfidf_3198	0.006675840322371725
tfidf_102	0.006654877553925186
tfidf_2254	0.006643481888945781
tfidf_2645	0.006641856014460087
tfidf_1573	0.006624990204553217
tfidf_5241	0.00660329988415435
tfidf_1565	0.0066030951020175244
tfidf_6	0.006598525481330844
tfidf_2790	0.00659654629528078
tfidf_1756	0.006592530587111794
tfidf_2438	0.006592016001440262
tfidf_150	0.006591160505690756
tfidf_4659	0.0065876949793430865
tfidf_1845	0.0065830228620324905
tfidf_1748	0.0065821544970733345
tfidf_3573	0.0065761499984918065
tfidf_2673	0.0065690043958798105
tfidf_897	0.006560684794302047
tfidf_3997	0.006559160365903608
tfidf_2960	0.006553916677689331
tfidf_3317	0.006547873296495943
tfidf_2047	0.0065442237577425155
tfidf_5080	0.0065428605092787626
tfidf_5004	0.006533648125524471
tfidf_1460	0.006529067007572692
tfidf_2115	0.006525250168615976
tfidf_1910	0.006519907877751806
tfidf_2650	0.006512067052888144
tfidf_973	0.0065099787679862966
tfidf_4705

tfidf_114	0.003578817801326792
tfidf_2242	0.003577116640433087
tfidf_4633	0.0035611699267246366
tfidf_5190	0.0035485535857673572
tfidf_2904	0.0035326420924264154
tfidf_4464	0.003530723554849266
tfidf_5077	0.0035143262778464315
tfidf_582	0.0035084075505192715
tfidf_1171	0.0035025907616109816
tfidf_5177	0.003496180103889688
tfidf_1932	0.0034936694214395656
tfidf_3016	0.0034652446023217516
tfidf_324	0.00344320481428679
tfidf_1794	0.003436702763495496
tfidf_2279	0.003422955876509989
tfidf_2676	0.0034205382342300957
tfidf_2451	0.003410687219231285
tfidf_212	0.0034094522312207495
tfidf_685	0.0034038034629675024
tfidf_5270	0.003403703920222277
tfidf_5024	0.0033717907119303722
tfidf_4772	0.0033491499527692143
tfidf_3615	0.0033481999215414504
tfidf_1785	0.0033481586990088876
tfidf_2818	0.003344909471174214
tfidf_1984	0.0033290996999302937
tfidf_510	0.0033259677748836087
tfidf_2002	0.0033217860402535208
tfidf_2383	0.0033212532214955347
tfidf_882	0.003318879100922139
tfidf_2154	0.0033135971322177

tfidf_4794	0.0016693192891040721
tfidf_2608	0.0016607260372437423
tfidf_1702	0.0016536017041647666
tfidf_1072	0.0016509994772200135
tfidf_5340	0.0016387012424148352
tfidf_1790	0.0016267984729164211
tfidf_1169	0.0016186722198399562
tfidf_1449	0.0016121188907619666
tfidf_318	0.0016068433940248823
tfidf_1033	0.0015795922617386554
tfidf_4918	0.0015783766447142447
tfidf_2306	0.0015757650684979656
tfidf_1956	0.0015735924686578022
tfidf_5098	0.0015603270369081367
tfidf_518	0.001513290379557909
tfidf_1377	0.0015057365017987485
tfidf_3058	0.0014804033377357122
tfidf_2981	0.0014689186963387267
tfidf_272	0.0014686216039527214
tfidf_2694	0.001468240629067356
tfidf_3743	0.0014562114931288266
tfidf_1094	0.0014528580094457617
tfidf_2649	0.001423606375352719
tfidf_1117	0.0014232754421578892
tfidf_3382	0.00142154144933736
tfidf_70	0.0014195861908972332
tfidf_1035	0.001419160012699726
tfidf_1860	0.0013984590063897506
tfidf_4015	0.0013907338859663338
tfidf_1893	0.0013907255709655687
tfidf_4334	0.00138271

In [46]:
model2.bestModel.asInstanceOf[PipelineModel].stages.foreach(stage => println(stage.extractParamMap))

{
	regexTok_077ab9d82f46-gaps: true,
	regexTok_077ab9d82f46-inputCol: text,
	regexTok_077ab9d82f46-minTokenLength: 1,
	regexTok_077ab9d82f46-outputCol: tokens,
	regexTok_077ab9d82f46-pattern: \W+,
	regexTok_077ab9d82f46-toLowercase: true
}
{
	stopWords_1db5213c5166-caseSensitive: false,
	stopWords_1db5213c5166-inputCol: tokens,
	stopWords_1db5213c5166-locale: fr_FR,
	stopWords_1db5213c5166-outputCol: text_filtered,
	stopWords_1db5213c5166-stopWords: [Ljava.lang.String;@2f49ac50
}
{
	cntVec_7fa573caaffe-binary: false,
	cntVec_7fa573caaffe-inputCol: text_filtered,
	cntVec_7fa573caaffe-maxDF: 9.223372036854776E18,
	cntVec_7fa573caaffe-minDF: 35.0,
	cntVec_7fa573caaffe-minTF: 1.0,
	cntVec_7fa573caaffe-outputCol: cv_features,
	cntVec_7fa573caaffe-vocabSize: 262144
}
{
	idf_6c2a96e6163e-inputCol: cv_features,
	idf_6c2a96e6163e-minDocFreq: 0,
	idf_6c2a96e6163e-outputCol: tfidf
}
{
	strIdx_8c3c18a145e5-handleInvalid: keep,
	strIdx_8c3c18a145e5-inputCol: country,
	strIdx_8c3c18a145e5-outputCol:

In [55]:
model2.bestModel.asInstanceOf[PipelineModel].stages.foreach(stage => println(stage.extractParamMap))

{
	regexTok_76339ff684db-gaps: true,
	regexTok_76339ff684db-inputCol: text,
	regexTok_76339ff684db-minTokenLength: 1,
	regexTok_76339ff684db-outputCol: tokens,
	regexTok_76339ff684db-pattern: \W+,
	regexTok_76339ff684db-toLowercase: true
}
{
	stopWords_d47f3a8d2ebc-caseSensitive: false,
	stopWords_d47f3a8d2ebc-inputCol: tokens,
	stopWords_d47f3a8d2ebc-locale: fr_FR,
	stopWords_d47f3a8d2ebc-outputCol: text_filtered,
	stopWords_d47f3a8d2ebc-stopWords: [Ljava.lang.String;@36e79098
}
{
	cntVec_ccb7eb330a34-binary: false,
	cntVec_ccb7eb330a34-inputCol: text_filtered,
	cntVec_ccb7eb330a34-maxDF: 9.223372036854776E18,
	cntVec_ccb7eb330a34-minDF: 45.0,
	cntVec_ccb7eb330a34-minTF: 1.0,
	cntVec_ccb7eb330a34-outputCol: cv_features,
	cntVec_ccb7eb330a34-vocabSize: 262144
}
{
	idf_d4a06695bf6f-inputCol: cv_features,
	idf_d4a06695bf6f-minDocFreq: 0,
	idf_d4a06695bf6f-outputCol: tfidf
}
{
	strIdx_818c069ac5fd-handleInvalid: keep,
	strIdx_818c069ac5fd-inputCol: country,
	strIdx_818c069ac5fd-outputCol: