In [1]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, Row}
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.attribute.AttributeGroup


Intitializing Scala interpreter ...

Spark Web UI available at http://xavier-linux.home:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1575725869743)
SparkSession available as 'spark'


import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, Row}
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.attribut...

## Configuration de SparkSession

In [2]:
val conf = new SparkConf().setAll(Map(
      "spark.scheduler.mode" -> "FIFO",
      "spark.speculation" -> "false",
      "spark.reducer.maxSizeInFlight" -> "48m",
      "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
      "spark.kryoserializer.buffer.max" -> "1g",
      "spark.shuffle.file.buffer" -> "32k",
      "spark.default.parallelism" -> "12",
      "spark.sql.shuffle.partitions" -> "12"
    ))

conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@16c8faa9


In [3]:
 val spark = SparkSession
      .builder
      .config(conf)
      .appName("TP Spark : Trainer")
      .getOrCreate()

import spark.implicits._  // to use the symbol $

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@edfb74d
import spark.implicits._


## Chargement du DataFrame

In [4]:
val df: DataFrame = spark
            .read
            .option("header", true)
            .option("inferSchema", "true")
            .parquet("../data/prepared_trainingset/")

println(s"Nombre de lignes: ${df.count}")
println(s"Nombre de colonnes: ${df.columns.length}")

Nombre de lignes: 107614
Nombre de colonnes: 14


df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]


In [5]:
df.show(5)

+--------------+--------------------+--------------------+------+--------------------+------------+--------+---------+-------------------+-------------------+-------------------+-------------+-----------+--------------------+
|    project_id|                name|                desc|  goal|            keywords|final_status|country2|currency2|          deadline2|        created_at2|       launched_at2|days_campaign|hours_prepa|                text|
+--------------+--------------------+--------------------+------+--------------------+------------+--------+---------+-------------------+-------------------+-------------------+-------------+-----------+--------------------+
| kkst471421639|american options ...|looking to create...|100000|american-options-...|           0|      US|      USD|2014-11-15 17:31:27|2014-10-10 21:23:58|2014-10-16 17:31:27|           30|    140.125|american options ...|
|kkst1098019088|iheadbones bone c...|wireless bluetoot...| 20000|iheadbones-bone-c...|          

## Retraitement des données textuelles

In [6]:
// Stage 1 : récupérer les mots des textes
val tokenizer = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")

//val words = tokenizer.transform(df)

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_690ae5d9ec5a


In [7]:
// Stage 2 : retirer les stop words (liste : StopWordsRemover.loadDefaultStopWords("english"))
val stopWordsRemover = new StopWordsRemover()
    .setInputCol("tokens")
    .setOutputCol("text_filtered")

//val words2 = stopWordsRemover.transform(words)

stopWordsRemover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_af20114f9ace


In [8]:
// Stage 3 : computer la partie TF
val cvModel: CountVectorizer = new CountVectorizer()//Model(Array("a", "b", "c"))
    .setInputCol("text_filtered")
    .setOutputCol("cv_features")
    //.setVocabSize(3)
    //.setMinDF()

//val words3 = cvModel.fit(words2).transform(words2)

cvModel: org.apache.spark.ml.feature.CountVectorizer = cntVec_65b383a652ed


In [9]:
// Stage 4 : computer la partie IDF
val idf = new IDF()
    .setInputCol("cv_features")
    .setOutputCol("tfidf")

//val df2: DataFrame = idf.fit(words3).transform(words3)

idf: org.apache.spark.ml.feature.IDF = idf_eee3f8f5dedd


## Conversion des variables catégorielles en variables numériques

In [10]:
//Stage 5 : convertir country2 en quantités numériques
val indexer_country = new StringIndexer()
    .setInputCol("country2")
    .setOutputCol("country_indexed")
    .setHandleInvalid("keep")

//val df4: DataFrame = indexer.fit(df3).transform(df3)

indexer_country: org.apache.spark.ml.feature.StringIndexer = strIdx_e2d8b112b448


In [11]:
//Stage 6 : convertir currency2 en quantités numériques
val indexer_currency = new StringIndexer()
    .setInputCol("currency2")
    .setOutputCol("currency_indexed")

//val df5: DataFrame = indexer.fit(df4).transform(df4)

indexer_currency: org.apache.spark.ml.feature.StringIndexer = strIdx_40c8d8e9a601


In [12]:
// Stages 7 et 8: One-Hot encoder ces deux catégories
val encoder = new OneHotEncoderEstimator()
    .setInputCols(Array("country_indexed", "currency_indexed"))
    .setOutputCols(Array("country_onehot", "currency_onehot"))

//val df7: DataFrame = encoder.fit(df6).transform(df6)

encoder: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_486b91220d16


## Mise en forme des données sous une forme utilisable par Spark.ML

In [13]:
val assembler = new VectorAssembler()
  .setInputCols(Array("tfidf", "days_campaign", "hours_prepa", "goal", "country_onehot", "currency_onehot"))
  .setOutputCol("features")

//val df9 = assembler.transform(df8)

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_d55f4b879072


## Création du modèle de classification

In [14]:
// Stage 10 : créer/instancier le modèle de classification
val lr = new LogisticRegression()
  .setElasticNetParam(0.0)
  .setFitIntercept(true)
  .setFeaturesCol("features")
  .setLabelCol("final_status")
  .setStandardization(true)
  .setPredictionCol("predictions")
  .setRawPredictionCol("raw_predictions")
  .setThresholds(Array(0.7, 0.3))
  .setTol(1.0e-6)
  .setMaxIter(20)

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_0db011cfa8d2


## Création du pipeline

In [15]:
val pipeline = new Pipeline()
    .setStages(Array(tokenizer, stopWordsRemover, 
                     cvModel, idf, indexer_country, indexer_currency,
                     encoder, assembler, lr))

pipeline: org.apache.spark.ml.Pipeline = pipeline_f39b1e10681d


## Entraînement, test, et sauvegarde du modèle

In [16]:
val Array(training, test) = df.randomSplit(Array(0.9, 0.1), seed=261)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]


In [17]:
val model = pipeline.fit(training)

model: org.apache.spark.ml.PipelineModel = pipeline_f39b1e10681d


In [18]:
// On enregistre le modèle entraîné
model.write.overwrite().save("../data/model/spark-logistic-regression-model")

In [19]:
val dfWithSimplePredictions = model.transform(test)

dfWithSimplePredictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [20]:
dfWithSimplePredictions.groupBy("final_status", "predictions").count.show()

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1812|
|           0|        1.0| 2328|
|           1|        1.0| 1590|
|           0|        0.0| 5036|
+------------+-----------+-----+



In [21]:
val evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("final_status")
    .setPredictionCol("predictions")
    .setMetricName("f1")
val f1_score = evaluator.evaluate(dfWithSimplePredictions)

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_d6c8b017231e
f1_score: Double = 0.6220287782615461


## Réglage des hyper-paramètres du modèle

Documentation : https://spark.apache.org/docs/2.2.0/ml-tuning.html

In [22]:
val paramGrid = new ParamGridBuilder()
    .addGrid(cvModel.minDF, Array(55.0, 75.0, 95.0))
    .addGrid(lr.elasticNetParam, Array(10e-8, 10e-6, 10e-4, 10e-2))
    .build()

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_0db011cfa8d2-elasticNetParam: 1.0E-7,
	cntVec_65b383a652ed-minDF: 55.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 1.0E-5,
	cntVec_65b383a652ed-minDF: 55.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 0.001,
	cntVec_65b383a652ed-minDF: 55.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 0.1,
	cntVec_65b383a652ed-minDF: 55.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 1.0E-7,
	cntVec_65b383a652ed-minDF: 75.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 1.0E-5,
	cntVec_65b383a652ed-minDF: 75.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 0.001,
	cntVec_65b383a652ed-minDF: 75.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 0.1,
	cntVec_65b383a652ed-minDF: 75.0
}, {
	logreg_0db011cfa8d2-elasticNetParam: 1.0E-7,
	cntVec_65b383a652ed-min...

In [23]:
val validation = new TrainValidationSplit()
    .setEstimator(pipeline)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.7)

validation: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_7082b2fae424


In [24]:
val model_improved = validation.fit(training)

model_improved: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_7082b2fae424


In [25]:
val dfWithPredictions = model_improved.transform(test)

dfWithPredictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [26]:
dfWithPredictions.groupBy("final_status", "predictions").count.show()

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1081|
|           0|        1.0| 2845|
|           1|        1.0| 2321|
|           0|        0.0| 4519|
+------------+-----------+-----+



In [27]:
val f1_score = evaluator.evaluate(dfWithPredictions)

f1_score: Double = 0.6480627333380333


In [28]:
// On enregistre le modèle entraîné
model_improved.write.overwrite().save("../data/model/spark-logistic-regression-model-improved")

## Test du modèle sur les données cleanées précédemment

In [29]:
val df_cleaned: DataFrame = spark
            .read
            .option("header", true)
            .option("inferSchema", "true")
            .parquet("../data/dataframe/")

println(s"Nombre de lignes: ${df_cleaned.count}")
println(s"Nombre de colonnes: ${df_cleaned.columns.length}")
val Array(training_cleaned, test_cleaned) = df_cleaned.randomSplit(Array(0.9, 0.1), seed=261)

Nombre de lignes: 108129
Nombre de colonnes: 8


df_cleaned: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 6 more fields]
training_cleaned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 6 more fields]
test_cleaned: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 6 more fields]


In [30]:
//Stage 5 : convertir country en quantités numériques
val indexer_country_cleaned = new StringIndexer()
    .setInputCol("country")
    .setOutputCol("country_indexed")
    .setHandleInvalid("keep")

//Stage 6 : convertir currency en quantités numériques
val indexer_currency_cleaned = new StringIndexer()
    .setInputCol("currency")
    .setOutputCol("currency_indexed")

val pipeline_cleaned = new Pipeline()
    .setStages(Array(tokenizer, stopWordsRemover, 
                     cvModel, idf, indexer_country_cleaned, indexer_currency_cleaned,
                     encoder, assembler, lr))

val validation_cleaned = new TrainValidationSplit()
    .setEstimator(pipeline_cleaned)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid)
    .setTrainRatio(0.7)

indexer_country_cleaned: org.apache.spark.ml.feature.StringIndexer = strIdx_09f232b5a504
indexer_currency_cleaned: org.apache.spark.ml.feature.StringIndexer = strIdx_44f94f52a2d0
pipeline_cleaned: org.apache.spark.ml.Pipeline = pipeline_9ec525757b95
validation_cleaned: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_4adbe75196e4


In [31]:
val model_improved_training_cleaned = validation_cleaned.fit(training_cleaned)
val dfCleanedWithPredictions = model_improved_training_cleaned.transform(test_cleaned)

model_improved_training_cleaned: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_4adbe75196e4
dfCleanedWithPredictions: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 18 more fields]


In [32]:
dfCleanedWithPredictions.groupBy("final_status", "predictions").count.show()
print(s"f1-score du modèle cleané: ${evaluator.evaluate(dfCleanedWithPredictions)}")

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1060|
|           0|        1.0| 2849|
|           1|        1.0| 2445|
|           0|        0.0| 4459|
+------------+-----------+-----+

f1-score du modèle cleané: 0.6500313713545759

## Amélioration du modèle sur les données cleanées précédemment

In [33]:
val df2: DataFrame = spark
            .read
            .option("header", true)
            .option("inferSchema", "true")
            .parquet("../data/dataframe/")

println(s"Nombre de lignes: ${df2.count}")
println(s"Nombre de colonnes: ${df2.columns.length}")
val Array(training2, test2) = df2.randomSplit(Array(0.9, 0.1), seed=261)

Nombre de lignes: 108129
Nombre de colonnes: 8


df2: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 6 more fields]
training2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 6 more fields]
test2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, goal: double ... 6 more fields]


In [34]:
// Stage 1 : récupérer les mots des textes
val tokenizer2 = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")

// Stage 2 : retirer les stop words (liste : StopWordsRemover.loadDefaultStopWords("english"))
val stopWordsRemover2 = new StopWordsRemover()
    .setInputCol("tokens")
    .setOutputCol("text_filtered")

// Stage 3 : computer la partie TF
val cvModel2: CountVectorizer = new CountVectorizer()//Model(Array("a", "b", "c"))
    .setInputCol("text_filtered")
    .setOutputCol("cv_features")
    //.setVocabSize(3)
    //.setMinDF()

// Stage 4 : computer la partie IDF
val idf2 = new IDF()
    .setInputCol("cv_features")
    .setOutputCol("tfidf")

//Stage 5 : convertir country2 en quantités numériques
val indexer_country2 = new StringIndexer()
    .setInputCol("country")
    .setOutputCol("country_indexed")
    .setHandleInvalid("skip")

//Stage 6 : convertir currency2 en quantités numériques
val indexer_currency2 = new StringIndexer()
    .setInputCol("currency")
    .setOutputCol("currency_indexed")
    .setHandleInvalid("skip")

// Stages 7 et 8: One-Hot encoder ces deux catégories
val encoder2 = new OneHotEncoderEstimator()
    .setInputCols(Array("country_indexed", "currency_indexed"))
    .setOutputCols(Array("country_onehot", "currency_onehot"))

val assembler2 = new VectorAssembler()
    .setInputCols(Array("tfidf", "days_campaign", "hours_prepa", "goal", "country_onehot", "currency_onehot"))
    .setOutputCol("features")
    .setHandleInvalid("skip")

// Stage 10 : créer/instancier le modèle de classification
val lr2 = new LogisticRegression()
    .setElasticNetParam(0.0)
    .setFitIntercept(true)
    .setFeaturesCol("features")
    .setLabelCol("final_status")
    .setStandardization(true)
    .setPredictionCol("predictions")
    .setRawPredictionCol("raw_predictions")
    .setThresholds(Array(0.7, 0.3))
    .setTol(1.0e-6)
    .setMaxIter(20)

val pipeline2 = new Pipeline()
    .setStages(Array(tokenizer2, stopWordsRemover2, 
                     cvModel2, idf2, indexer_country2, indexer_currency2,
                     encoder2, assembler2, lr2))

val paramGrid2 = new ParamGridBuilder()
    .addGrid(cvModel2.minDF, Array(5.0, 15.0, 25.0, 35.0, 45.0, 55.0, 75.0, 95.0))
    .addGrid(lr2.elasticNetParam, Array(10e-8, 10e-6, 10e-4, 10e-2))
    .build()

val evaluator = new MulticlassClassificationEvaluator()
    .setLabelCol("final_status")
    .setPredictionCol("predictions")
    .setMetricName("f1")

val validation2 = new TrainValidationSplit()
    .setEstimator(pipeline2)
    .setEvaluator(evaluator)
    .setEstimatorParamMaps(paramGrid2)
    .setTrainRatio(0.7)

tokenizer2: org.apache.spark.ml.feature.RegexTokenizer = regexTok_2ef9e98c3c4f
stopWordsRemover2: org.apache.spark.ml.feature.StopWordsRemover = stopWords_56283922eceb
cvModel2: org.apache.spark.ml.feature.CountVectorizer = cntVec_e3378908d98b
idf2: org.apache.spark.ml.feature.IDF = idf_86d8769eacd0
indexer_country2: org.apache.spark.ml.feature.StringIndexer = strIdx_342131a9f796
indexer_currency2: org.apache.spark.ml.feature.StringIndexer = strIdx_7bc074809e32
encoder2: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_85855bd0e4a3
assembler2: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_84a4a522e9d5
lr2: org.apache.spark.ml.classification.LogisticRegression = logreg_87b3b74c0d44
pipeline2: org.apache.spark.ml.Pipeline = pipeline_5837f0b9dfbf
paramGrid2: ...

In [35]:
val model2 = validation2.fit(training2)
val df2 = model2.transform(test2)

model2: org.apache.spark.ml.tuning.TrainValidationSplitModel = tvs_467c8e8e23b1
df2: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 18 more fields]


In [36]:
df2.count //108129

res7: Long = 10813


In [37]:
df2.groupBy("final_status", "predictions").count.show()
print(s"f1-score du modèle cleané: ${evaluator.evaluate(df2)}")
// 0.6576354660559056

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1084|
|           0|        1.0| 2739|
|           1|        1.0| 2421|
|           0|        0.0| 4569|
+------------+-----------+-----+

f1-score du modèle cleané: 0.6576354660559056

## Explication du modèle

In [38]:
val model_lr = model2.bestModel.asInstanceOf[PipelineModel].stages.last.asInstanceOf[LogisticRegressionModel]
// Extract the attributes of the input (features)
val schema = model2.transform(test2).schema
val featureAttrs = AttributeGroup.fromStructField(schema(model_lr.getFeaturesCol)).attributes.get
val features = featureAttrs.map(_.name.get)

// Add "(Intercept)" to list of feature names if the model was fit with an intercept
val featureNames: Array[String] = if (model_lr.getFitIntercept) {
  Array("(Intercept)") ++ features
} else {
  features
}

// Get array of coefficients
val lrModelCoeffs = model_lr.coefficients.toArray
val coeffs = if (model_lr.getFitIntercept) {
  lrModelCoeffs ++ Array(model_lr.intercept)
} else {
  lrModelCoeffs
}
val coeffs_abs = coeffs.map(num => Math.abs(num))

model_lr: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_87b3b74c0d44, numClasses = 2, numFeatures = 5396
schema: org.apache.spark.sql.types.StructType = StructType(StructField(project_id,StringType,true), StructField(goal,DoubleType,true), StructField(country,StringType,true), StructField(currency,StringType,true), StructField(final_status,IntegerType,true), StructField(days_campaign,IntegerType,true), StructField(hours_prepa,DoubleType,true), StructField(text,StringType,true), StructField(tokens,ArrayType(StringType,true),true), StructField(text_filtered,ArrayType(StringType,true),true), StructField(cv_features,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true), StructField(tfidf,org.apache.spark.ml.linalg.VectorUDT@3bfc3ba7,true), ...

In [39]:
// Print feature names & coefficients together
println("Feature\tCoefficient")
featureNames.zip(coeffs_abs).sortBy(_._2)(Ordering[Double].reverse).foreach { case (feature, coeff) =>
  println(s"$feature\t$coeff")
}

Feature	Coefficient
tfidf_30	2.6015223651907116
currency_onehot_DKK	0.7167146200501903
tfidf_1279	0.6663521680843651
country_onehot_NO	0.4385198261275509
tfidf_4393	0.4026714904656772
tfidf_2208	0.3948925988803582
country_onehot_AU	0.3424605722996145
tfidf_4237	0.2563367932124847
tfidf_901	0.2546160229773417
tfidf_826	0.2540326258168657
goal	0.25332063575518726
country_onehot_IE	0.25332063575518726
tfidf_2017	0.24827531293116498
currency_onehot_AUD	0.2452546065145863
tfidf_5183	0.23390849839758185
tfidf_4256	0.17494188159526566
tfidf_45	0.17371662580403674
tfidf_5048	0.17132367187091813
tfidf_3167	0.16975778774478253
tfidf_4886	0.16881644467270265
tfidf_5266	0.16264638855972888
tfidf_2711	0.16164761852451298
tfidf_5167	0.1603886624122995
tfidf_2470	0.16008892925395893
tfidf_4144	0.1584967238066007
country_onehot_DK	0.1576062412803305
tfidf_3852	0.1569889298033503
tfidf_4709	0.15403342424133432
tfidf_4627	0.15340001221366498
tfidf_3535	0.15152037316964093
tfidf_3197	0.15127431812885547


tfidf_5331	0.06999143061402835
tfidf_308	0.06995346002338859
tfidf_4838	0.0698216527912539
tfidf_4600	0.06977071887667434
tfidf_1309	0.06973158201188535
tfidf_4082	0.06970952122757063
tfidf_5221	0.06969920516410538
tfidf_4509	0.0696648076027564
tfidf_1628	0.0696349476316658
tfidf_528	0.06954064209687734
tfidf_5271	0.06953024837946813
tfidf_4154	0.06939598012087522
tfidf_5281	0.0693700268396578
tfidf_3057	0.06934768924287653
tfidf_4974	0.06931488073530324
tfidf_4052	0.06931358299424328
tfidf_4321	0.06900687966119436
tfidf_4675	0.0689794000834825
tfidf_4551	0.06892783879049227
tfidf_4585	0.06887678807455709
tfidf_763	0.06887558430267961
tfidf_4785	0.0687523031664222
tfidf_2548	0.0687411069564043
tfidf_4949	0.06872132318456962
tfidf_4383	0.0687063151283717
tfidf_605	0.06865432727996781
tfidf_3817	0.06838238771990769
tfidf_165	0.06838009671284206
tfidf_3353	0.06824319223661779
tfidf_4902	0.06816503092785613
tfidf_4480	0.06812641101488524
tfidf_3399	0.06810010875253081
tfidf_5028	0.06799731

tfidf_294	0.05180043950671227
tfidf_5291	0.05179754114146228
tfidf_3846	0.051750897981879894
tfidf_5026	0.051747655017810965
tfidf_4367	0.05170851531013152
tfidf_2722	0.05170531949918007
tfidf_2049	0.051588078715315844
tfidf_1135	0.05157236500632644
tfidf_3262	0.051567504018983454
tfidf_5027	0.051527519950943484
tfidf_5251	0.051523276947237455
tfidf_4166	0.051515638878096674
tfidf_3552	0.05147845641671389
tfidf_5245	0.05146624454722662
tfidf_2745	0.05141734488665289
tfidf_2480	0.05141017514955643
tfidf_4973	0.051409641040627986
tfidf_2549	0.051407232928511466
tfidf_4889	0.05137838951357696
tfidf_4415	0.051300759888301566
tfidf_4127	0.051296849032245996
tfidf_2133	0.05128326213482769
tfidf_5137	0.051205548397319625
tfidf_3897	0.051133722620871354
tfidf_1939	0.05107510719373661
tfidf_4839	0.05104705724602453
tfidf_563	0.051041383898056236
tfidf_665	0.05102465939727639
tfidf_2421	0.05101492047085656
tfidf_2651	0.05096015100101862
tfidf_3372	0.05094374773610635
tfidf_2134	0.050928569350681

tfidf_4016	0.04444681515097443
tfidf_2473	0.044415783028569945
tfidf_3879	0.04439936317849617
tfidf_1760	0.04439393711727445
tfidf_4220	0.044393248604992636
tfidf_2975	0.04437980961136925
tfidf_2337	0.04434492226930388
tfidf_4724	0.04433185008037613
tfidf_574	0.04432828060470123
tfidf_2452	0.04431823559954205
tfidf_2065	0.04426174767116394
tfidf_4862	0.0442598921110172
tfidf_5334	0.0442518404456122
tfidf_3284	0.044229909164842235
tfidf_4994	0.04421654529647031
tfidf_4632	0.044195029273598516
tfidf_1685	0.04417322765785045
country_onehot_GB	0.04416235349366167
currency_onehot_GBP	0.04416235349366167
tfidf_4078	0.04415794575751515
tfidf_5212	0.044127846948182635
tfidf_2087	0.044123805381474285
tfidf_29	0.044072503189996316
tfidf_3959	0.04406282843999265
tfidf_3230	0.044039055561384266
tfidf_3255	0.04403508450087825
tfidf_4769	0.044010512442148104
tfidf_2218	0.04399260069639102
tfidf_5122	0.04396287460804258
tfidf_4668	0.04395958910094083
tfidf_3330	0.04389851608088371
tfidf_497	0.0438934

tfidf_3154	0.03958111418421376
tfidf_2501	0.03956478454718195
tfidf_1726	0.039547581495525695
tfidf_4676	0.03953815140743578
tfidf_5241	0.03953041445651761
tfidf_930	0.039523558390241534
tfidf_1368	0.0394659500881178
tfidf_2506	0.03945750312866318
tfidf_3450	0.03945703237269313
tfidf_5207	0.03945196778896346
tfidf_1277	0.03943728315688108
tfidf_4703	0.03942489753978781
tfidf_1907	0.03941792242428949
tfidf_3173	0.039402354739298956
tfidf_4556	0.03938925450726144
tfidf_5143	0.03937620582288641
tfidf_4731	0.03936228018598391
tfidf_1122	0.03935838288611189
tfidf_1944	0.03935772636888258
tfidf_1011	0.0393478641502441
tfidf_219	0.039328292018519465
tfidf_3626	0.03929918596407826
tfidf_4858	0.039245342251240906
tfidf_3125	0.03921887882403475
tfidf_4310	0.0392043380902184
tfidf_2029	0.03917413169026344
tfidf_4826	0.03916124901185552
tfidf_1901	0.0391310563960227
tfidf_4697	0.03913018390028073
tfidf_4723	0.03911998072399192
tfidf_1891	0.03910941606325068
tfidf_3370	0.03909251044617993
tfidf_224

tfidf_1930	0.035279184089335065
tfidf_5238	0.03525660922846329
tfidf_2197	0.03524388427072163
tfidf_2352	0.035239985121775154
tfidf_2086	0.03523668343313032
tfidf_1366	0.03523068605464433
tfidf_4011	0.03521644403539415
tfidf_5217	0.035187595852230545
tfidf_2642	0.03518366986125793
tfidf_2383	0.03516125243738404
tfidf_2166	0.03515507971818779
tfidf_2392	0.03515344556247586
tfidf_1128	0.03509470587660529
tfidf_488	0.0350785574488579
tfidf_1689	0.035034155877392364
tfidf_3568	0.0350277131215192
tfidf_1058	0.03502645801359663
tfidf_4485	0.03502483156704229
tfidf_2710	0.03501763375697722
tfidf_4359	0.03493438414349197
tfidf_3213	0.03493128613500101
tfidf_4875	0.03492056449349736
tfidf_132	0.03491055919312143
tfidf_1230	0.03489398686582273
tfidf_1227	0.034874557444838375
tfidf_4625	0.03486550487584149
tfidf_2034	0.03482487995191491
tfidf_3432	0.034824647435367076
tfidf_1889	0.03481880003817745
tfidf_5154	0.03481873475932199
tfidf_4099	0.03479271083255744
tfidf_3526	0.0347886314640595
tfidf_4

tfidf_4488	0.031572358016675885
tfidf_1103	0.03154553955975356
tfidf_3630	0.0315413708207199
tfidf_3849	0.03150472904222716
tfidf_1376	0.03150105157045629
tfidf_4176	0.031491322044587586
tfidf_5087	0.031482190941860166
tfidf_4299	0.03144053514493834
tfidf_2268	0.03141431790132063
tfidf_1264	0.03141246439999781
tfidf_3273	0.03139897441004787
tfidf_4966	0.03138804991093601
tfidf_1543	0.03136302708856752
tfidf_1492	0.031362502472565605
tfidf_3176	0.03135179327111773
tfidf_642	0.03128809115129088
tfidf_1563	0.03128541654472765
tfidf_474	0.03127079075413548
tfidf_4056	0.031267289708458455
tfidf_817	0.03123390727466806
tfidf_1604	0.031231330052353932
tfidf_1097	0.031230064980638104
tfidf_1037	0.031228119030601645
tfidf_1379	0.03118975635514353
tfidf_4088	0.031184991658749477
tfidf_3303	0.03117193445388203
tfidf_508	0.03116235656771466
tfidf_1068	0.03114795390672131
tfidf_668	0.031141345962182064
tfidf_1606	0.031138832442575155
tfidf_915	0.031121544224539877
tfidf_2773	0.031113787033862004
tf

tfidf_531	0.028125249130584177
tfidf_3168	0.028123546469233058
tfidf_2630	0.028100809597935213
tfidf_4178	0.0280922336007365
tfidf_4775	0.02805447966109399
tfidf_3481	0.028043826039811048
tfidf_3248	0.02804135257905063
tfidf_1962	0.028041243952494625
tfidf_4578	0.02803827192504783
tfidf_3208	0.02802810848586256
tfidf_5203	0.02799775243807932
tfidf_983	0.027981776992066276
tfidf_2457	0.027980174597105376
tfidf_5244	0.02797731703815166
tfidf_1584	0.02797322749784231
tfidf_991	0.02796982363774082
tfidf_1201	0.02795901173143621
tfidf_2797	0.027954264987394006
tfidf_4719	0.02795054644338369
tfidf_3558	0.027947991242775957
tfidf_2660	0.027942843513397323
tfidf_2500	0.027896144353061227
tfidf_2916	0.02788821648083629
tfidf_178	0.02787818432567514
tfidf_3621	0.027869406063706692
tfidf_4073	0.027868169337756248
tfidf_1424	0.02786771975302477
tfidf_4748	0.02785593401434093
tfidf_461	0.027854947785668847
tfidf_3080	0.027833131110340307
tfidf_732	0.027830185656799705
tfidf_2515	0.02782781821179862

tfidf_1942	0.02498563241953388
tfidf_1579	0.024952092660177977
tfidf_4363	0.024932509986106147
tfidf_1052	0.02493137917141737
tfidf_2015	0.024911373444238254
tfidf_4055	0.024904799500379073
tfidf_2729	0.02488498561506565
tfidf_2562	0.024877534245649032
tfidf_3008	0.024874850886562338
tfidf_5072	0.024869373846544376
tfidf_3205	0.02485781481176372
tfidf_4272	0.02482605495877506
tfidf_3144	0.024825729731028257
tfidf_5193	0.0248043870963833
tfidf_5295	0.02479729302544094
tfidf_1498	0.024781453922890915
tfidf_2270	0.02475005661261086
tfidf_5067	0.024745471293830715
tfidf_1728	0.024725228499474818
tfidf_765	0.02472281879122508
tfidf_2599	0.02469894404304778
tfidf_2214	0.02468708055051582
tfidf_4076	0.024686287589355885
tfidf_2886	0.024682730458353508
tfidf_987	0.02467274683228762
tfidf_2331	0.024666859015854407
tfidf_137	0.024664210540687497
tfidf_4147	0.024649854043429394
tfidf_1664	0.02464801074952842
tfidf_5197	0.02463890971355487
tfidf_2566	0.02462805863045789
tfidf_541	0.024577674920508

tfidf_1670	0.022126920899209163
tfidf_1780	0.022114962853129087
tfidf_3214	0.022114283228604783
tfidf_1298	0.022097435348119532
tfidf_5076	0.02207462082008223
tfidf_5152	0.022073147314912753
tfidf_1594	0.02206922851644365
tfidf_388	0.022057500199533488
tfidf_1163	0.02205427921424938
tfidf_1369	0.022040183517790257
tfidf_5313	0.022034343959948813
tfidf_2170	0.02200288089898069
tfidf_3206	0.022001922674241257
tfidf_1625	0.02199936949614274
tfidf_3713	0.021993861070676374
tfidf_142	0.021993730588196883
tfidf_2612	0.021990362106193435
tfidf_2827	0.02197426682717366
tfidf_5094	0.021965406979613165
tfidf_4596	0.021958647967183064
tfidf_5140	0.021956287346336367
tfidf_3191	0.021941092879538236
tfidf_1542	0.02193947639566266
tfidf_4678	0.02191707631991371
tfidf_1688	0.021910444179454293
tfidf_3757	0.021894241390474287
tfidf_774	0.021877310798413083
tfidf_3521	0.02187358971349419
tfidf_1168	0.0218670625174207
tfidf_4370	0.021865170781709804
tfidf_4121	0.021860668312095915
tfidf_842	0.0218462510

tfidf_465	0.019727650826635017
tfidf_3070	0.019710915607331306
tfidf_40	0.01969844820976574
tfidf_2667	0.01968309566941548
tfidf_1272	0.019673038007289236
tfidf_3589	0.019664953315607456
tfidf_4692	0.019642250049604643
tfidf_2084	0.019638442232782678
tfidf_1744	0.019632065050110337
tfidf_393	0.019621619074282327
tfidf_1556	0.019614810770931725
tfidf_2511	0.019600871958189003
tfidf_2568	0.01959047910974524
tfidf_2178	0.01956143963196384
tfidf_4077	0.019559170586186732
tfidf_4707	0.019535164397411157
tfidf_4141	0.019517175739309126
tfidf_3882	0.01949745435588033
tfidf_4379	0.01947816105493827
tfidf_3006	0.01946202092735325
tfidf_319	0.019444500304612904
tfidf_4845	0.01944322949563778
tfidf_641	0.019434809977919763
tfidf_5124	0.019433529869583226
tfidf_4210	0.01943282304230509
tfidf_3217	0.019414571238427138
tfidf_4976	0.019368886450571183
tfidf_951	0.019350038725726014
tfidf_4239	0.01934140537048898
tfidf_5252	0.019339729334554086
tfidf_1821	0.019310108385653505
tfidf_5190	0.019309356365

tfidf_3148	0.017201299663285867
tfidf_5088	0.017193029602479332
tfidf_666	0.01719172107355179
tfidf_627	0.017174969131834937
tfidf_4069	0.017168804317168284
tfidf_5126	0.017165833978767496
tfidf_1534	0.017160530451950512
tfidf_4510	0.01715306321213656
tfidf_1856	0.01714392065885059
tfidf_4939	0.01714115206343723
tfidf_2619	0.017096947483735924
tfidf_3172	0.01708651094573441
tfidf_478	0.017077842986691922
tfidf_3945	0.017074355268304277
tfidf_116	0.017069722792756238
tfidf_2505	0.017062564608317805
tfidf_4813	0.017043004116457178
tfidf_3441	0.017030947516873184
tfidf_3594	0.017006320209564517
tfidf_2482	0.01700241062142898
tfidf_738	0.01699576179224261
tfidf_819	0.016995606491657038
tfidf_967	0.016974867466116067
tfidf_1925	0.016946356384709343
tfidf_214	0.016944106934138014
tfidf_4484	0.016930618628148347
tfidf_67	0.016930354248720265
tfidf_799	0.01692937510281504
tfidf_1480	0.016927079653301407
tfidf_697	0.01692646936400394
tfidf_2743	0.01689367677720316
tfidf_1234	0.01687856339815641

tfidf_3325	0.01511144457571452
tfidf_3566	0.015110263898489225
tfidf_3606	0.015105912750716774
tfidf_5273	0.015105876023783394
tfidf_1739	0.01509814473723667
tfidf_3638	0.01509719884413709
tfidf_2289	0.01509359552212995
tfidf_3130	0.01508411767103838
tfidf_1088	0.015083192745075072
tfidf_695	0.015068858077636967
tfidf_1938	0.015059432411585937
tfidf_1523	0.015059389543747985
tfidf_468	0.015027135663381277
tfidf_4388	0.015022935238611652
tfidf_2481	0.015016032403333053
tfidf_3157	0.01501229980671929
tfidf_5274	0.015003329838294712
tfidf_1282	0.014967551484516521
tfidf_3482	0.014963249755175061
tfidf_4116	0.014950566575018911
tfidf_2675	0.014947597650219184
tfidf_1460	0.014915565862297912
tfidf_916	0.014900384185240709
tfidf_2538	0.014895443133544156
tfidf_1687	0.014888082344468615
tfidf_1718	0.014879415853142935
tfidf_2742	0.014869138305659196
tfidf_501	0.014865884981517514
tfidf_5102	0.014865628341844347
tfidf_3243	0.014862220453962141
tfidf_4505	0.014848767716424195
tfidf_2605	0.01484

tfidf_4114	0.012936641788554658
tfidf_4005	0.012932088587560437
tfidf_1364	0.012921762644705124
tfidf_2347	0.012920718567589614
tfidf_147	0.012909214022537521
tfidf_1414	0.012907912876755426
tfidf_174	0.01290459972140289
tfidf_1879	0.012900295592519465
tfidf_631	0.012888227425410818
tfidf_3821	0.01288718127069307
tfidf_2267	0.012884002571744544
tfidf_2057	0.012868871486786053
tfidf_2238	0.012848464024142556
tfidf_2300	0.012846730141021407
tfidf_1455	0.012830905949186434
tfidf_3138	0.012827503063608938
tfidf_53	0.01282204449051866
tfidf_5213	0.012815529923653564
tfidf_5101	0.012814225286287327
tfidf_3207	0.012810112000447673
tfidf_2417	0.01280705035591043
tfidf_3447	0.012804851475468634
tfidf_1483	0.012790976111148203
tfidf_1823	0.012782786318358297
tfidf_121	0.012780099019531574
tfidf_4867	0.01277967369321896
tfidf_4371	0.012770458841130945
tfidf_1006	0.012768094520813211
tfidf_1101	0.012765737389779866
tfidf_4366	0.012761336178782718
tfidf_1577	0.012751119698663172
tfidf_1616	0.012748

tfidf_838	0.01103176659428529
tfidf_79	0.011026578207997618
tfidf_2893	0.011022498169473949
tfidf_1138	0.01100708579780134
tfidf_327	0.010998731507800135
tfidf_794	0.010986872593971134
tfidf_2954	0.010986319196475226
tfidf_5024	0.01098230637334139
tfidf_1474	0.010975469089144586
tfidf_924	0.010970943689928205
tfidf_1320	0.01095140798462016
tfidf_4260	0.010942383045111007
tfidf_74	0.010939598999863366
tfidf_773	0.010915504558484623
tfidf_115	0.010912867691848363
tfidf_422	0.010909180850731974
tfidf_3335	0.010882666209666287
tfidf_922	0.010870781023903323
tfidf_3454	0.010858675769280648
tfidf_2766	0.010850610895145475
tfidf_2542	0.010850562189314744
tfidf_2299	0.010843204694969215
tfidf_2724	0.01083741702219721
tfidf_1161	0.010818339810876893
tfidf_4744	0.010803998580027042
tfidf_1815	0.010798831291587629
tfidf_1578	0.0107635531935029
tfidf_1783	0.010762795683691137
tfidf_714	0.010757698654174733
tfidf_350	0.010757581773342705
tfidf_1425	0.010748276466026468
tfidf_5292	0.0107477463655682

tfidf_3677	0.009016966560608517
tfidf_2422	0.009009488000819004
tfidf_3272	0.00900642179977987
tfidf_755	0.009001151698196123
tfidf_2467	0.008997856216346568
tfidf_973	0.008997696251813684
tfidf_1040	0.008987331438754085
tfidf_1077	0.008981569570655745
tfidf_1469	0.008980351662077985
tfidf_5071	0.0089792715242567
tfidf_510	0.008968874326630785
tfidf_984	0.00896647979872416
tfidf_2749	0.008954822856944633
tfidf_4997	0.008944031314887586
tfidf_4758	0.008943702143571318
tfidf_602	0.008941597197680763
tfidf_3346	0.008940620090111612
tfidf_4822	0.008936107106015133
tfidf_3516	0.008924789081166661
tfidf_4048	0.008922234808526287
tfidf_4248	0.008917693608037935
tfidf_2272	0.008915894376083906
tfidf_244	0.00891253677330947
tfidf_3412	0.008912005605199762
tfidf_410	0.008911012632912975
tfidf_829	0.008910989814897338
tfidf_2097	0.008885959143218264
tfidf_487	0.008880041388441166
tfidf_2601	0.00886228361703976
tfidf_1361	0.008861273113839838
tfidf_2142	0.0088460725065759
tfidf_3994	0.008843346546

tfidf_1597	0.007098331328481346
tfidf_1723	0.007088003105935212
tfidf_677	0.0070863507381962705
tfidf_1703	0.007079912683626899
tfidf_649	0.007077956106481644
tfidf_2038	0.007073537138859763
tfidf_2635	0.007073015499824182
tfidf_3534	0.007058411033932924
tfidf_3298	0.007055819796668039
tfidf_2448	0.007047999238777174
tfidf_2169	0.007040882611679937
tfidf_290	0.007037622484238849
tfidf_985	0.007030825858877957
tfidf_4216	0.00702495282507988
tfidf_2820	0.007022942922850022
tfidf_3105	0.0070226728460171025
tfidf_3960	0.007022609456528633
tfidf_4311	0.007010584760662258
tfidf_1696	0.007010261666690745
tfidf_748	0.007008265168383961
tfidf_2460	0.007004030060556024
tfidf_2113	0.007003792814772867
tfidf_958	0.006993871894239885
tfidf_986	0.006988316141931703
tfidf_2858	0.006977119594399024
tfidf_1743	0.006976730989396067
tfidf_11	0.006958818693680534
tfidf_818	0.006938172369027302
tfidf_2589	0.006929254318349574
tfidf_990	0.006920418764306944
tfidf_2435	0.006903582193720638
tfidf_3502	0.00689

tfidf_769	0.005243583582655573
tfidf_4508	0.00523636509595493
tfidf_2520	0.005228355502910697
tfidf_3049	0.005224111141459215
tfidf_3381	0.005203876504677654
tfidf_2774	0.005197230070675961
tfidf_3453	0.005195877064176045
tfidf_434	0.0051896239677034495
tfidf_1633	0.0051816963120263
tfidf_1977	0.005178806465596482
tfidf_3498	0.00516396800571961
tfidf_2462	0.005159878045583735
tfidf_2164	0.00514825417685677
tfidf_600	0.005139305954100096
tfidf_4680	0.005135256050426923
tfidf_2771	0.005128421232286341
tfidf_2685	0.005124477973135042
tfidf_5297	0.0051203079317534836
tfidf_3114	0.0051152882811421625
tfidf_482	0.0051088035427552
tfidf_464	0.005099677532581468
tfidf_1317	0.005099149013107923
tfidf_888	0.0050917073338874515
tfidf_4402	0.005082252585817164
tfidf_731	0.0050759449643538665
tfidf_1707	0.005065978437685598
tfidf_3363	0.005064499713814405
tfidf_315	0.005058503307841797
tfidf_1057	0.005047032623340735
tfidf_2509	0.005038260506600265
tfidf_3751	0.005036674333991134
tfidf_2880	0.00503

tfidf_2313	0.0032415503432141565
tfidf_3108	0.0032307809734051954
tfidf_1374	0.003229982773517685
tfidf_2543	0.003229917712162568
tfidf_1397	0.0032230855275372965
tfidf_2081	0.003210510492520411
tfidf_3356	0.0032080303329133143
tfidf_2748	0.0032067667173187237
tfidf_3830	0.0032038667511699887
tfidf_736	0.003202570133790672
tfidf_5188	0.003198711179097554
tfidf_3347	0.003196501421234503
tfidf_2461	0.0031964703151761784
tfidf_4737	0.0031937060057115516
tfidf_4938	0.003188717173700688
tfidf_1002	0.0031884587258821932
tfidf_2703	0.0031857137679084514
tfidf_2406	0.0031823580416406568
tfidf_4295	0.0031781212988666483
tfidf_66	0.0031756614367193354
tfidf_4417	0.003168131206078339
tfidf_621	0.0031638833971426744
tfidf_2840	0.0031542946536000003
tfidf_4705	0.0031323632813015988
tfidf_5007	0.003127181478656462
tfidf_4188	0.003094634874780511
tfidf_1773	0.003090390287423232
tfidf_4177	0.003080357977834351
tfidf_2782	0.0030803092766147266
tfidf_2068	0.003078127322026607
tfidf_235	0.003076113958260

tfidf_2172	0.0013423458948686328
tfidf_3853	0.001341678910548803
tfidf_4846	0.001338614916232203
tfidf_402	0.0013230893743330376
tfidf_148	0.0013195276236458136
tfidf_85	0.0013180635642918528
tfidf_1916	0.0013132934995890942
tfidf_2911	0.0013011633678989653
tfidf_5014	0.0012879856012322373
tfidf_3149	0.0012879448477626178
tfidf_320	0.0012874873625122213
tfidf_4917	0.0012864432428748773
tfidf_663	0.001273842053735105
tfidf_4148	0.0012695367094374497
tfidf_3538	0.0012596105971346163
tfidf_1605	0.0012568844692855666
tfidf_2594	0.001256521182447935
tfidf_4134	0.0012559651952380446
tfidf_2344	0.0012529740515852192
tfidf_2141	0.0012498678056755814
tfidf_716	0.0012464488809416226
tfidf_821	0.0012299598863167594
tfidf_5228	0.0012145839880225977
tfidf_5161	0.0012069003884886947
tfidf_1862	0.0012003638103606662
tfidf_1213	0.0011964852616601926
tfidf_3196	0.0011855720317927353
tfidf_933	0.0011841849231990884
tfidf_1359	0.0011749381349091866
tfidf_4763	0.0011514886784230426
tfidf_2483	0.0011429770

In [40]:
model2.bestModel.asInstanceOf[PipelineModel].stages.foreach(stage => println(stage.extractParamMap))

{
	regexTok_2ef9e98c3c4f-gaps: true,
	regexTok_2ef9e98c3c4f-inputCol: text,
	regexTok_2ef9e98c3c4f-minTokenLength: 1,
	regexTok_2ef9e98c3c4f-outputCol: tokens,
	regexTok_2ef9e98c3c4f-pattern: \W+,
	regexTok_2ef9e98c3c4f-toLowercase: true
}
{
	stopWords_56283922eceb-caseSensitive: false,
	stopWords_56283922eceb-inputCol: tokens,
	stopWords_56283922eceb-locale: fr_FR,
	stopWords_56283922eceb-outputCol: text_filtered,
	stopWords_56283922eceb-stopWords: [Ljava.lang.String;@4ec47244
}
{
	cntVec_e3378908d98b-binary: false,
	cntVec_e3378908d98b-inputCol: text_filtered,
	cntVec_e3378908d98b-maxDF: 9.223372036854776E18,
	cntVec_e3378908d98b-minDF: 35.0,
	cntVec_e3378908d98b-minTF: 1.0,
	cntVec_e3378908d98b-outputCol: cv_features,
	cntVec_e3378908d98b-vocabSize: 262144
}
{
	idf_86d8769eacd0-inputCol: cv_features,
	idf_86d8769eacd0-minDocFreq: 0,
	idf_86d8769eacd0-outputCol: tfidf
}
{
	strIdx_342131a9f796-handleInvalid: skip,
	strIdx_342131a9f796-inputCol: country,
	strIdx_342131a9f796-outputCol:

## Test random forest

In [41]:
// Stage 1 : récupérer les mots des textes
val tokenizer2 = new RegexTokenizer()
  .setPattern("\\W+")
  .setGaps(true)
  .setInputCol("text")
  .setOutputCol("tokens")

// Stage 2 : retirer les stop words (liste : StopWordsRemover.loadDefaultStopWords("english"))
val stopWordsRemover2 = new StopWordsRemover()
    .setInputCol("tokens")
    .setOutputCol("text_filtered")

// Stage 3 : computer la partie TF
val cvModel2: CountVectorizer = new CountVectorizer()//Model(Array("a", "b", "c"))
    .setInputCol("text_filtered")
    .setOutputCol("cv_features")
    //.setVocabSize(3)
    //.setMinDF()

// Stage 4 : computer la partie IDF
val idf2 = new IDF()
    .setInputCol("cv_features")
    .setOutputCol("tfidf")

//Stage 5 : convertir country2 en quantités numériques
val indexer_country2 = new StringIndexer()
    .setInputCol("country")
    .setOutputCol("country_indexed")
    .setHandleInvalid("skip")

//Stage 6 : convertir currency2 en quantités numériques
val indexer_currency2 = new StringIndexer()
    .setInputCol("currency")
    .setOutputCol("currency_indexed")
    .setHandleInvalid("skip")

// Stages 7 et 8: One-Hot encoder ces deux catégories
val encoder2 = new OneHotEncoderEstimator()
    .setInputCols(Array("country_indexed", "currency_indexed"))
    .setOutputCols(Array("country_onehot", "currency_onehot"))

val assembler2 = new VectorAssembler()
    .setInputCols(Array("tfidf", "days_campaign", "hours_prepa", "goal", "country_onehot", "currency_onehot"))
    .setOutputCol("features")
    .setHandleInvalid("skip")

val rf = new RandomForestClassifier()
    .setLabelCol("final_status")
    .setFeaturesCol("features")
    .setPredictionCol("predictions")
    .setRawPredictionCol("raw_predictions")
    .setNumTrees(10)

val pipeline3 = new Pipeline()
    .setStages(Array(tokenizer2, stopWordsRemover2, 
                     cvModel2, idf2, indexer_country2, indexer_currency2,
                     encoder2, assembler2, rf))

tokenizer2: org.apache.spark.ml.feature.RegexTokenizer = regexTok_bdfbe808a4d0
stopWordsRemover2: org.apache.spark.ml.feature.StopWordsRemover = stopWords_9bedac4b13c2
cvModel2: org.apache.spark.ml.feature.CountVectorizer = cntVec_e0b7e8f0efb9
idf2: org.apache.spark.ml.feature.IDF = idf_7d668197c36b
indexer_country2: org.apache.spark.ml.feature.StringIndexer = strIdx_68ae37ddeff5
indexer_currency2: org.apache.spark.ml.feature.StringIndexer = strIdx_10ae8415193f
encoder2: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_156f2a4258f2
assembler2: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_67a3cea6ee2a
rf: org.apache.spark.ml.classification.RandomForestClassifier = rfc_75131793c96c
pipeline3: org.apache.spark.ml.Pipeline = pipeline_de10556b862c


In [42]:
val model3 = pipeline3.fit(training2)
val df3 = model3.transform(test2)

model3: org.apache.spark.ml.PipelineModel = pipeline_de10556b862c
df3: org.apache.spark.sql.DataFrame = [project_id: string, goal: double ... 18 more fields]


In [43]:
df3.groupBy("final_status", "predictions").count.show()
print(s"f1-score du modèle cleané: ${evaluator.evaluate(df3)}")

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 3505|
|           0|        0.0| 7308|
+------------+-----------+-----+

f1-score du modèle cleané: 0.5451282760569609