In [85]:
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, Row}
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.classification.{LogisticRegression}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession, Row}
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
import org.apache.spark.ml.feature.{RegexTokenizer, StopWordsRemover, CountVectorizer, CountVectorizerModel, IDF, StringIndexer, OneHotEncoderEstimator, VectorAssembler}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator


## Configuration de SparkSession

In [2]:
// Spark configuration for this notebook: each tuning option is set explicitly,
// one key/value pair per call.
val conf = new SparkConf()
  .set("spark.scheduler.mode", "FIFO")
  .set("spark.speculation", "false")
  .set("spark.reducer.maxSizeInFlight", "48m")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer.max", "1g")
  .set("spark.shuffle.file.buffer", "32k")
  .set("spark.default.parallelism", "12")
  .set("spark.sql.shuffle.partitions", "12")

conf: org.apache.spark.SparkConf = org.apache.spark.SparkConf@28103c6b


In [3]:
 val spark = SparkSession.builder()
   .config(conf) // apply the SparkConf built in the previous cell
   .appName("TP Spark : Preprocessor")
   .getOrCreate()

// Brings the implicit conversions into scope (needed for the $"column" syntax).
import spark.implicits._

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1eaaf461
import spark.implicits._


## Chargement du DataFrame

In [4]:
// Load the prepared training set.
// The "header" and "inferSchema" options were dropped: they are CSV reader
// options, and Parquet files carry their own schema.
val df: DataFrame = spark
  .read
  .parquet("../data/prepared_trainingset/")

// Quick sanity check on the loaded data (count() triggers a Spark job).
println(s"Nombre de lignes: ${df.count()}")
println(s"Nombre de colonnes: ${df.columns.length}")

Nombre de lignes: 107614
Nombre de colonnes: 14


df: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 12 more fields]


In [5]:
// Preview the first five rows of the loaded DataFrame.
df.show(numRows = 5)

+--------------+--------------------+--------------------+------+--------------------+------------+--------+---------+-------------------+-------------------+-------------------+-------------+-----------+--------------------+
|    project_id|                name|                desc|  goal|            keywords|final_status|country2|currency2|          deadline2|        created_at2|       launched_at2|days_campaign|hours_prepa|                text|
+--------------+--------------------+--------------------+------+--------------------+------------+--------+---------+-------------------+-------------------+-------------------+-------------+-----------+--------------------+
| kkst471421639|american options ...|looking to create...|100000|american-options-...|           0|      US|      USD|2014-11-15 17:31:27|2014-10-10 21:23:58|2014-10-16 17:31:27|           30|    140.125|american options ...|
|kkst1098019088|iheadbones bone c...|wireless bluetoot...| 20000|iheadbones-bone-c...|          

## Utilisation des données textuelles

In [6]:
// Stage 1: extract the words from the "text" column into "tokens".
// With gaps = true the pattern describes the separators between tokens,
// here any run of non-word characters.
val tokenizer = new RegexTokenizer()
  .setInputCol("text")
  .setOutputCol("tokens")
  .setGaps(true)
  .setPattern("\\W+")

// To inspect this stage on its own: val words = tokenizer.transform(df)

tokenizer: org.apache.spark.ml.feature.RegexTokenizer = regexTok_a15d3c67c72f


In [7]:
// Stage 2: remove stop words from "tokens", writing the result to "text_filtered".
// No custom list is set; see StopWordsRemover.loadDefaultStopWords("english").
val stopWordsRemover = new StopWordsRemover()
  .setOutputCol("text_filtered")
  .setInputCol("tokens")

// To inspect this stage on its own: val words2 = stopWordsRemover.transform(words)

stopWordsRemover: org.apache.spark.ml.feature.StopWordsRemover = stopWords_c5d8d93b2fca


In [21]:
// Stage 3: compute the TF part (token counts) from "text_filtered" into "cv_features".
// NOTE: despite its name, `cvModel` is the unfitted CountVectorizer estimator;
// it is fitted as part of the pipeline.
val cvModel: CountVectorizer =
  new CountVectorizer()
    .setOutputCol("cv_features")
    .setInputCol("text_filtered")

// Optional settings, not applied here: .setVocabSize(3), .setMinDF(2)
// To inspect this stage on its own: val words3 = cvModel.fit(words2).transform(words2)

cvModel: org.apache.spark.ml.feature.CountVectorizer = cntVec_f916e3516fdf


In [22]:
// Stage 4: compute the IDF part, turning the counts in "cv_features" into "tfidf".
val idf = new IDF()
  .setOutputCol("tfidf")
  .setInputCol("cv_features")

// To inspect this stage on its own: val df2: DataFrame = idf.fit(words3).transform(words3)

idf: org.apache.spark.ml.feature.IDF = idf_5d5b944edbab


## Conversion des variables catégorielles en variables numériques

In [44]:
// Stage 5: convert the categorical "country2" column into numeric indices.
// handleInvalid = "keep" avoids failing on values that were not seen during fit.
val indexer_country = new StringIndexer()
  .setHandleInvalid("keep")
  .setOutputCol("country_indexed")
  .setInputCol("country2")

// To inspect this stage on its own: val df4: DataFrame = indexer.fit(df3).transform(df3)

indexer_country: org.apache.spark.ml.feature.StringIndexer = strIdx_55fb5f18d035


In [24]:
// Stage 6: convert the categorical "currency2" column into numeric indices.
// handleInvalid = "keep" matches the country indexer: without it, a currency
// that appears only in the data being transformed (e.g. the test split) makes
// transform() fail instead of mapping the value to an extra index.
val indexer_currency = new StringIndexer()
    .setInputCol("currency2")
    .setOutputCol("currency_indexed")
    .setHandleInvalid("keep")

// To inspect this stage on its own: val df5: DataFrame = indexer.fit(df4).transform(df4)

indexer_currency: org.apache.spark.ml.feature.StringIndexer = strIdx_0587c4ba9e40


In [25]:
// Stages 7 and 8: one-hot encode both indexed categorical columns with a
// single estimator (input and output columns are paired by position).
val encoder = new OneHotEncoderEstimator()
  .setOutputCols(Array("country_onehot", "currency_onehot"))
  .setInputCols(Array("country_indexed", "currency_indexed"))

// To inspect this stage on its own: val df7: DataFrame = encoder.fit(df6).transform(df6)

encoder: org.apache.spark.ml.feature.OneHotEncoderEstimator = oneHotEncoder_6ee3aed16785


## Mise des données sous une forme utilisable par Spark.ML

In [26]:
// Stage 9: gather every model input into the single "features" vector column.
// The order of the input columns determines the layout of the vector.
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(Array(
    "tfidf",
    "days_campaign",
    "hours_prepa",
    "goal",
    "country_onehot",
    "currency_onehot"
  ))

// To inspect this stage on its own: val df9 = assembler.transform(df8)

assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_4f750cb83f90


## Création du modèle de classification

In [27]:
// Stage 10: instantiate the classification model (logistic regression).
// It reads "features" and the label "final_status", and writes
// "predictions" and "raw_predictions".
val lr = new LogisticRegression()
  .setFeaturesCol("features")
  .setLabelCol("final_status")
  .setPredictionCol("predictions")
  .setRawPredictionCol("raw_predictions")
  .setFitIntercept(true)
  .setStandardization(true)
  .setElasticNetParam(0.0)
  .setThresholds(Array(0.7, 0.3)) // one threshold per class, in class-index order
  .setMaxIter(20)
  .setTol(1.0e-6)

lr: org.apache.spark.ml.classification.LogisticRegression = logreg_b27d1c2de6ca


## Création du pipeline

In [28]:
// Chain every stage, in execution order, into one pipeline.
val pipeline = new Pipeline().setStages(Array(
  tokenizer,        // text -> tokens
  stopWordsRemover, // tokens -> text_filtered
  cvModel,          // text_filtered -> cv_features
  idf,              // cv_features -> tfidf
  indexer_country,  // country2 -> country_indexed
  indexer_currency, // currency2 -> currency_indexed
  encoder,          // *_indexed -> *_onehot
  assembler,        // all inputs -> features
  lr                // features -> predictions
))

pipeline: org.apache.spark.ml.Pipeline = pipeline_a3b83b8d9334


## Entraînement, test, et sauvegarde du modèle

In [29]:
// Split the data 90% / 10% into training and test sets; the fixed seed
// keeps the split reproducible.
val Array(training, test) = df.randomSplit(weights = Array(0.9, 0.1), seed = 261L)

training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [project_id: string, name: string ... 12 more fields]


In [45]:
// Fit every pipeline stage on the training split.
val model: PipelineModel = pipeline.fit(training)

model: org.apache.spark.ml.PipelineModel = pipeline_a3b83b8d9334


In [34]:
// Persist the trained model, replacing any model previously saved at this path.
model
  .write
  .overwrite()
  .save("../data/model/spark-logistic-regression-model")

In [82]:
// Run the fitted pipeline on the held-out test split.
val dfWithSimplePredictions: DataFrame = model.transform(test)

dfWithSimplePredictions: org.apache.spark.sql.DataFrame = [project_id: string, name: string ... 24 more fields]


In [83]:
// Confusion table: number of test rows per (true label, predicted label) pair.
dfWithSimplePredictions
  .groupBy("final_status", "predictions")
  .count()
  .show()

+------------+-----------+-----+
|final_status|predictions|count|
+------------+-----------+-----+
|           1|        0.0| 1812|
|           0|        1.0| 2328|
|           1|        1.0| 1590|
|           0|        0.0| 5036|
+------------+-----------+-----+



In [86]:
// Score the test predictions against the true labels using the F1 metric.
val evaluator = new MulticlassClassificationEvaluator()
  .setMetricName("f1")
  .setPredictionCol("predictions")
  .setLabelCol("final_status")

val f1_score = evaluator.evaluate(dfWithSimplePredictions)

evaluator: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_d5737757e5e0
f1_score: Double = 0.6220287782615461
