
# Model training of the churn data
## Import the required libraries

In [1]:
//import libraries
import org.apache.spark.ml.feature.{StringIndexer,VectorAssembler}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.tuning.{CrossValidator,ParamGridBuilder}


# Load the data from Object Storage into dataframe and split for training, testing and evaluating

In [1]:

// This function is used to setup the access of Spark to your Object Storage. The definition contains your credentials.
// You might want to remove those credentials before you share your notebook.
def setHadoopConfig64e9c4bd3e4148978db0a312dfcc0a93(name: String) = {
    // This function sets the Hadoop configuration so it is possible to
    // access data from Bluemix Object Storage using Spark

    val prefix = "fs.swift.service." + name
    sc.hadoopConfiguration.set(prefix + ".auth.url", "https://identity.open.softlayer.com" + "/v3/auth/tokens")
    sc.hadoopConfiguration.set(prefix + ".auth.endpoint.prefix","endpoints")
    sc.hadoopConfiguration.set(prefix + ".tenant", "419cd8dece644c82af5a615b62af38e1")
    sc.hadoopConfiguration.set(prefix + ".username", "61e622a296924b9f8f5ddd0bd74f96b6")
    sc.hadoopConfiguration.set(prefix + ".password", "w0C9Eh4)Ssel~ihb")
    sc.hadoopConfiguration.setInt(prefix + ".http.port", 8080)
    sc.hadoopConfiguration.set(prefix + ".region", "dallas")
    sc.hadoopConfiguration.setBoolean(prefix + ".public", false)
}

// you can choose any name
val name = "keystone"
setHadoopConfig64e9c4bd3e4148978db0a312dfcc0a93(name)

val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val dfData1 = sqlContext.
    read.format("com.databricks.spark.csv").
    option("header", "true").
    option("inferSchema", "true").
    load("swift://ChurnModelTraing." + name + "/CUST_SUM.csv")
dfData1.show(5)


+----------+---+---+---------+----------+-------+--------+-----+--------+------------+-------+----------+---------+-----+----------------+---------+-----------+
|   CUST_ID|SEX|AGE|EDUCATION|INVESTMENT| INCOME|ACTIVITY|CHURN|YRLY_AMT|AVG_DAILY_TX|YRLY_TX|AVG_TX_AMT|NEGTWEETS|STATE| EDUCATION_GROUP|TwitterID|CHURN_LABEL|
+----------+---+---+---------+----------+-------+--------+-----+--------+------------+-------+----------+---------+-----+----------------+---------+-----------+
|1009530860|  F| 84|        2|    114368|3852862|       5|    0|700259.0|    0.917808|    335|   2090.32|        3|   TX|Bachelors degree|        0|      false|
|1009544000|  F| 44|        2|     90298|3849843|       1|    0|726977.0|    0.950685|    347|   2095.04|        2|   CA|Bachelors degree|        0|      false|
|1009534260|  F| 23|        2|     94881|3217364|       1|    1|579084.0|    0.920548|    336|   1723.46|        5|   CA|Bachelors degree|        0|       true|
|1009574010|  F| 24|        2|    

In [3]:
val Array(training, test) = dfData1.randomSplit(Array(0.8, 0.2), seed = 11L)
println("The number of training data is ",training.count())
println("The number of test data is ",test.count())

(The number of training data is ,4837)
(The number of test data is ,1164)


In [4]:
//Feature definition
val genderIndexer = new StringIndexer().setInputCol("SEX").setOutputCol("gender_code")
val stateIndexer = new StringIndexer().setInputCol("STATE").setOutputCol("state_code")
val labelIndexer = new StringIndexer().setInputCol("CHURN").setOutputCol("label")

val featuresAssembler = new VectorAssembler().setInputCols(Array("AGE", 
                                                         "ACTIVITY", 
                                                         "EDUCATION", 
                                                         "NEGTWEETS" ,
                                                         "INCOME",
                                                         "gender_code",
                                                         "state_code")).setOutputCol("features")

In [5]:
//Select model automatically in candidate algorithm - Logistic Regression, SVM or Decision Tree?
val lr = new LogisticRegression().setRegParam(0.01).setLabelCol("label").setFeaturesCol("features")
val pipeline =new Pipeline().setStages(Array(labelIndexer, genderIndexer, stateIndexer, featuresAssembler,lr))
val auc_eval = new BinaryClassificationEvaluator()
val grid = new ParamGridBuilder().addGrid(lr.regParam, Array(1e-3, 1e-2)).addGrid(lr.elasticNetParam,Array(0.25, 0.0)).build()
val cross_val = new CrossValidator().setEstimator(pipeline).setEvaluator(auc_eval).setEstimatorParamMaps(grid).setNumFolds(3)

In [6]:
val pipeline_model = cross_val.fit(training)
val trainResult=pipeline_model.transform(test)

# Evaluate the trained model and draw the ROC curve

In [7]:
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
val testingResultScores = trainResult.select("prediction","label").rdd.map(r => (r(0).asInstanceOf[Double], r(1) .asInstanceOf[Double]))
val bc = new BinaryClassificationMetrics(testingResultScores)
val roc = bc.roc 

In [8]:
val rocDF = sqlContext.createDataFrame(roc).
                    withColumnRenamed("_1", "FPR").
                    withColumnRenamed("_2", "TPR")
rocDF.show()

+--------------------+-----------------+
|                 FPR|              TPR|
+--------------------+-----------------+
|                 0.0|              0.0|
|0.004640371229698376|0.956953642384106|
|                 1.0|              1.0|
|                 1.0|              1.0|
+--------------------+-----------------+



In [9]:
%AddJar -magic https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar -f

Starting download from https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar
Finished download of spark-kernel-brunel-all-2.3.jar


In [10]:
%%brunel data('rocDF') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid)  

In [11]:
case class Person(AGE:Int,ACTIVITY:Int,EDUCATION:Int,NEGTWEETS:Int,INCOME:Int,SEX:String,STATE:String)
val dataFrame=Seq(Person(40,1,3,4,200000,"M","TX"),Person(40,1,3,8,200000,"M","OR"))
val df = sqlContext.createDataFrame(dataFrame)
df.show()
pipeline_model.transform(df).show()

+---+--------+---------+---------+------+---+-----+
|AGE|ACTIVITY|EDUCATION|NEGTWEETS|INCOME|SEX|STATE|
+---+--------+---------+---------+------+---+-----+
| 40|       1|        3|        4|200000|  M|   TX|
| 40|       1|        3|        8|200000|  M|   OR|
+---+--------+---------+---------+------+---+-----+

+---+--------+---------+---------+------+---+-----+-----------+----------+--------------------+--------------------+--------------------+----------+
|AGE|ACTIVITY|EDUCATION|NEGTWEETS|INCOME|SEX|STATE|gender_code|state_code|            features|       rawPrediction|         probability|prediction|
+---+--------+---------+---------+------+---+-----+-----------+----------+--------------------+--------------------+--------------------+----------+
| 40|       1|        3|        4|200000|  M|   TX|        0.0|      12.0|[40.0,1.0,3.0,4.0...|[2.15500232950555...|[0.89613530131090...|       0.0|
| 40|       1|        3|        8|200000|  M|   OR|        0.0|       5.0|[40.0,1.0,3.0,8.0