## Model training of the churn data

### Import the required libraries

In [1]:
//import libraries
import org.apache.spark.{SparkConf, SparkContext, SparkFiles}
import org.apache.spark.sql.{SQLContext, SparkSession, Row}
import org.apache.spark.SparkFiles

import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.ibm.transformers.RenameColumn

import com.ibm.analytics.ngp.ingest.Sampling
import com.ibm.analytics.ngp.pipeline._
import com.ibm.analytics.ngp.util._
import com.ibm.analytics.ngp.pipeline.evaluate.{Evaluator,MLProblemType}

import com.ibm.analytics.{Learner, Target}
import com.ibm.analytics.cads.CADSEstimator

### Load the data from DB2 for z/OS into a DataFrame and split it into training, testing, and validation sets

In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._ 

// Load data from DB2 for z/OS using the JDBC driver.
//
// The connection URL, user, and password may be supplied through the
// DB2_JDBC_URL, DB2_USER, and DB2_PASSWORD environment variables so that
// credentials do not have to be edited into the notebook. When a variable is
// not set, the value originally hard-coded here is used, so existing runs
// behave exactly as before.
// NOTE(review): the fallback credentials are kept only for backward
// compatibility; consider removing them once the environment is configured.
val churnDataRaw = {
  // Resolved inside a block so the credentials are not exposed as notebook-level vals.
  val jdbcUrl = sys.env.getOrElse("DB2_JDBC_URL", "jdbc:db2://9.125.72.72:430/LOCDB11")
  val jdbcUser = sys.env.getOrElse("DB2_USER", "tuser01")
  val jdbcPassword = sys.env.getOrElse("DB2_PASSWORD", "c4deshop")

  spark.read.format("jdbc").
    options(Map("driver" -> "com.ibm.db2.jcc.DB2Driver",
                "url" -> jdbcUrl,
                "user" -> jdbcUser,
                "password" -> jdbcPassword,
                "dbtable" -> "SA.CUST_SUM")).load()
}

// UDF converting an Int column to Double.
// NOTE(review): not referenced by any cell visible in this notebook; kept in
// case other cells rely on it.
val toDouble = udf {x: Int => x.toDouble}

// Keep only the columns used as model features plus the churn label.
val churnData = churnDataRaw.select("AGE", "ACTIVITY", "EDUCATION", "SEX", "STATE", "NEGTWEETS", "INCOME", "CHURN_LABEL")
churnData.show(5)

+---+--------+---------+---+-----+---------+-----------+-----------+
|AGE|ACTIVITY|EDUCATION|SEX|STATE|NEGTWEETS|     INCOME|CHURN_LABEL|
+---+--------+---------+---+-----+---------+-----------+-----------+
| 84|       5|        2|  F|   TX|        3|3852862.000|      false|
| 44|       1|        2|  F|   CA|        2|3849843.000|      false|
| 23|       1|        2|  F|   CA|        5|3217364.000|       true|
| 24|       4|        2|  F|   WA|        2|2438218.000|       true|
| 67|       3|        5|  F|   CT|        3|2428245.000|      false|
+---+--------+---------+---+-----+---------+-----------+-----------+
only showing top 5 rows



In [3]:
// Relative sizes handed to Sampling.trainingSplit for the three data sets
// (presumably percentages, since they sum to 100 -- confirm against the Sampling API).
val train = 70
val test = 15
val validate = 15

// Split the data into a training data set, a testing data set, and a validation data set
val splits = Sampling.trainingSplit(churnData, train, test, validate)

val trainingDF = splits._1
val testDF = splits._2
val validationDF = splits._3

// Print a heading followed by the first five rows of each split
Seq(
  "Training data set" -> trainingDF,
  "Testing data set" -> testDF,
  "Validation data set" -> validationDF
).foreach { case (heading, dataSet) =>
  println(heading)
  dataSet.show(5)
}

Training data set
+---+--------+---------+---+-----+---------+---------+-----------+
|AGE|ACTIVITY|EDUCATION|SEX|STATE|NEGTWEETS|   INCOME|CHURN_LABEL|
+---+--------+---------+---+-----+---------+---------+-----------+
| 20|       0|        1|  F|   ID|       13|17877.000|       true|
| 20|       0|        1|  F|   WA|       15|15497.000|       true|
| 20|       1|        1|  F|   PA|        6|19556.000|       true|
| 20|       1|        1|  F|   WV|        3|20687.000|      false|
| 20|       1|        1|  M|   ID|        6|18453.000|       true|
+---+--------+---------+---+-----+---------+---------+-----------+
only showing top 5 rows

Testing data set
+---+--------+---------+---+-----+---------+---------+-----------+
|AGE|ACTIVITY|EDUCATION|SEX|STATE|NEGTWEETS|   INCOME|CHURN_LABEL|
+---+--------+---------+---+-----+---------+---------+-----------+
| 20|       0|        1|  F|   CA|        7|17088.000|       true|
| 20|       1|        1|  F|   TN|        3|13480.000|      false|
| 

### Use CADS (Cognitive Assistant for Data Scientists) to train and automatically recommend the best model from Decision Tree (DT) and Logistic Regression (LR)

In [4]:
// Feature definition: index the string-valued columns (and the label), then
// assemble all model inputs into a single "features" vector column.

val genderIndexer = new StringIndexer().
  setInputCol("SEX").
  setOutputCol("gender_code")

val stateIndexer = new StringIndexer().
  setInputCol("STATE").
  setOutputCol("state_code")

val labelIndexer = new StringIndexer().
  setInputCol("CHURN_LABEL").
  setOutputCol("label")

// Source columns used as-is, followed by the two indexed columns produced above
val featureColumns = Array(
  "AGE", "ACTIVITY", "EDUCATION", "NEGTWEETS", "INCOME",
  "gender_code", "state_code")

val featuresAssembler = new VectorAssembler().
  setInputCols(featureColumns).
  setOutputCol("features")

In [5]:
// Candidate algorithms for automatic model selection: Logistic Regression and Decision Tree.
// Both read the "features" vector column and the indexed "label" column.
val lr = new LogisticRegression().
  setLabelCol("label").
  setFeaturesCol("features").
  setRegParam(0.01)

val decisionTree = new DecisionTreeClassifier().
  setLabelCol("label").
  setFeaturesCol("features").
  setMaxBins(50)

In [6]:
// Cognitive Assistant for Data Scientists (CADS) - predict model performance based on sampled data

// Candidate learners are scored by area under the ROC curve
val rocEvaluator = new BinaryClassificationEvaluator().setMetricName("areaUnderROC")

val learners = List(Learner("DT", decisionTree), Learner("LR", lr))

val cads = CADSEstimator().
  setEvaluator(rocEvaluator).
  setLearners(learners).
  setKeepBestNLearnersParam(3).
  setTarget(Target("rawPrediction", "label")).
  setNumSampleFoldsParam(2)

// Stage order: label indexer, categorical indexers, feature assembler, CADS estimator
val pipeline = new IBMSparkPipeline().
  setStages(Array(labelIndexer, genderIndexer, stateIndexer, featuresAssembler, cads))

// Fit the whole pipeline on the training split
val model = pipeline.fit(trainingDF)

### Evaluate the trained model and draw the ROC curve

In [1]:
import com.ibm.analytics.ngp.pipeline.evaluate._
import com.ibm.analytics.ngp.pipeline.evaluate.JsonMetricsModel._
import spray.json._

// Evaluate the fitted pipeline model on the testing data set as a binary classifier
val metrics = Evaluator.evaluateModel(MLProblemType.BinaryClassifier, model, testDF)

// Render the metrics as JSON and print them
val binaryMetricsJson = metrics.asInstanceOf[BinaryClassificationMetricsModel].toJson
println(s"Binary Metric: ${binaryMetricsJson}")

// Saving the model on the file system if needed:
//model.saveToLocalPath("bfusr21/churnModel", "/home/bfusr21/model.tar.gz") 

// Select the environment and meta service host, then save the model under its name
Connections.setEnvironment("dev")
Connections.setMetaServiceHost("http://9.30.166.110:12501")
model.save("steve/ChurnCADSModel")

println("Model saved successfully, you can view and deploy in the model management dashboard")

Binary Metric: {"recallByThreshold":[{"threshold":1.0,"metric":0.9018691588785047},{"threshold":0.0,"metric":1.0}],"precisionByThreshold":[{"threshold":1.0,"metric":0.9897435897435898},{"threshold":0.0,"metric":0.23336968375136313}],"areaUnderPR":0.9572567559904366,"fMeasureByThreshold":[{"threshold":1.0,"metric":0.9437652811735943},{"threshold":0.0,"metric":0.37842617152961977}],"roc":[{"threshold":0.0,"metric":0.0},{"threshold":0.002844950213371266,"metric":0.9018691588785047},{"threshold":1.0,"metric":1.0},{"threshold":1.0,"metric":1.0}],"areaUnderROC":0.9495121043325667}
Model saved successfully, you can view and deploy in the model management dashboard


In [8]:
// Turn each ROC entry into a plain pair; the pairs are later labelled FPR / TPR
// when the curve is converted to a DataFrame.
val rocCurve = {
  val binaryMetrics = metrics.asInstanceOf[BinaryClassificationMetricsModel]
  binaryMetrics.roc.map { case ThresholdMetricModel(fpr, tpr) => (fpr, tpr) }
}

In [9]:
// Build a two-column DataFrame from the ROC pairs, naming the tuple columns
// FPR and TPR in a single step, then preview the first three points.
val rocDF = spark.createDataFrame(rocCurve).toDF("FPR", "TPR")
rocDF.show(3)

+--------------------+------------------+
|                 FPR|               TPR|
+--------------------+------------------+
|                 0.0|               0.0|
|0.002844950213371266|0.9018691588785047|
|                 1.0|               1.0|
+--------------------+------------------+
only showing top 3 rows



In [10]:
%AddJar -magic https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar -f

Starting download from https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar
Finished download of spark-kernel-brunel-all-2.3.jar


In [11]:
%%brunel data('rocDF') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid) title('ROC') 