## Customer Churn Model Scoring

### Step 1: Download new customer data



In [1]:
import wget
# Raw GitHub location of the CSV holding the new customers to be scored.
url_customer='https://raw.githubusercontent.com/yfphoon/dsx_demo/master/data/new_customer_churn_data.csv'

# Remove any copy left over from a previous run before downloading
# (presumably so wget.download saves under the original name rather than a renamed duplicate -- TODO confirm).
!rm -f new_customer_churn_data.csv

# Keep the value returned by wget.download; it is passed to the CSV reader when the data is loaded.
customerFilename=wget.download(url_customer)

# List the file to confirm that the download succeeded.
!ls -l new_customer_churn_data.csv

-rw-r----- 1 s191-4754435a94b541-9fb5ca908bcc users 27597 May 26 18:55 new_customer_churn_data.csv


### Step 2: Read data into a DataFrame
Note: the new dataset does not contain the label column

In [2]:
# Read the downloaded CSV, taking column names from the header row and
# letting Spark infer each column's type.
csv_reader = (
    sqlContext.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
)
newData = csv_reader.load(customerFilename)

In [3]:
# Strip the spaces from the two column names that contain them.
# NOTE(review): presumably these must match the column names the saved model expects -- confirm.
newData = (
    newData
    .withColumnRenamed("Est Income", "EstIncome")
    .withColumnRenamed("Car Owner", "CarOwner")
)

# Preview only: limit on the Spark side so that just five rows are collected
# to the driver instead of converting the whole dataset to pandas first.
newData.limit(5).toPandas()

Unnamed: 0,ID,Gender,Status,Children,EstIncome,CarOwner,Age,LongDistance,International,Local,Dropped,Paymethod,LocalBilltype,LongDistanceBilltype,Usage,RatePlan
0,2048,F,S,1,13576.5,N,39.426667,14.83,0,25.66,0,CC,Budget,Standard,40.49,1
1,2054,F,M,2,84166.1,N,54.013333,3.28,0,11.74,1,CC,Budget,Standard,15.02,2
2,2075,F,S,0,68427.4,N,42.393333,23.76,0,50.05,0,Auto,FreeLocal,Standard,73.81,3
3,2095,F,M,2,77551.1,Y,33.6,20.53,0,41.89,1,CC,Budget,Intnl_discount,62.42,2
4,2108,F,S,1,13109.1,N,62.606667,22.38,0,40.48,0,Auto,Budget,Standard,62.87,1


### Step 3: Load Saved Model
Load the saved model from Object Storage.

In [5]:
from pyspark.ml import PipelineModel

# Path of the previously saved pipeline model.
saved_model_path = "PredictChurn.churnModel"
model1_loaded = PipelineModel.load(saved_model_path)

### Step 4: Score the new data
Note: The scored output contains the predicted values and confidence scores

In [6]:
# Run the loaded pipeline over the new customers; the result keeps the input
# columns and adds the columns produced by the pipeline stages.
results = model1_loaded.transform(newData)

# Preview only: restrict to four rows in Spark before converting, so the
# driver does not have to materialise the complete scored dataset in pandas.
results.limit(4).toPandas()

Unnamed: 0,ID,Gender,Status,Children,EstIncome,CarOwner,Age,LongDistance,International,Local,...,GenderEncoded,StatusEncoded,CarOwnerEncoded,PaymethodEncoded,LocalBilltypeEncoded,LongDistanceBilltypeEncoded,features,rawPrediction,probability,prediction
0,2048,F,S,1,13576.5,N,39.426667,14.83,0,25.66,...,0,1,0,0,0,0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 13576.5, 3...","[1.08446712018, 18.9155328798]","[0.0542233560091, 0.945776643991]",1
1,2054,F,M,2,84166.1,N,54.013333,3.28,0,11.74,...,0,0,0,0,0,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 84166.1, 5...","[7.7855992356, 12.2144007644]","[0.38927996178, 0.61072003822]",1
2,2075,F,S,0,68427.4,N,42.393333,23.76,0,50.05,...,0,1,0,1,1,0,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 68427.4, 4...","[19.4280557204, 0.571944279588]","[0.971402786021, 0.0285972139794]",0
3,2095,F,M,2,77551.1,Y,33.6,20.53,0,41.89,...,0,0,1,0,0,1,"[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 77551.1, 3...","[19.8942788074, 0.105721192587]","[0.994713940371, 0.00528605962933]",0


### Step 5: Export the scores to a CSV file

In [7]:
# Keep only the customer ID, the predicted value and the probability vector.
score_columns = ["ID", "prediction", "probability"]
r1 = results.select(*score_columns)

# Print the first five rows without truncating the long probability vectors.
r1.show(n=5, truncate=False)

+----+----------+------------------------------------------+
|ID  |prediction|probability                               |
+----+----------+------------------------------------------+
|2048|1.0       |[0.054223356009070287,0.9457766439909298] |
|2054|1.0       |[0.3892799617799617,0.6107200382200382]   |
|2075|0.0       |[0.9714027860205997,0.028597213979400397] |
|2095|0.0       |[0.9947139403706687,0.0052860596293311845]|
|2108|1.0       |[0.08602071740858505,0.9139792825914149]  |
+----+----------+------------------------------------------+
only showing top 5 rows



#### Decompose the probability column
The probability column contains a vector for each record; its elements must be extracted into separate scalar columns before the scores can be written to a CSV file.

In [8]:
from pyspark.sql import Row
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors

def _first_element(vector):
    """Return element 0 of the given vector as a plain Python float."""
    return float(vector[0])


def _second_element(vector):
    """Return element 1 of the given vector as a plain Python float."""
    return float(vector[1])


# Wrap the extractors as Spark UDFs that produce double-typed columns.
udf_0 = udf(_first_element, DoubleType())
udf_1 = udf(_second_element, DoubleType())

# Add one scalar column per vector element, then drop the vector column itself.
selected = r1.select(r1["ID"], r1["prediction"], r1["probability"])
expanded = selected.withColumn('probability_0', udf_0(r1["probability"]))
expanded = expanded.withColumn('probability_1', udf_1(r1["probability"]))
r2 = expanded.drop("probability")

r2.show(10, truncate=False)

+----+----------+--------------------+---------------------+
|ID  |prediction|probability_0       |probability_1        |
+----+----------+--------------------+---------------------+
|2048|1.0       |0.054223356009070287|0.9457766439909298   |
|2054|1.0       |0.3892799617799617  |0.6107200382200382   |
|2075|0.0       |0.9714027860205997  |0.028597213979400397 |
|2095|0.0       |0.9947139403706687  |0.0052860596293311845|
|2108|1.0       |0.08602071740858505 |0.9139792825914149   |
|2124|0.0       |0.9866935483870968  |0.013306451612903225 |
|2154|1.0       |0.38106060606060604 |0.6189393939393939   |
|2218|0.0       |0.9833333333333334  |0.01666666666666667  |
|2267|0.0       |0.975991974379071   |0.024008025620928842 |
|2284|1.0       |0.11277056277056277 |0.8872294372294371   |
+----+----------+--------------------+---------------------+
only showing top 10 rows



#### Connect to Object Storage

In [9]:
# @hidden_cell
from pyspark.sql import SparkSession

# @hidden_cell
# This function is used to setup the access of Spark to your Object Storage.
# The credentials are read from environment variables so that no secret is stored in the notebook.
# NOTE: any credentials that were previously embedded in this cell should be treated as exposed and rotated.
def set_hadoop_config_with_credentials_78e95108d20b4b6eb7f928636070a5c2(name):
    """Set the Hadoop configuration so Spark can access Bluemix Object Storage.

    The tenant, username and password are taken from the environment
    variables ``OBJECT_STORAGE_TENANT``, ``OBJECT_STORAGE_USERNAME`` and
    ``OBJECT_STORAGE_PASSWORD`` instead of being hard-coded here.

    Args:
        name: Service name; every property is set under
            ``fs.swift.service.<name>``.

    Raises:
        RuntimeError: If any required environment variable is unset or empty.
    """
    import os  # local import keeps this cell self-contained

    # Property suffix -> environment variable that supplies its value.
    required = {
        '.tenant': 'OBJECT_STORAGE_TENANT',
        '.username': 'OBJECT_STORAGE_USERNAME',
        '.password': 'OBJECT_STORAGE_PASSWORD',
    }

    # Validate before touching the Spark context so the failure is explicit.
    missing = sorted(var for var in required.values() if not os.environ.get(var))
    if missing:
        raise RuntimeError(
            'Object Storage credentials are not configured; set the environment '
            'variable(s): ' + ', '.join(missing)
        )

    prefix = 'fs.swift.service.' + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + '.auth.url', 'https://identity.open.softlayer.com'+'/v3/auth/tokens')
    hconf.set(prefix + '.auth.endpoint.prefix', 'endpoints')
    for suffix, var in required.items():
        hconf.set(prefix + suffix, os.environ[var])
    hconf.setInt(prefix + '.http.port', 8080)
    hconf.set(prefix + '.region', 'dallas')
    hconf.setBoolean(prefix + '.public', False)

# Any service name works, as long as the same value is used in the swift:// URLs.
name = 'keystone'
set_hadoop_config_with_credentials_78e95108d20b4b6eb7f928636070a5c2(name=name)

# Obtain the Spark session (an existing one is reused if present).
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

#### Write scores to a .csv file

In [10]:
# Swift URL under which the scores are stored.
scores_url = 'swift://PredictChurn.' + name + '/churn_scores.csv'

# Replace any output left by an earlier run.
r2.write.csv(scores_url, mode='overwrite')

# Show csv file can be read back
r3 = spark.read.csv(scores_url)

# The data is read back without header information, so the positional _cN
# columns are given their original names again for display.
renamed = [
    r3[source].alias(target)
    for source, target in [
        ("_c0", "ID"),
        ("_c1", "prediction"),
        ("_c2", "probability_0"),
        ("_c3", "probability_1"),
    ]
]
r3.select(*renamed).show(5, truncate=False)

+----+----------+--------------------+---------------------+
|ID  |prediction|probability_0       |probability_1        |
+----+----------+--------------------+---------------------+
|2048|1.0       |0.054223356009070287|0.9457766439909298   |
|2054|1.0       |0.3892799617799617  |0.6107200382200382   |
|2075|0.0       |0.9714027860205997  |0.028597213979400397 |
|2095|0.0       |0.9947139403706687  |0.0052860596293311845|
|2108|1.0       |0.08602071740858505 |0.9139792825914149   |
+----+----------+--------------------+---------------------+
only showing top 5 rows



### Step 6: Schedule this notebook to run at a time and frequency of your choice
Click on the "clock" icon at the top right

You have come to the end of this notebook

**Sidney Phoon** <br/>
yfphoon@us.ibm.com<br/>
May 4th, 2017