## Customer Churn Model Scoring

### Step 1: Download new customer data



In [1]:
# Fetch the new-customer CSV from GitHub into the current working directory.
import wget
url_customer='https://raw.githubusercontent.com/nwngeek212/DSX-DemoCenter/master/predictCustomerChurn/data_assets/new_customer_churn_data.csv'

# Delete any copy left over from a previous run before downloading again
# (-f keeps rm quiet when the file does not exist yet).
!rm -f new_customer_churn_data.csv

# Keep the value returned by wget.download; the read step later in the notebook
# passes it to load() as the path of the file to read.
customerFilename=wget.download(url_customer)

# List the downloaded file to confirm that it exists and to show its size.
!ls -l new_customer_churn_data.csv

-rw------- 1 s052-10aa358d73d58a-8f23243c3997 users 15579 Sep 14 03:20 new_customer_churn_data.csv


### Step 2: Read data into a DataFrame
Note: the new dataset does not contain the label column

In [3]:
# Read the downloaded CSV into a Spark DataFrame: the first row is treated as
# the header and the column types are inferred from the data.
newData = (
    sqlContext.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(customerFilename)
)

In [4]:
# Rename the two columns whose names contain a space.
newData = (
    newData
    .withColumnRenamed("Est Income", "EstIncome")
    .withColumnRenamed("Car Owner", "CarOwner")
)

# Preview the first five rows. limit() is applied before toPandas() so that only
# those rows are collected to the driver, instead of converting the whole
# DataFrame to pandas and then discarding everything but the head.
newData.limit(5).toPandas()

Unnamed: 0,ID,Gender,Status,Children,EstIncome,CarOwner,Age,LongDistance,International,Local,Dropped,Paymethod,LocalBilltype,LongDistanceBilltype,Usage,RatePlan
0,2048,F,S,1,13576.5,N,39.426667,14.83,0,25.66,0,CC,Budget,Standard,40.49,1
1,2054,F,M,2,84166.1,N,54.013333,3.28,0,11.74,1,CC,Budget,Standard,15.02,2
2,2075,F,S,0,68427.4,N,42.393333,23.76,0,50.05,0,Auto,FreeLocal,Standard,73.81,3
3,2095,F,M,2,77551.1,Y,33.6,20.53,0,41.89,1,CC,Budget,Intnl_discount,62.42,2
4,2108,F,S,1,13109.1,N,62.606667,22.38,0,40.48,0,Auto,Budget,Standard,62.87,1


### Step 3: Load Saved Model
Load the previously saved pipeline model from `PredictChurn.churnModel`.

In [5]:
# PipelineModel.load reads a saved pipeline model back from the given path.
from pyspark.ml import PipelineModel
# "PredictChurn.churnModel" is the path the model is loaded from; it must already
# exist — presumably written by a separate training notebook (TODO confirm).
model1_loaded = PipelineModel.load("PredictChurn.churnModel")

### Step 4: Score the new data
Note: The scored output contains the predicted values (`prediction`) and the confidence scores (`probability`) for each record.

In [6]:
# Apply the loaded pipeline model to the new customer records.
results = model1_loaded.transform(newData)

# Preview the first four scored rows. limit() is applied before toPandas() so
# that only those rows are scored and collected to the driver, instead of
# converting the entire scored DataFrame to pandas just to display its head.
results.limit(4).toPandas()

Unnamed: 0,ID,Gender,Status,Children,EstIncome,CarOwner,Age,LongDistance,International,Local,...,GenderEncoded,StatusEncoded,CarOwnerEncoded,PaymethodEncoded,LocalBilltypeEncoded,LongDistanceBilltypeEncoded,features,rawPrediction,probability,prediction
0,2048,F,S,1,13576.5,N,39.426667,14.83,0,25.66,...,0,1,0,0,0,0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 13576.5, 3...","[0.347826086957, 19.652173913]","[0.0173913043478, 0.982608695652]",1
1,2054,F,M,2,84166.1,N,54.013333,3.28,0,11.74,...,0,0,0,0,0,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 84166.1, 5...","[7.86514005602, 12.134859944]","[0.393257002801, 0.606742997199]",1
2,2075,F,S,0,68427.4,N,42.393333,23.76,0,50.05,...,0,1,0,1,1,0,"[0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 68427.4, 4...","[18.8454155863, 1.15458441371]","[0.942270779314, 0.0577292206856]",0
3,2095,F,M,2,77551.1,Y,33.6,20.53,0,41.89,...,0,0,1,0,0,1,"[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 2.0, 77551.1, 3...","[19.527457771, 0.472542229013]","[0.976372888549, 0.0236271114506]",0


### Step 5: Export the scores to a CSV file

In [7]:
# Keep only the record ID, the predicted value and the probability vector
# from the scored DataFrame.
r1 = results.select("ID", "prediction", "probability")

# Display the first five rows without truncating the long probability values.
r1.show(5, truncate=False)

+----+----------+-----------------------------------------+
|ID  |prediction|probability                              |
+----+----------+-----------------------------------------+
|2048|1.0       |[0.017391304347826087,0.9826086956521738]|
|2054|1.0       |[0.39325700280112047,0.6067429971988795] |
|2075|0.0       |[0.942270779314416,0.05772922068558384]  |
|2095|0.0       |[0.9763728885493592,0.023627111450640864]|
|2108|1.0       |[0.075,0.925]                            |
+----+----------+-----------------------------------------+
only showing top 5 rows



#### Decompose the probability column
The probability column contains a vector for each record; its elements must be extracted into separate columns (`probability_0` and `probability_1`).

In [8]:
from pyspark.sql import Row
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import udf
from pyspark.ml.linalg import Vectors

# One UDF per vector position: each returns a single element of the
# probability vector as a double.
udf_0 = udf(lambda vec: float(vec[0]), DoubleType())
udf_1 = udf(lambda vec: float(vec[1]), DoubleType())

# Add one scalar column per vector element, then drop the vector column itself.
r2 = (
    r1.select("ID", "prediction", "probability")
      .withColumn("probability_0", udf_0(r1["probability"]))
      .withColumn("probability_1", udf_1(r1["probability"]))
      .drop("probability")
)

# Display the first ten rows without truncating the values.
r2.show(10, truncate=False)

+----+----------+--------------------+--------------------+
|ID  |prediction|probability_0       |probability_1       |
+----+----------+--------------------+--------------------+
|2048|1.0       |0.017391304347826087|0.9826086956521738  |
|2054|1.0       |0.39325700280112047 |0.6067429971988795  |
|2075|0.0       |0.942270779314416   |0.05772922068558384 |
|2095|0.0       |0.9763728885493592  |0.023627111450640864|
|2108|1.0       |0.075               |0.925               |
|2124|0.0       |0.9919057692664552  |0.00809423073354488 |
|2154|1.0       |0.16981225296442687 |0.8301877470355731  |
|2218|0.0       |0.9143637805237788  |0.08563621947622133 |
|2267|0.0       |0.9805743411792578  |0.01942565882074223 |
|2284|1.0       |0.08849206349206348 |0.9115079365079366  |
+----+----------+--------------------+--------------------+
only showing top 10 rows



### Step 6: Schedule this notebook to run at a time and frequency of your choice
Click on the "clock" icon at the top right

You have come to the end of this notebook

**Sidney Phoon** <br/>
yfphoon@us.ibm.com<br/>
May 4th, 2017