
# Model training of the churn data
## Import the required libraries

In [1]:
#import libraries
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# Load the data from Object Storage into dataframe and split for training, testing and evaluating

In [3]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

# This function is used to setup the access of Spark to your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
def set_hadoop_config_with_credentials_64e9c4bd3e4148978db0a312dfcc0a93(name):
    """This function sets the Hadoop configuration so it is possible to
    access data from Bluemix Object Storage using Spark"""

    prefix = 'fs.swift.service.' + name
    hconf = sc._jsc.hadoopConfiguration()
    hconf.set(prefix + '.auth.url', 'https://identity.open.softlayer.com'+'/v3/auth/tokens')
    hconf.set(prefix + '.auth.endpoint.prefix', 'endpoints')
    hconf.set(prefix + '.tenant', '419cd8dece644c82af5a615b62af38e1')
    hconf.set(prefix + '.username', '61e622a296924b9f8f5ddd0bd74f96b6')
    hconf.set(prefix + '.password', 'w0C9Eh4)Ssel~ihb')
    hconf.setInt(prefix + '.http.port', 8080)
    hconf.set(prefix + '.region', 'dallas')
    hconf.setBoolean(prefix + '.public', False)

# you can choose any name
name = 'keystone'
set_hadoop_config_with_credentials_64e9c4bd3e4148978db0a312dfcc0a93(name)

df_data_1 = sqlContext.read.format('com.databricks.spark.csv')\
  .options(header='true', inferschema='true')\
  .load("swift://ChurnModelTraing." + name + "/CUST_SUM.csv")
df_data_1.take(5)


[Row(CUST_ID=1009530860, SEX=u'F', AGE=84, EDUCATION=2, INVESTMENT=114368, INCOME=3852862, ACTIVITY=5, CHURN=0, YRLY_AMT=700259.0, AVG_DAILY_TX=0.917808, YRLY_TX=335, AVG_TX_AMT=2090.32, NEGTWEETS=3, STATE=u'TX', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=u'false'),
 Row(CUST_ID=1009544000, SEX=u'F', AGE=44, EDUCATION=2, INVESTMENT=90298, INCOME=3849843, ACTIVITY=1, CHURN=0, YRLY_AMT=726977.0, AVG_DAILY_TX=0.950685, YRLY_TX=347, AVG_TX_AMT=2095.04, NEGTWEETS=2, STATE=u'CA', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=u'false'),
 Row(CUST_ID=1009534260, SEX=u'F', AGE=23, EDUCATION=2, INVESTMENT=94881, INCOME=3217364, ACTIVITY=1, CHURN=1, YRLY_AMT=579084.0, AVG_DAILY_TX=0.920548, YRLY_TX=336, AVG_TX_AMT=1723.46, NEGTWEETS=5, STATE=u'CA', EDUCATION_GROUP=u'Bachelors degree', TwitterID=0, CHURN_LABEL=u'true'),
 Row(CUST_ID=1009574010, SEX=u'F', AGE=24, EDUCATION=2, INVESTMENT=112099, INCOME=2438218, ACTIVITY=4, CHURN=1, YRLY_AMT=470964.0, AVG_DAILY_TX=0

In [4]:
# Split data into training (80%) and test (20%)
trainDF, testDF = df_data_1.randomSplit([0.8, 0.2])
print "The number of training data is ",trainDF.count()
print "The number of test data is ",df_data_1.count()

The number of training data is  4771
The number of test data is  6001


In [5]:
genderIndexer = StringIndexer(inputCol="SEX",outputCol="gender_code")
stateIndexer = StringIndexer(inputCol="STATE",outputCol="state_code")
labelIndexer = StringIndexer(inputCol="CHURN",outputCol="label")
featuresAssembler = VectorAssembler(inputCols=["AGE","ACTIVITY","EDUCATION","NEGTWEETS" ,"INCOME","gender_code","state_code"],outputCol="features")

In [6]:
lr = LogisticRegression(regParam=0.01,labelCol="label",featuresCol="features")
pipeline = Pipeline(stages=[labelIndexer, genderIndexer, stateIndexer, featuresAssembler,lr])
auc_eval = BinaryClassificationEvaluator()
grid = ParamGridBuilder() \
    .addGrid(lr.regParam, [1e-4, 1e-2]) \
    .addGrid(lr.elasticNetParam, [0.5, 0.0]) \
    .build()
cross_val = CrossValidator(estimator=pipeline, evaluator=auc_eval, estimatorParamMaps=grid, numFolds=3)

In [7]:
pipeline_model = cross_val.fit(trainDF)
testResult=pipeline_model.transform(testDF)




# Evaluate the trained model and draw the ROC curve

In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve
testDF=testResult.select("prediction","label").toPandas()
a=np.array(testDF.label)
b=np.array(testDF.prediction)
fpr, tpr, thresholds = roc_curve(a, b, pos_label=1)
data={'FPR':fpr,'TPR':tpr}
rocPD=pd.DataFrame(data)

In [9]:
import brunel
%brunel data('rocPD') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid)

<IPython.core.display.Javascript object>

In [10]:
dataFrame=[(40,1,3,4,200000,'M','TX'),(40,1,3,8,200000,'M','OR')]
dataTest=sqlContext.createDataFrame(dataFrame,["AGE","ACTIVITY","EDUCATION","NEGTWEETS" ,"INCOME","SEX","STATE"])
dataTest.show()
pipeline_model.transform(dataTest).select("gender_code","state_code","features","prediction").show() 

+---+--------+---------+---------+------+---+-----+
|AGE|ACTIVITY|EDUCATION|NEGTWEETS|INCOME|SEX|STATE|
+---+--------+---------+---------+------+---+-----+
| 40|       1|        3|        4|200000|  M|   TX|
| 40|       1|        3|        8|200000|  M|   OR|
+---+--------+---------+---------+------+---+-----+

+-----------+----------+--------------------+----------+
|gender_code|state_code|            features|prediction|
+-----------+----------+--------------------+----------+
|        0.0|      22.0|[40.0,1.0,3.0,4.0...|       0.0|
|        0.0|       5.0|[40.0,1.0,3.0,8.0...|       1.0|
+-----------+----------+--------------------+----------+

