In [None]:
# Import libraries required for reading data files
from pyspark import SparkContext
from pyspark.sql import SQLContext
# Obtain the SparkContext via getOrCreate() rather than constructing one
# directly, so an already-running context is reused when this cell is re-run.
sc = SparkContext.getOrCreate()
# Initialize SparkSQL Context
# (every CSV read further down goes through `sqlContext.read`)
sqlContext = SQLContext(sc)

In [6]:
# Load assets/Churn_Rate.csv into a Spark DataFrame.
# The first row is treated as the header and column types are inferred.
df_data_1 = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('assets/Churn_Rate.csv')
)
# Preview the first five rows
df_data_1.show(5)

+----+-------+------------+----------+
|Year|Quarter|Quarter_Year|Churn_Rate|
+----+-------+------------+----------+
|2014|      1|        1Q14|      18.1|
|2014|      2|        2Q14|      18.7|
|2014|      3|        3Q14|      19.3|
|2014|      4|        4Q14|      19.9|
|2015|      1|        1Q15|      20.5|
+----+-------+------------+----------+
only showing top 5 rows



In [7]:
# Transform Spark Data Frame to Pandas Data Frame for visualization.
# The Brunel plot cell refers to this frame by its variable name ('d1').
d1 = df_data_1.toPandas()

In [8]:
# Make a plot with brunel
# NOTE(review): importing brunel presumably registers the %brunel line magic
# used below -- confirm against the Brunel notebook documentation.
import brunel
# Bar chart of Churn_Rate for each Quarter_Year, read from the pandas frame
# `d1`, sorted by Year ascending, with all fields shown in the tooltip.
%brunel data('d1') x(Quarter_Year) y(Churn_Rate) bar tooltip(#all) sort(Year:ascending)

<IPython.core.display.Javascript object>

In [9]:
# Load the customer summary (assets/CUST_SUM.csv) into a Spark DataFrame.
# The first row is treated as the header and column types are inferred.
df_data_2 = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('assets/CUST_SUM.csv')
)
# Preview the first five rows
df_data_2.show(5)

+----------+---+---+---------+----------+-------+--------+-----+--------+------------+-------+----------+---------+-----+----------------+---------+-----------+
|   CUST_ID|SEX|AGE|EDUCATION|INVESTMENT| INCOME|ACTIVITY|CHURN|YRLY_AMT|AVG_DAILY_TX|YRLY_TX|AVG_TX_AMT|NEGTWEETS|STATE| EDUCATION_GROUP|TwitterID|CHURN_LABEL|
+----------+---+---+---------+----------+-------+--------+-----+--------+------------+-------+----------+---------+-----+----------------+---------+-----------+
|1009530860|  F| 84|        2|    114368|3852862|       5|    0|700259.0|    0.917808|    335|   2090.32|        3|   TX|Bachelors degree|        0|      false|
|1009544000|  F| 44|        2|     90298|3849843|       1|    0|726977.0|    0.950685|    347|   2095.04|        2|   CA|Bachelors degree|        0|      false|
|1009534260|  F| 23|        2|     94881|3217364|       1|    1|579084.0|    0.920548|    336|   1723.46|        5|   CA|Bachelors degree|        0|       true|
|1009574010|  F| 24|        2|    

In [10]:
# Convert the customer summary to a pandas DataFrame for visualization.
# The Brunel map cell refers to this frame by its variable name ('d2').
d2 = df_data_2.toPandas()

In [11]:
# Map of the pandas frame `d2` keyed and labelled by STATE, coloured by INCOME
# with mean(INCOME) applied; rendered at 800x500.
# NOTE(review): presumably this shows the mean income per state -- confirm
# against the Brunel syntax reference.
%brunel data('d2') map key(STATE) x(STATE) color(INCOME) mean(INCOME) label(STATE) tooltip(#all) :: width=800, height=500

<IPython.core.display.Javascript object>

In [12]:
# Import libraries required for machine learning in spark
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [15]:
# Split the data into training and testing sets (80% / 20%).
# A fixed seed makes the split -- and therefore every downstream metric --
# reproducible across kernel restarts; without it each run samples differently.
SPLIT_SEED = 42
trainDF, testDF = df_data_2.randomSplit([0.8, 0.2], seed=SPLIT_SEED)
# Parenthesised, %-formatted prints emit the same text under Python 2 and 3
# as the original `print "...", n` statements did under Python 2.
print("%s %s" % ("The number of training data is ", trainDF.count()))
print("%s %s" % ("The number of test data is ", testDF.count()))

The number of training data is  4753
The number of test data is  1248


In [16]:
# Index the CHURN column into the "label" column that the classifier reads.
# NOTE(review): the mapping from CHURN values to label indices is decided by
# StringIndexer -- confirm that CHURN=1 ends up as label 1.0 on this data.
labelIndexer = StringIndexer(
    inputCol="CHURN",
    outputCol="label",
)

# Index the string-typed feature columns into numeric codes.
genderIndexer = StringIndexer(
    inputCol="SEX",
    outputCol="gender_code",
)
stateIndexer = StringIndexer(
    inputCol="STATE",
    outputCol="state_code",
)

In [17]:
# Combine the chosen numeric columns and the two indexed categorical codes
# into a single "features" vector column for the classifier.
featuresAssembler = VectorAssembler(
    inputCols=[
        "AGE",
        "ACTIVITY",
        "EDUCATION",
        "NEGTWEETS",
        "INCOME",
        "gender_code",
        "state_code",
    ],
    outputCol="features",
)

In [18]:
# Logistic regression classifier reading the assembled "features" column and
# the indexed "label" column.
# NOTE(review): the cross-validation grid also sets lr.regParam, so the value
# given here is expected to act only as a default -- confirm.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    regParam=0.01,
)

# Stage order matters: the indexers must run before the assembler, which
# consumes gender_code / state_code, and the classifier runs last.
pipeline = Pipeline(
    stages=[
        labelIndexer,
        genderIndexer,
        stateIndexer,
        featuresAssembler,
        lr,
    ]
)

In [19]:
# Evaluator used to score each candidate model, built with default settings.
# NOTE(review): the name suggests area under the ROC curve -- confirm the
# default metric in the Spark docs.
auc_eval = BinaryClassificationEvaluator()

# Candidate hyper-parameters: 2 regParam values x 2 elasticNetParam values,
# i.e. four combinations.
grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [1e-4, 1e-2])
    .addGrid(lr.elasticNetParam, [0.5, 0.0])
    .build()
)

# 3-fold cross validation over the whole pipeline for parameter selection.
cross_val = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=grid,
    evaluator=auc_eval,
    numFolds=3,
)

In [20]:
# Train the model
# Fitting the CrossValidator on the training split runs the parameter search;
# `pipeline_model` is the fitted result returned by `cross_val.fit`.
pipeline_model = cross_val.fit(trainDF)
# Make prediction with test data
# `testResult` is a Spark DataFrame; the ROC cell selects its "prediction" and
# "label" columns.
testResult=pipeline_model.transform(testDF)

In [26]:
# Draw ROC
import pandas as pd
from sklearn.metrics import roc_curve

# Collect the true labels and the model's class-probability vectors locally.
# A separate name is used so the Spark `testDF` from the split cell is not
# overwritten, which keeps the train/predict cell re-runnable.
rocInputPD = testResult.select("probability", "label").toPandas()

# roc_curve needs a continuous score to sweep thresholds over; the hard 0/1
# `prediction` column yields only a single operating point.  Element 1 of the
# probability vector is the estimated probability of label 1.0.
churnScore = rocInputPD["probability"].apply(lambda v: float(v[1]))

fpr, tpr, thresholds = roc_curve(rocInputPD["label"], churnScore, pos_label=1)

# `rocPD` is the frame the Brunel ROC plot looks up by name.
rocPD = pd.DataFrame({'FPR': fpr, 'TPR': tpr})

In [29]:
# ROC curve: line plot of TPR against FPR from the pandas frame `rocPD`,
# with titled axes and grid lines on both.
%brunel data('rocPD') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid)

<IPython.core.display.Javascript object>