<h5> 
<ol>
 <li> Identify the features we want to use </li>
 <li> Train multiple models on those features </li>
 <li> Print the results </li>
</ol> 
</h5>

In [2]:

#uncomment for jupyter notebook
#import findspark
#findspark.init()

#import pyspark
#sc = pyspark.SparkContext()
#spark = pyspark.sql.SparkSession(sc)

# General imports
from datetime import datetime
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder



from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import MinMaxScaler

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LinearSVC

from pyspark.sql.functions import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator

from pyspark.mllib.evaluation import BinaryClassificationMetrics


# Shared SQL entry point used by every query below.
# `sc` must already exist: either supplied by the environment or created by
# uncommenting the SparkContext setup lines at the top of this cell.
s = SQLContext(sparkContext=sc)

<h5> Convert user logs into monthly data </h5>
<ol> 
  <li> Convert the log date to year &amp; month </li>
  <li> Add a column for the number of months before Feb 2017 (the month we need to train for) </li>
  <li> Save the result as a new table to be used later </li>
  <li> Uncomment to rerun; the code is commented out to prevent the table from being overwritten </li>
</ol>

In [4]:
def renameCols(df, oldName, newNames=None):
    """Return a DataFrame equal to ``df`` with columns renamed.

    Two call forms are supported:

    * ``renameCols(df, oldNames, newNames)`` renames ``oldNames[i]`` to
      ``newNames[i]`` for every position ``i``.
    * ``renameCols(df, newNames)`` renames every column of ``df`` in schema
      order. This is the form used by the commented-out aggregation cell.

    Parameters
    ----------
    df : object exposing ``schema.names`` and ``withColumnRenamed(old, new)``
        The frame to rename. It is never modified; each rename produces a
        new frame.
    oldName : sequence of str
        Current column names (three-argument form) or the new names
        (two-argument form).
    newNames : sequence of str, optional
        Replacement names, one per entry of ``oldName``.

    Raises
    ------
    AssertionError
        If the number of new names differs from the number of columns in
        ``df`` or from the number of old names.
    """
    if newNames is None:
        # Two-argument form: the second positional argument carries the new
        # names and the old names are taken from the schema itself.
        newNames = oldName
        oldName = df.schema.names

    assert len(df.schema.names) == len(newNames)
    assert len(oldName) == len(newNames)

    nDF = df
    # Iterate over the name pairs directly instead of indexing by the schema
    # length, so the loop bound always matches the sequences being read.
    for current, replacement in zip(oldName, newNames):
        nDF = nDF.withColumnRenamed(current, replacement)
    return nDF

In [5]:
#d = s.sql('select * from unmgmt_user_logs_all_table')
#r = d.rdd.map(lambda l: (l[0],l[1],l[2],l[3],l[4],l[5],l[6],l[7],l[8],l[1].year,l[1].month,  (2017-l[1].year)*12 - l[1].month + 1) )
#usrLogWithYM = renameCols(r.toDF(), d.schema.names + ['year','month','months_b4_Feb2017'])
#usrLogWithYM.show(1)
#usrGroup = usrLogWithYM.groupby(['msno','year','month']).agg({ 'num_25': 'sum' , 'num_50': 'sum' ,'num_75': 'sum' ,'num_985': 'sum' ,'num_100': 'sum' ,'num_unq': 'sum','total_secs': 'sum','months_b4_Feb2017' : 'avg'})
#colNames = ','.join(usrGroup.schema.names).replace('(','_').replace(')','').split(',')
#renamed = renameCols(usrGroup, colNames)
#renamed.write.saveAsTable('usrGroupByMonthAndYear')

<h5> Read Transactions data (aggregated by month) </h5>

In [7]:
# Pull every column of the transactions table through the shared SQLContext.
transData = s.sql(
    'select * from unmgmt_transactions_table'
)

<h6> Add a field for the number of months before Feb 2017 (Feb 2017 = 0, Jan 2017 = 1) </h6>

In [9]:
# 'expiring' = months between the membership expiry date and Feb 2017
# (an expiry in Feb 2017 gives 0, Jan 2017 gives 1). The two raw date
# columns are dropped once the offset has been derived.
nTransData = (
    transData
    .withColumn(
        'expiring',
        (2017 - year(transData.membership_expire_date)) * 12
        - month(transData.membership_expire_date)
        + 2
    )
    .drop('transaction_date')
    .drop('membership_expire_date')
)

<h6> Find all accounts expiring in Feb 2017 </h6>

In [11]:
# Keep only the rows whose 'expiring' offset is 0.
tData = nTransData.where(nTransData['expiring'] == 0)

<h5> Read User Data (aggregated by month) </h5>

In [13]:
# Load the per-user monthly averages table.
# NOTE(review): the commented-out cell earlier in this notebook writes
# 'usrGroupByMonthAndYear'; the code that produces the table read here is
# not shown in this notebook -- confirm it exists before running.
usrAvg = s.sql(
    'select * from usrGroupAvgByMonthAndYear'
)

In [14]:
# Keep the rows whose month offset is 0, then discard the calendar columns
# that are no longer needed once the filter has been applied.
usrAvgF = (
    usrAvg
    .filter(usrAvg.avg_months_b4_Feb2017 == 0)
    .drop('year')
    .drop('month')
    .drop('avg_months_b4_Feb2017')
)

In [15]:
# Quick sanity check: print the first two rows only.
usrAvgF.show(n=2)

<h5> Join all the data sets to create a master data set </h5>

In [17]:
# Inner join on the user id: only users present in both the usage averages
# and the filtered transactions survive.
fData = usrAvgF.join(tData, on='msno', how='inner')

In [18]:
# Attach the joined features to the rows of the training table, again via an
# inner join on the user id.
trainDF = (
    s.sql("select * from unmgmt_train_table")
    .join(fData, on='msno', how='inner')
)

<h5> Extract, scale and one-hot encode the features from the master data set </h5>

In [20]:
# --- Categorical columns ---------------------------------------------------
# Each one is string-indexed first and the resulting index is then one-hot
# encoded. All three indexers are configured with handleInvalid='skip'.
indexer1 = StringIndexer(
    inputCol="plan_list_price",
    outputCol="plan_list_price_enc_",
    handleInvalid='skip'
)
oneHot1 = OneHotEncoder(
    inputCol="plan_list_price_enc_",
    outputCol="plan_list_price_"
)

indexer2 = StringIndexer(
    inputCol="payment_method_id",
    outputCol="payment_method_id_enc_",
    handleInvalid='skip'
)
oneHot2 = OneHotEncoder(
    inputCol="payment_method_id_enc_",
    outputCol="payment_method_id_"
)

indexer3 = StringIndexer(
    inputCol="payment_plan_days",
    outputCol="payment_plan_days_enc_",
    handleInvalid='skip'
)
oneHot3 = OneHotEncoder(
    inputCol="payment_plan_days_enc_",
    outputCol="payment_plan_days_"
)

# --- Feature vector --------------------------------------------------------
# Usage averages, two transaction flags and the three one-hot outputs.
featureColumns = [
    'avg_num_100',
    'avg_num_25',
    'avg_num_75',
    'avg_num_unq',
    'avg_num_50',
    'avg_num_985',
    'avg_total_secs',
    'is_auto_renew',
    'is_cancel',
    'plan_list_price_',
    'payment_method_id_',
    'payment_plan_days_',
]
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol='rawFeatures'
)

# Scaling is chained: rawFeatures -> outOfRangefeatures -> features.
scalar = StandardScaler(
    inputCol="rawFeatures",
    outputCol="outOfRangefeatures",
    withStd=True,
    withMean=True
)
minMax = MinMaxScaler(
    inputCol="outOfRangefeatures",
    outputCol="features"
)

# --- Classifiers -----------------------------------------------------------
# All three are trained against the 'is_churn' label. None of them sets a
# feature column explicitly, so each relies on its estimator default.
lr = LogisticRegression(
    labelCol='is_churn',
    maxIter=100,
    regParam=0.4
)
gbt = GBTClassifier(
    labelCol='is_churn',
    maxIter=10,
    maxBins=34
)
svm = LinearSVC(
    maxIter=20,
    regParam=0.1,
    labelCol='is_churn'
)

In [21]:
# Preprocessing pipeline: index -> one-hot -> assemble -> standardise -> min/max.
# NOTE(review): `np` here is the preprocessing Pipeline, not the numpy alias.
np = Pipeline(stages=[
    indexer1, indexer2, indexer3,
    oneHot1, oneHot2, oneHot3,
    assembler,
    scalar,
    minMax,
])

# NOTE(review): the pipeline is fitted on all of trainDF, i.e. before the
# train/test split that follows, so the held-out rows also contribute to the
# fitted indexers and scalers.
featureDF = (
    np.fit(trainDF)
    .transform(trainDF)
    .select('features', 'is_churn')
)

<h5> Let's split the churn data into training and test sets </h5>

In [23]:
# 70/30 train/test split. A fixed seed is passed so that re-running the
# notebook on the same input does not reshuffle the rows (and with them every
# metric reported below).
SPLIT_SEED = 42
(trainChurn, testChurn) = featureDF.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

<h5> Train and evaluate the SVM Model on this data </h5>

In [25]:
def evaluateModel(test,train,description):
    """Print area-under-curve metrics for test and train predictions.

    Parameters
    ----------
    test : DataFrame
        Predictions on the held-out split; must contain the columns
        'prediction' and 'is_churn'.
    train : DataFrame
        Predictions on the training split, with the same two columns.
    description : str
        Model name shown in the banner line.

    Returns
    -------
    None
        The metrics are printed, not returned.

    NOTE(review): the 'prediction' column is what gets passed to
    BinaryClassificationMetrics as the score. If that column holds hard class
    labels rather than a continuous score, the curves are built from very few
    thresholds -- consider feeding a continuous score column instead.
    """
    def _metrics(predictions):
        # Build (prediction, label) pairs of floats and wrap them in the
        # metrics object; shared by the test and train frames.
        pairs = predictions.select(['prediction', 'is_churn']).rdd.map(
            lambda row: (float(row[0]), float(row[1])))
        return BinaryClassificationMetrics(pairs)

    metricsTest = _metrics(test)
    metricsTrain = _metrics(train)

    # Single-argument print(...) behaves the same under Python 2 and 3, and
    # both values are placed through the format string so each line carries
    # an explicit test and train label.
    print("\n------------- " + description + " -----------------")
    print("Area under precision: test = %s, train = %s"
          % (metricsTest.areaUnderPR, metricsTrain.areaUnderPR))
    print("Area under       ROC: test = %s, train = %s"
          % (metricsTest.areaUnderROC, metricsTrain.areaUnderROC))

In [26]:
# Fit the linear SVM on the training split, then score both splits so the
# train and test metrics can be compared side by side.
model = svm.fit(trainChurn)
predictionsTrain = model.transform(trainChurn)
predictionsTest = model.transform(testChurn)

evaluateModel(
    test=predictionsTest,
    train=predictionsTrain,
    description="Support Vector Machine"
)

<h5> Train and evaluate a GBT Model on this data </h5>

In [28]:
# Fit the gradient-boosted trees on the training split, then score both
# splits so the train and test metrics can be compared side by side.
model = gbt.fit(trainChurn)
predictionsTrain = model.transform(trainChurn)
predictionsTest = model.transform(testChurn)

evaluateModel(
    test=predictionsTest,
    train=predictionsTrain,
    description="Gradient Boosted Trees"
)

<h5> Train and evaluate a Logistic Regression Model on this data </h5>

In [30]:
# Fit the logistic regression on the training split, then score both splits
# so the train and test metrics can be compared side by side.
model = lr.fit(trainChurn)
predictionsTrain = model.transform(trainChurn)
predictionsTest = model.transform(testChurn)

evaluateModel(
    test=predictionsTest,
    train=predictionsTrain,
    description="Logistic Regression"
)