# Prepare the environment and collect data

In [21]:
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Linear regression").getOrCreate()
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import mean, col
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics


file_location = "bank_deposit.csv"
file_type = "csv"
infer_schema = "False"
first_row_is_header = "True"
df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.option("header", first_row_is_header) \
.load(file_location)

df.printSchema()

root
 |-- age: string (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- campaign: string (nullable = true)
 |-- pdays: string (nullable = true)
 |-- previous: string (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [22]:
df.show(5,False)

+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|job       |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|59 |admin.    |married|secondary|no     |2343   |yes    |no  |unknown|5  |may  |1042    |1       |-1   |0       |unknown |yes    |
|56 |admin.    |married|secondary|no     |45     |no     |no  |unknown|5  |may  |1467    |1       |-1   |0       |unknown |yes    |
|41 |technician|married|secondary|no     |1270   |yes    |no  |unknown|5  |may  |1389    |1       |-1   |0       |unknown |yes    |
|55 |services  |married|secondary|no     |2476   |yes    |no  |unknown|5  |may  |579     |1       |-1   |0       |unknown |yes    |
|54 |admin.    |married|tertiary |no     |184    |no     |no  |unknown|5  |m

In [23]:
from pyspark.sql.types import *
#Identifying and assigning lists of variables
float_vars=['age', 'balance', 'duration','campaign','pdays','previous']
#Converting variables
for column in float_vars:
 df=df.withColumn(column,df[column].cast(IntegerType()))
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'string'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('deposit', 'string')]

# Data Transformation

In [24]:
def data_transformation(df, CatCols, continuousCols, labelCol):
  
  indexers = [StringIndexer(inputCol=c, 
                            outputCol="{0}_indexed".format(c)) for c in CatCols]

  encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(),
              outputCol="{0}_encoded".format(indexer.getOutputCol()))
              for indexer in indexers]


  v = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                              + continuousCols, outputCol="features")
  
  indexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

  pipeline = Pipeline(stages = indexers + encoders + [v ] + [indexer])

  model=pipeline.fit(df)
    
  data = model.transform(df)

  data =  data.withColumn('label', col(labelCol))
  
  return  data.select('features', 
                     'indexedLabel', 
                     'label'), StringIndexer(inputCol='label').fit(data)

In [25]:
CatCols = ['job', 'marital', 'education', 
                      'default', 'housing', 'loan', 
                      'contact', 'poutcome']

NumCols = ['age', 'balance', 'duration', 
               'campaign', 'pdays', 'previous']

(df, labelindexer) = data_transformation(df, CatCols, NumCols, 'deposit')

df.show(10)



+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



In [26]:
featureIndexer = VectorIndexer(inputCol="features", 
                               outputCol="indexedFeatures", 
                               maxCategories=4).fit(df)

featureIndexer.transform(df).show(10)

                                                                                

+--------------------+------------+-----+--------------------+
|            features|indexedLabel|label|     indexedFeatures|
+--------------------+------------+-----+--------------------+
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,11,13,16,1...|         1.0|  yes|(30,[4,11,13,16,1...|
|(30,[3,11,14,16,1...|         1.0|  yes|(30,[3,11,14,16,1...|
|(30,[0,12,14,16,2...|         1.0|  yes|(30,[0,12,14,16,2...|
|(30,[0,11,14,16,2...|         1.0|  yes|(30,[0,11,14,16,2...|
|(30,[5,13,16,18,2...|         1.0|  yes|(30,[5,13,16,18,2...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,12,13,16,1...|         1.0|  yes|(30,[4,12,13,16,1...|
+--------------------+------------+-----+--------------------+
only showing top 10 rows



In [27]:
df.show(10)

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



In [28]:
# Data splitting
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=10)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

                                                                                

Training Dataset Count: 8911


[Stage 1257:>                                                       (0 + 1) / 1]

Test Dataset Count: 2251


                                                                                

# Regression using decision trees

In [29]:
from pyspark.ml.classification import DecisionTreeClassifier
# Create initial Decision Tree Model
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features")

# Train model with Training Data.
dtModel = dt.fit(trainingData)

# Make predictions on test data.
predictions = dtModel.transform(testData)



                                                                                

In [35]:
# Evaluate the model by computing the metrics. 
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
print("Accuracy = ",format(evaluator.evaluate(predictions)))

predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# Instantiate metrics object 
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary= BinaryClassificationMetrics(predictionAndLabel)

precision = metricsMulti.precision(label=1) 
recall = metricsMulti.recall(label=1) 


print("Precision = ",precision) 
print("Recall =" ,recall) 
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

                                                                                

Accuracy =  0.7947578853842736


                                                                                

Precision =  0.7464788732394366
Recall = 0.8524124881740776


[Stage 1881:>                                                       (0 + 1) / 1]

Area under ROC = 0.7980655405694509


                                                                                

# Using k-Fold Cross Validation

In [34]:
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="indexedLabel")

pipeline = Pipeline(stages=[featureIndexer, dt, labelConverter]) 
kFold  = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, numFolds=2, 
                    parallelism=10, 
                    seed=100)
kFoldModel = kFold .fit(trainingData)

predictions = kFoldModel.transform(testData)

# Evaluate the best model
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")
print("Accuracy = ",format(evaluator.evaluate(predictions)))

predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# Instantiate metrics object 
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary= BinaryClassificationMetrics(predictionAndLabel)
# Overall statistics 
precision = metricsMulti.precision(label=1) 
recall = metricsMulti.recall(label=1) 

print("Precision = ",precision) 
print("Recall = " ,recall) 
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

                                                                                

The Accuracy for test set is 0.7947578853842736


                                                                                

Precision =  0.7464788732394366
Recall =  0.8524124881740776


[Stage 1866:>                                                       (0 + 1) / 1]

Area under ROC = 0.7980655405694509


                                                                                