# Linear regression

In [4]:
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
spark=SparkSession.builder.appName("Linear regression").getOrCreate()
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import mean, col
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics


file_location = "bank_deposit.csv"
file_type = "csv"
infer_schema = "False"
first_row_is_header = "True"
df = spark.read.format(file_type) \
.option("inferSchema", infer_schema) \
.option("header", first_row_is_header) \
.load(file_location)

df.printSchema()

root
 |-- age: string (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- campaign: string (nullable = true)
 |-- pdays: string (nullable = true)
 |-- previous: string (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [5]:
df.show(5,False)

+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|job       |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|59 |admin.    |married|secondary|no     |2343   |yes    |no  |unknown|5  |may  |1042    |1       |-1   |0       |unknown |yes    |
|56 |admin.    |married|secondary|no     |45     |no     |no  |unknown|5  |may  |1467    |1       |-1   |0       |unknown |yes    |
|41 |technician|married|secondary|no     |1270   |yes    |no  |unknown|5  |may  |1389    |1       |-1   |0       |unknown |yes    |
|55 |services  |married|secondary|no     |2476   |yes    |no  |unknown|5  |may  |579     |1       |-1   |0       |unknown |yes    |
|54 |admin.    |married|tertiary |no     |184    |no     |no  |unknown|5  |m

In [6]:
from pyspark.sql.types import *
#Identifying and assigning lists of variables
float_vars=['age', 'balance', 'duration','campaign','pdays','previous']
#Converting variables
for column in float_vars:
 df=df.withColumn(column,df[column].cast(IntegerType()))
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'string'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('deposit', 'string')]

# Data Transformation

In [7]:
def data_transformation(df, CatCols, continuousCols, labelCol):
  
  indexers = [StringIndexer(inputCol=c, 
                            outputCol="{0}_indexed".format(c)) for c in CatCols]

  encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(),
              outputCol="{0}_encoded".format(indexer.getOutputCol()))
              for indexer in indexers]


  v = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
                              + continuousCols, outputCol="features")
  
  indexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

  pipeline = Pipeline(stages = indexers + encoders + [v ] + [indexer])

  model=pipeline.fit(df)
    
  data = model.transform(df)

  data =  data.withColumn('label', col(labelCol))
  
  return  data.select('features', 
                     'indexedLabel', 
                     'label'), StringIndexer(inputCol='label').fit(data)

In [8]:
CatCols = ['job', 'marital', 'education', 
                      'default', 'housing', 'loan', 
                      'contact', 'poutcome']

NumCols = ['age', 'balance', 'duration', 
               'campaign', 'pdays', 'previous']

(df, labelindexer) = data_transformation(df, CatCols, NumCols, 'deposit')

df.show(10)



                                                                                

22/10/15 18:21:41 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 33:>                                                         (0 + 1) / 1]

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



                                                                                

In [23]:
featureIndexer = VectorIndexer(inputCol="features", 
                               outputCol="indexedFeatures", 
                               maxCategories=4).fit(df)

featureIndexer.transform(df).show(10)

                                                                                

+--------------------+------------+-----+--------------------+
|            features|indexedLabel|label|     indexedFeatures|
+--------------------+------------+-----+--------------------+
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,11,13,16,1...|         1.0|  yes|(30,[4,11,13,16,1...|
|(30,[3,11,14,16,1...|         1.0|  yes|(30,[3,11,14,16,1...|
|(30,[0,12,14,16,2...|         1.0|  yes|(30,[0,12,14,16,2...|
|(30,[0,11,14,16,2...|         1.0|  yes|(30,[0,11,14,16,2...|
|(30,[5,13,16,18,2...|         1.0|  yes|(30,[5,13,16,18,2...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,12,13,16,1...|         1.0|  yes|(30,[4,12,13,16,1...|
+--------------------+------------+-----+--------------------+
only showing top 10 rows



In [24]:
df.show(10,False)

+--------------------------------------------------------------------------------------------------------+------------+-----+
|features                                                                                                |indexedLabel|label|
+--------------------------------------------------------------------------------------------------------+------------+-----+
|(30,[3,11,13,16,18,20,21,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,59.0,2343.0,1042.0,1.0,-1.0])     |1.0         |yes  |
|(30,[3,11,13,16,17,18,20,21,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,56.0,45.0,1467.0,1.0,-1.0])|1.0         |yes  |
|(30,[2,11,13,16,18,20,21,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,41.0,1270.0,1389.0,1.0,-1.0])     |1.0         |yes  |
|(30,[4,11,13,16,18,20,21,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,55.0,2476.0,579.0,1.0,-1.0])      |1.0         |yes  |
|(30,[3,11,14,16,17,18,20,21,24,25,26,27,28],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,54.0,184.0,673.0,2.0,-1.0])|1.0         

In [25]:
#Data splitting
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=10)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

                                                                                

Training Dataset Count: 7808


[Stage 152:>                                                        (0 + 1) / 1]

Test Dataset Count: 3354


                                                                                

# Logistic Regression

In [26]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol="indexedLabel", featuresCol="features")

In [27]:
labelConverter = IndexToString(inputCol="prediction", 
                               outputCol="predictedLabel", 
                               labels=labelindexer.labels) 

pipeline = Pipeline(stages=[featureIndexer, lr, labelConverter])

lrModel = pipeline.fit(trainingData)

                                                                                

In [22]:
# Make predictions on the test data 
predictions = lrModel.transform(testData)

predictions.show(5)

[Stage 145:>                                                        (0 + 1) / 1]

+--------------------+------------+-----+--------------------+--------------------+--------------------+----------+--------------+
|            features|indexedLabel|label|     indexedFeatures|       rawPrediction|         probability|prediction|predictedLabel|
+--------------------+------------+-----+--------------------+--------------------+--------------------+----------+--------------+
|(30,[0,11,13,16,1...|         1.0|  yes|(30,[0,11,13,16,1...|[-0.8537615416222...|[0.29864438411413...|       1.0|           yes|
|(30,[0,11,13,16,1...|         1.0|  yes|(30,[0,11,13,16,1...|[1.05708789470724...|[0.74213364500738...|       0.0|            no|
|(30,[0,11,13,16,1...|         0.0|   no|(30,[0,11,13,16,1...|[1.79830254601045...|[0.85794217949956...|       0.0|            no|
|(30,[0,11,13,16,1...|         1.0|  yes|(30,[0,11,13,16,1...|[0.17261970311598...|[0.54304808482998...|       0.0|            no|
|(30,[0,11,13,16,1...|         0.0|   no|(30,[0,11,13,16,1...|[1.19431615092198...|

                                                                                

In [28]:
predictions.select("features", "label", "probability", "predictedLabel").show(5)

[Stage 205:>                                                        (0 + 1) / 1]

+--------------------+-----+--------------------+--------------+
|            features|label|         probability|predictedLabel|
+--------------------+-----+--------------------+--------------+
|(30,[0,11,13,16,1...|  yes|[0.29864438411413...|           yes|
|(30,[0,11,13,16,1...|  yes|[0.74213364500738...|            no|
|(30,[0,11,13,16,1...|   no|[0.85794217949956...|            no|
|(30,[0,11,13,16,1...|  yes|[0.54304808482998...|            no|
|(30,[0,11,13,16,1...|   no|[0.76751211548388...|            no|
+--------------------+-----+--------------------+--------------+
only showing top 5 rows



                                                                                

In [31]:
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

AttributeError: 'PipelineModel' object has no attribute 'coefficients'

In [215]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(labelCol="indexedLabel",
                                predictionCol="prediction",
                                metricName="rmse")

rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE): %g" % rmse)

[Stage 10188:>                                                      (0 + 1) / 1]

Root Mean Squared Error (RMSE): 0.452546


                                                                                

In [216]:
y_true = predictions.select("indexedLabel").toPandas()
y_pred = predictions.select("prediction").toPandas()

import sklearn.metrics
r2_score = sklearn.metrics.r2_score(y_true, y_pred)
print('r2_score: {0}'.format(r2_score))

[Stage 10190:>                                                      (0 + 1) / 1]

r2_score: 0.1777628286497135


                                                                                

# Using k-Fold Cross Validation

In [202]:
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="indexedLabel")

In [203]:
pipeline = Pipeline(stages=[featureIndexer, lr, labelConverter]) 
kFold = CrossValidator(estimator=pipeline, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, numFolds=2, 
                    parallelism=10, seed=100)
kFoldModel = kFold.fit(trainingData)

                                                                                

# Prediction

In [204]:
predictions = kFoldModel.transform(testData)

predictions.select("features", "label", "probability", "predictedLabel").show(5)

[Stage 10113:>                                                      (0 + 1) / 1]

+--------------------+-----+--------------------+--------------+
|            features|label|         probability|predictedLabel|
+--------------------+-----+--------------------+--------------+
|(30,[0,11,13,16,1...|  yes|[0.33127668903429...|           yes|
|(30,[0,11,13,16,1...|  yes|[0.70872079564193...|            no|
|(30,[0,11,13,16,1...|   no|[0.82459930528361...|            no|
|(30,[0,11,13,16,1...|  yes|[0.12758501942330...|           yes|
|(30,[0,11,13,16,1...|  yes|[0.40607998217389...|           yes|
+--------------------+-----+--------------------+--------------+
only showing top 5 rows



                                                                                

# Evaluate the model

In [3]:
evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel", 
                                              predictionCol="prediction", 
                                              metricName="accuracy")

predictionAndLabel = predictions.select("prediction", "indexedLabel").rdd

# Instantiate metrics object 
metricsMulti = MulticlassMetrics(predictionAndLabel)
metricsBinary= BinaryClassificationMetrics(predictionAndLabel)
# Overall statistics 

precision = metricsMulti.precision(label=1) 
recall = metricsMulti.recall(label=1) 
#f1Score = metricsMulti.fMeasure() 
print("Evaluation Metrics (Linear Regression)")

print("Accuracy = ",format(evaluator.evaluate(predictions)))
print("Precision = ", precision) 
print("Recall = ", recall) 
print("Area under ROC = %s" % metricsBinary.areaUnderROC)

NameError: name 'predictions' is not defined