# Prepare the environment and collect data

In [1]:
import warnings
# Suppress every Python-level warning for the remainder of the kernel session.
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that the CSV read below goes through.
# NOTE(review): the application name says "Linear regression", but the cells
# visible in this notebook run clustering — confirm the intended name.
spark=SparkSession.builder.appName("Linear regression").getOrCreate()
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import mean, col
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics


# Location of the input file and the reader settings applied to it.
file_location = "bank_deposit.csv"
file_type = "csv"
infer_schema = "False"
first_row_is_header = "True"

# Schema inference is switched off, so every column arrives as a string
# (see the printed schema); numeric columns are cast in a later cell.
csv_reader = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
)
df = csv_reader.load(file_location)

df.printSchema()

22/10/18 15:26:46 WARN Utils: Your hostname, m0 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
22/10/18 15:26:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/18 15:26:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/18 15:27:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/18 15:27:03 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/18 15:27:03 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/10/18 15:27:03 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/10/18 15:27:03 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


                                                                                

root
 |-- age: string (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- campaign: string (nullable = true)
 |-- pdays: string (nullable = true)
 |-- previous: string (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [18]:
# NOTE(review): the saved output of this cell lists features/indexedLabel/label,
# i.e. it was last executed after the transformation cell further down. On a
# fresh top-to-bottom run this prints the first 10 rows of the raw CSV instead.
df.show(n=10)

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



In [3]:
from pyspark.sql.types import *

# Columns that were loaded as strings and must become numeric.
# Despite the name `float_vars`, the cast target below is IntegerType.
float_vars = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

# Replace each listed column with its integer-typed version, keeping the name.
for column in float_vars:
    int_typed = df[column].cast(IntegerType())
    df = df.withColumn(column, int_typed)

# Show the resulting (name, type) pairs as the cell output.
df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'string'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('deposit', 'string')]

# Data Transformation

In [4]:
def data_transformation(df, CatCols, continuousCols, labelCol):
    """Build the feature vector and indexed label for a DataFrame.

    Each column in ``CatCols`` is string-indexed into ``<col>_indexed`` and
    then one-hot encoded into ``<col>_indexed_encoded``. The encoded columns
    followed by ``continuousCols`` are assembled into a single ``features``
    vector, and ``labelCol`` is string-indexed into ``indexedLabel``. All of
    these stages are fitted and applied as one Pipeline on ``df`` itself.

    Args:
        df: input DataFrame containing every column named in the other
            arguments.
        CatCols: list of categorical column names.
        continuousCols: list of column names appended to the feature vector
            as-is (must be a list, since it is concatenated with one).
        labelCol: name of the label column.

    Returns:
        A 2-tuple ``(data, label_model)`` where ``data`` has exactly the
        columns ``features``, ``indexedLabel`` and ``label`` (a copy of
        ``labelCol``), and ``label_model`` is a separate StringIndexer fitted
        on the ``label`` column of the transformed data.
    """
    # One StringIndexer per categorical column.
    cat_indexers = []
    for cat_name in CatCols:
        cat_indexers.append(
            StringIndexer(inputCol=cat_name,
                          outputCol="{0}_indexed".format(cat_name)))

    # One OneHotEncoder per indexer, chained on the indexer's output column.
    cat_encoders = []
    for cat_indexer in cat_indexers:
        indexed_name = cat_indexer.getOutputCol()
        cat_encoders.append(
            OneHotEncoder(inputCol=indexed_name,
                          outputCol="{0}_encoded".format(indexed_name)))

    # Encoded categoricals come first in the vector, then the continuous columns.
    encoded_names = [cat_encoder.getOutputCol() for cat_encoder in cat_encoders]
    assembler = VectorAssembler(inputCols=encoded_names + continuousCols,
                                outputCol="features")

    label_indexer = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

    # Stage order matters: indexers feed encoders, which feed the assembler.
    stages = cat_indexers + cat_encoders + [assembler] + [label_indexer]
    fitted_pipeline = Pipeline(stages=stages).fit(df)

    # Keep the original label text alongside its numeric index.
    transformed = fitted_pipeline.transform(df).withColumn('label', col(labelCol))

    projected = transformed.select('features', 'indexedLabel', 'label')
    label_model = StringIndexer(inputCol='label').fit(transformed)
    return projected, label_model

In [5]:
# Categorical predictors to be indexed and one-hot encoded.
# The `day` and `month` columns from the schema are in neither list, so they
# do not reach the feature vector.
CatCols = [
    'job', 'marital', 'education',
    'default', 'housing', 'loan',
    'contact', 'poutcome',
]

# Numeric predictors appended to the feature vector unchanged.
NumCols = [
    'age', 'balance', 'duration',
    'campaign', 'pdays', 'previous',
]

# NOTE(review): `df` is rebound to the 3-column result here, so this cell
# cannot be re-run without first re-executing the load and cast cells.
df, labelindexer = data_transformation(df, CatCols, NumCols, 'deposit')

df.show(n=10)



                                                                                

22/10/18 15:27:57 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 32:>                                                         (0 + 1) / 1]                                                                                

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



In [6]:
# Configure a VectorIndexer over the assembled vector (maxCategories=4),
# then fit it on the full transformed frame.
feature_indexer_stage = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)
featureIndexer = feature_indexer_stage.fit(df)

# Preview only: the transformed frame is not assigned back to `df`.
indexed_preview = featureIndexer.transform(df)
indexed_preview.show(n=10)

                                                                                

+--------------------+------------+-----+--------------------+
|            features|indexedLabel|label|     indexedFeatures|
+--------------------+------------+-----+--------------------+
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,11,13,16,1...|         1.0|  yes|(30,[4,11,13,16,1...|
|(30,[3,11,14,16,1...|         1.0|  yes|(30,[3,11,14,16,1...|
|(30,[0,12,14,16,2...|         1.0|  yes|(30,[0,12,14,16,2...|
|(30,[0,11,14,16,2...|         1.0|  yes|(30,[0,11,14,16,2...|
|(30,[5,13,16,18,2...|         1.0|  yes|(30,[5,13,16,18,2...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,12,13,16,1...|         1.0|  yes|(30,[4,12,13,16,1...|
+--------------------+------------+-----+--------------------+
only showing top 10 rows



In [7]:
# `df` itself still has only features/indexedLabel/label: the VectorIndexer
# preview in the previous cell was displayed without being assigned.
df.show(n=10)

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



In [8]:
# Data splitting: 80/20 random split with a fixed seed.
split_frames = df.randomSplit([0.8, 0.2], seed=10)
trainingData, testData = split_frames

# Report the size of each partition (each count triggers a Spark job).
training_count = trainingData.count()
print("Training Dataset Count: " + str(training_count))
test_count = testData.count()
print("Test Dataset Count: " + str(test_count))

# Preview the training partition with show()'s default row limit.
trainingData.show()

                                                                                

Training Dataset Count: 8911


                                                                                

Test Dataset Count: 2251


[Stage 42:>                                                         (0 + 1) / 1]

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
+--------------------+------------

                                                                                

# K-means clustering

In [12]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Cluster on the feature vector only. Despite the name `train_df`, this is the
# full transformed frame `df`, not the `trainingData` split.
train_df= df.selectExpr("features as features")

# Fit a 3-cluster K-means model and assign every row to a cluster.
# NOTE(review): no seed is passed to KMeans, whereas the train/test split uses
# seed=10 — consider fixing a seed if run-to-run reproducibility matters.
# NOTE(review): the pipeline that builds `features` has no scaling stage, so
# large-magnitude columns may dominate the distance — confirm this is intended.
kmeans = KMeans(k=3, featuresCol='features')
kmeans_model = kmeans.fit(train_df)
predictions = kmeans_model.transform(train_df)

# Score the clustering with ClusteringEvaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
# Fix: the closing parentheses of str(...) and print(...) were missing, which
# made the whole cell a SyntaxError.
print("Silhouette measure using squared Euclidean distance = "+ str(silhouette))

print("--*--"*20)

# One center per cluster, expressed in the same space as `features`.
cluster_centers = kmeans_model.clusterCenters()
print(cluster_centers)


[Stage 262:>                                                        (0 + 1) / 1]

Silhouette measure using squared Euclidean distance = 0.877766399738675
--*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*--
[array([3.37662338e-01, 9.09090909e-02, 1.29870130e-01, 5.19480519e-02,
       3.89610390e-02, 1.68831169e-01, 6.49350649e-02, 2.59740260e-02,
       0.00000000e+00, 5.19480519e-02, 1.29870130e-02, 6.36363636e-01,
       2.85714286e-01, 3.24675325e-01, 5.19480519e-01, 1.03896104e-01,
       1.00000000e+00, 6.75324675e-01, 9.61038961e-01, 7.40259740e-01,
       1.42857143e-01, 7.27272727e-01, 6.49350649e-02, 1.03896104e-01,
       4.73376623e+01, 2.74114935e+04, 3.71519481e+02, 2.88311688e+00,
       4.41948052e+01, 9.09090909e-01]), array([2.22458485e-01, 1.79323613e-01, 1.63021466e-01, 1.22620494e-01,
       8.63710004e-02, 6.44997975e-02, 3.55407047e-02, 3.24017821e-02,
       3.33130822e-02, 2.96678817e-02, 2.46051033e-02, 5.64196031e-01,
       3.18550020e-01, 5.03645200e-01, 3.19360065e-01, 1.33151073e-01,
    

                                                                                

# Hierarchical clustering using bisecting K-means

In [11]:
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Cluster on the feature vector only; `train_df` is built from the full `df`.
train_df = df.selectExpr("features as features")

# Fit a 3-cluster bisecting K-means model, then label every row.
bkmeans = BisectingKMeans(k=3, featuresCol='features')
bkmeans_model = bkmeans.fit(train_df)
predictions = bkmeans_model.transform(train_df)

# Score the clustering with ClusteringEvaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
silhouette_text = str(silhouette)
print("Silhouette measure using squared euclidean distance = "+ silhouette_text)

# Visual separator between the score and the centers.
separator = "--*--"*20
print(separator)

# One center per cluster, expressed in the same space as `features`.
cluster_centers = bkmeans_model.clusterCenters()
print(cluster_centers)


[Stage 222:>                                                        (0 + 1) / 1]

Silhouette measure using squared euclidean distance = 0.7914265424459597
--*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*----*--
[array([2.17575619e-01, 1.84922090e-01, 1.62923923e-01, 1.27520623e-01,
       8.97112741e-02, 5.76306141e-02, 3.47158570e-02, 3.23098075e-02,
       3.33409716e-02, 2.92163153e-02, 2.45187901e-02, 5.59692942e-01,
       3.23556370e-01, 5.14550871e-01, 3.11984418e-01, 1.32218148e-01,
       9.80980752e-01, 5.03208066e-01, 8.51397800e-01, 7.15857012e-01,
       2.23189734e-01, 7.58936755e-01, 1.06324473e-01, 8.86801100e-02,
       4.03988313e+01, 4.81029904e+02, 3.66856439e+02, 2.54812099e+00,
       5.02430110e+01, 7.79903758e-01]), array([2.63910158e-01, 1.43440531e-01, 1.58754467e-01, 9.44359367e-02,
       6.12557427e-02, 1.14854518e-01, 4.23685554e-02, 3.21592649e-02,
       2.50127616e-02, 3.01174068e-02, 2.55232261e-02, 6.02348137e-01,
       2.80245023e-01, 4.17049515e-01, 3.82337928e-01, 1.42930066e-01,
   

                                                                                