# Prepare the environment and collect data

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Linear regression")
    .getOrCreate()
)
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import StringIndexer, IndexToString
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder
from pyspark.sql.functions import mean, col
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.mllib.evaluation import BinaryClassificationMetrics


# Input location and CSV reader settings (Spark reader options are passed as strings).
file_location = "bank_deposit.csv"
file_type = "csv"
infer_schema = "False"  # no schema inference: every column is loaded as a string
first_row_is_header = "True"

# Load the file into a Spark DataFrame.
df = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .option("header", first_row_is_header)
    .load(file_location)
)

df.printSchema()

22/10/19 01:32:14 WARN Utils: Your hostname, m0 resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
22/10/19 01:32:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/19 01:32:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4045. Attempting port 4046.
22/10/19 01:32:24 WARN Utils: Service 'SparkUI' could not bind on port 4046. Attempting port 4047.


                                                                                

root
 |-- age: string (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: string (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: string (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: string (nullable = true)
 |-- campaign: string (nullable = true)
 |-- pdays: string (nullable = true)
 |-- previous: string (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [2]:
# Preview the first five rows without truncating cell contents.
df.show(n=5, truncate=False)

+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|job       |marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|59 |admin.    |married|secondary|no     |2343   |yes    |no  |unknown|5  |may  |1042    |1       |-1   |0       |unknown |yes    |
|56 |admin.    |married|secondary|no     |45     |no     |no  |unknown|5  |may  |1467    |1       |-1   |0       |unknown |yes    |
|41 |technician|married|secondary|no     |1270   |yes    |no  |unknown|5  |may  |1389    |1       |-1   |0       |unknown |yes    |
|55 |services  |married|secondary|no     |2476   |yes    |no  |unknown|5  |may  |579     |1       |-1   |0       |unknown |yes    |
|54 |admin.    |married|tertiary |no     |184    |no     |no  |unknown|5  |m

In [3]:
from pyspark.sql.types import *

# Numeric columns to convert. Despite the list's name, they are cast to integers.
float_vars = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

# Replace each listed column with its integer-typed version.
for column in float_vars:
    df = df.withColumn(column, col(column).cast(IntegerType()))

df.dtypes

[('age', 'int'),
 ('job', 'string'),
 ('marital', 'string'),
 ('education', 'string'),
 ('default', 'string'),
 ('balance', 'int'),
 ('housing', 'string'),
 ('loan', 'string'),
 ('contact', 'string'),
 ('day', 'string'),
 ('month', 'string'),
 ('duration', 'int'),
 ('campaign', 'int'),
 ('pdays', 'int'),
 ('previous', 'int'),
 ('poutcome', 'string'),
 ('deposit', 'string')]

# Data Transformation

In [4]:
def data_transformation(df, CatCols, continuousCols, labelCol):
    """Encode the predictors into a single feature vector and index the label.

    Each column in ``CatCols`` is string-indexed and then one-hot encoded.
    The encoded columns followed by ``continuousCols`` are assembled into a
    ``features`` vector, and ``labelCol`` is string-indexed into
    ``indexedLabel``. All stages are fitted on ``df`` itself and then applied
    to it.

    Args:
        df: Spark DataFrame containing the predictor and label columns.
        CatCols: names of the categorical predictor columns.
        continuousCols: list of numeric predictor column names, appended to
            the feature vector as-is.
        labelCol: name of the label column.

    Returns:
        A 2-tuple ``(data, label_indexer_model)``:
        ``data`` has the columns ``features``, ``indexedLabel`` and ``label``
        (``label`` is a copy of ``labelCol``); ``label_indexer_model`` is a
        ``StringIndexer`` fitted separately on the ``label`` column.
    """
    # One StringIndexer per categorical column: <name> -> <name>_indexed.
    cat_indexers = []
    for name in CatCols:
        cat_indexers.append(
            StringIndexer(inputCol=name, outputCol="{0}_indexed".format(name))
        )

    # One OneHotEncoder per indexer: <name>_indexed -> <name>_indexed_encoded.
    cat_encoders = []
    for stage in cat_indexers:
        indexed_name = stage.getOutputCol()
        cat_encoders.append(
            OneHotEncoder(inputCol=indexed_name,
                          outputCol="{0}_encoded".format(indexed_name))
        )

    # Encoded categoricals come first, then the numeric columns.
    encoded_names = [stage.getOutputCol() for stage in cat_encoders]
    assembler = VectorAssembler(inputCols=encoded_names + continuousCols,
                                outputCol="features")

    label_stage = StringIndexer(inputCol=labelCol, outputCol='indexedLabel')

    stages = cat_indexers + cat_encoders + [assembler] + [label_stage]
    fitted_pipeline = Pipeline(stages=stages).fit(df)

    # Keep the original label values next to the indexed ones.
    transformed = fitted_pipeline.transform(df).withColumn('label', col(labelCol))

    selected = transformed.select('features', 'indexedLabel', 'label')
    label_indexer_model = StringIndexer(inputCol='label').fit(transformed)
    return selected, label_indexer_model

In [5]:
# Categorical predictors: string-indexed and one-hot encoded by data_transformation.
CatCols = [
    'job', 'marital', 'education',
    'default', 'housing', 'loan',
    'contact', 'poutcome',
]

# Numeric predictors: appended to the feature vector unchanged.
NumCols = [
    'age', 'balance', 'duration',
    'campaign', 'pdays', 'previous',
]

# NOTE: `df` is rebound to the transformed frame (features / indexedLabel / label),
# so the raw columns are no longer available on `df` after this cell runs.
df, labelindexer = data_transformation(df, CatCols, NumCols, 'deposit')

df.show(n=10)



                                                                                

22/10/19 01:33:34 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



In [6]:
# Fit a VectorIndexer on the assembled features; maxCategories=4 is the
# distinct-value threshold it uses to decide which features are categorical.
featureIndexer = (
    VectorIndexer(
        inputCol="features",
        outputCol="indexedFeatures",
        maxCategories=4,
    )
    .fit(df)
)

# Preview the added `indexedFeatures` column (the result is not stored).
featureIndexer.transform(df).show(n=10)

                                                                                

+--------------------+------------+-----+--------------------+
|            features|indexedLabel|label|     indexedFeatures|
+--------------------+------------+-----+--------------------+
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[3,11,13,16,1...|         1.0|  yes|(30,[3,11,13,16,1...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,11,13,16,1...|         1.0|  yes|(30,[4,11,13,16,1...|
|(30,[3,11,14,16,1...|         1.0|  yes|(30,[3,11,14,16,1...|
|(30,[0,12,14,16,2...|         1.0|  yes|(30,[0,12,14,16,2...|
|(30,[0,11,14,16,2...|         1.0|  yes|(30,[0,11,14,16,2...|
|(30,[5,13,16,18,2...|         1.0|  yes|(30,[5,13,16,18,2...|
|(30,[2,11,13,16,1...|         1.0|  yes|(30,[2,11,13,16,1...|
|(30,[4,12,13,16,1...|         1.0|  yes|(30,[4,12,13,16,1...|
+--------------------+------------+-----+--------------------+
only showing top 10 rows



In [7]:
# `df` itself is unchanged by the VectorIndexer preview: still three columns.
df.show(n=10)

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[3,11,13,16,1...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,11,13,16,1...|         1.0|  yes|
|(30,[3,11,14,16,1...|         1.0|  yes|
|(30,[0,12,14,16,2...|         1.0|  yes|
|(30,[0,11,14,16,2...|         1.0|  yes|
|(30,[5,13,16,18,2...|         1.0|  yes|
|(30,[2,11,13,16,1...|         1.0|  yes|
|(30,[4,12,13,16,1...|         1.0|  yes|
+--------------------+------------+-----+
only showing top 10 rows



[Stage 35:>                                                         (0 + 1) / 1]                                                                                

In [8]:
# Data splitting
# 80/20 random split with a fixed seed.
trainingData, testData = df.randomSplit(weights=[0.8, 0.2], seed=10)

# Report the size of each partition (each count triggers a Spark job).
print("Training Dataset Count: ", trainingData.count(), sep="")
print("Test Dataset Count: ", testData.count(), sep="")

trainingData.show()

                                                                                

Training Dataset Count: 8911


                                                                                

Test Dataset Count: 2251


[Stage 42:>                                                         (0 + 1) / 1]

+--------------------+------------+-----+
|            features|indexedLabel|label|
+--------------------+------------+-----+
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         1.0|  yes|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
|(30,[0,11,13,16,1...|         0.0|   no|
+--------------------+------------

                                                                                

# Gaussian mixture model

In [None]:
from pyspark.ml.clustering import GaussianMixture

# Only the assembled feature vector is passed to the clustering model.
# NOTE(review): this uses the full `df`, not `trainingData` — confirm that is intended.
train_df = df.select("features")

# Fit a 3-component Gaussian mixture. The seed is fixed so that the fitted
# components are reproducible across runs (same value as the split seed).
gmm = GaussianMixture(k=3, featuresCol='features', seed=10)
gmm_model = gmm.fit(train_df)

# Show the fitted Gaussian components, one row per component.
gmm_model.gaussiansDF.show()

[Stage 164:>                                                        (0 + 1) / 1]