In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
# Create the Spark session.
# The fully configured builder has to run FIRST: getOrCreate() hands back any
# session that already exists, so creating a default session beforehand makes
# the master and the static settings below silently ineffective.
SpSession = (
    SparkSession.builder
    .master("local[2]")
    .appName("ml")
    # Spark size strings must be whole numbers with a unit suffix; a
    # fractional value such as "0.1g" is rejected when the context parses it.
    .config("spark.executor.memory", "512m")
    .config("spark.cores.max", "2")
    # NOTE(review): machine-specific absolute path - adjust for your environment.
    .config("spark.sql.warehouse.dir", "/home/sushant/Projects/Spark_Project/temp")
    .getOrCreate()
)

# Backward-compatible alias: the notebook previously exposed a second handle
# named spark2; it now refers to the same (configured) session.
spark2 = SpSession

# Get the Spark Context from Spark Session
SpContext = SpSession.sparkContext

In [3]:
"""--------------------------------------------------------------------------
Load Data
-------------------------------------------------------------------------"""
# Read the CSV as an RDD of raw text lines and mark it for caching,
# since it is scanned more than once below.
bankData = SpContext.textFile("bank.csv").cache()
bankData.count()

# The first line holds the column headers; keep every line that differs from it.
firstLine = bankData.first()
dataLines = bankData.filter(lambda line: line != firstLine)
dataLines.count()

"""--------------------------------------------------------------------------
Cleanup Data
-------------------------------------------------------------------------"""

# Change labels to numeric ones and build a Row object

import math
from pyspark.ml.linalg import Vectors
from pyspark.sql import Row

def transformToNumeric(inputStr):
    """Parse one semicolon-delimited bank record into a Row of floats.

    Double quotes are stripped before splitting on ';'. The fields read are
    index 0 (age), 2 (marital status), 3 (education), 4 (default),
    5 (balance), 7 (loan) and 16 (outcome).

    Yes/no style fields become 0.0 when the text is exactly "no" and 1.0 for
    anything else. Marital status and education are expanded into 0.0/1.0
    indicator columns.

    Returns:
        Row with OUTCOME, AGE, SINGLE, MARRIED, DIVORCED, PRIMARY, SECONDARY,
        TERTIARY, DEFAULT, BALANCE and LOAN, all as floats.
    """
    fields = inputStr.replace('"', "").split(";")

    def notNo(text):
        # 0.0 only for an exact "no"; every other value counts as 1.0
        if text == "no":
            return 0.0
        return 1.0

    def indicator(text, category):
        # 1.0 when the field equals the given category, otherwise 0.0
        if text == category:
            return 1.0
        return 0.0

    # Fields are read in the same order as before so a malformed line fails
    # at the same point.
    ageValue = float(fields[0])
    outcomeValue = notNo(fields[16])
    maritalStatus = fields[2]
    educationLevel = fields[3]
    defaultValue = notNo(fields[4])
    balanceValue = float(fields[5])
    loanValue = notNo(fields[7])

    # Keyword order is kept as-is because it can determine the Row field order.
    return Row(
        OUTCOME=outcomeValue,
        AGE=ageValue,
        SINGLE=indicator(maritalStatus, "single"),
        MARRIED=indicator(maritalStatus, "married"),
        DIVORCED=indicator(maritalStatus, "divorced"),
        PRIMARY=indicator(educationLevel, "primary"),
        SECONDARY=indicator(educationLevel, "secondary"),
        TERTIARY=indicator(educationLevel, "tertiary"),
        DEFAULT=defaultValue,
        BALANCE=balanceValue,
        LOAN=loanValue,
    )
    
# Parse every data line into a Row of numeric columns
bankRows = dataLines.map(transformToNumeric)
# Peek at a sample with take(), which fetches only 15 rows, instead of
# collecting the entire RDD to the driver just to slice it.
bankRows.take(15)

# Build a DataFrame from the parsed rows. Note this rebinds bankData, which
# previously referred to the RDD of raw text lines.
bankData = SpSession.createDataFrame(bankRows)

In [4]:
"""--------------------------------------------------------------------------
Perform Data Analytics
-------------------------------------------------------------------------"""
#See descriptive analytics.
bankData.describe().show()

#Find correlation between predictors and target
# Column types are read from the schema (bankData.dtypes) rather than by
# fetching the first row of each column: no extra Spark job per column, no
# IndexError on an empty DataFrame, and a null first value cannot make a
# string column look numeric.
for colName, colType in bankData.dtypes:
    if colType != "string":
        print( "Correlation to OUTCOME for ", colName, \
            bankData.stat.corr('OUTCOME',colName))

"""--------------------------------------------------------------------------
Prepare data for ML
-------------------------------------------------------------------------"""
#Transform to a Data Frame for input to Machine Learning

def transformToLabeledPoint(row):
    """Turn a cleaned bank row into a ``(label, features)`` pair.

    The label is the row's OUTCOME value. The features are a dense vector
    holding the other ten columns in the fixed order listed below.
    """
    featureColumns = (
        "AGE",
        "BALANCE",
        "DEFAULT",
        "DIVORCED",
        "LOAN",
        "MARRIED",
        "PRIMARY",
        "SECONDARY",
        "SINGLE",
        "TERTIARY",
    )
    # Read the label before the features, mirroring the original lookup order
    label = row["OUTCOME"]
    featureValues = [row[column] for column in featureColumns]
    return (label, Vectors.dense(featureValues))
    
# Map each DataFrame row to a (label, features) pair. The previous
# bankLp.collect() was dropped: it pulled the whole RDD to the driver and
# threw the result away.
bankLp = bankData.rdd.map(transformToLabeledPoint)
bankDF = SpSession.createDataFrame(bankLp,["label", "features"])
bankDF.select("label","features").show(10)


+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+
|summary|               AGE|           BALANCE|             DEFAULT|           DIVORCED|               LOAN|           MARRIED|            OUTCOME|            PRIMARY|         SECONDARY|             SINGLE|           TERTIARY|
+-------+------------------+------------------+--------------------+-------------------+-------------------+------------------+-------------------+-------------------+------------------+-------------------+-------------------+
|  count|               541|               541|                 541|                541|                541|               541|                541|                541|               541|                541|                541|
|   mean| 41.26987060998152|1444.7818853974122|0.022181146025878003|0.10905730129390019|0.16

In [5]:
"""--------------------------------------------------------------------------
Perform Machine Learning
-------------------------------------------------------------------------"""

# Reduce the feature vector to its top 3 principal components
from pyspark.ml.feature import PCA
bankPCA = PCA(inputCol="features", outputCol="pcaFeatures", k=3)
pcaModel = bankPCA.fit(bankDF)
# Apply the fitted model, then keep only the label and the projected features
pcaProjected = pcaModel.transform(bankDF)
pcaResult = pcaProjected.select("label", "pcaFeatures")
pcaResult.show(truncate=False)

#Indexing needed as pre-req for Decision Trees
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(pcaResult)
# Adds the "indexed" column. The former td.collect() was removed: it
# materialised the full dataset on the driver and discarded it.
td = si_model.transform(pcaResult)

#Split into training and testing data
# A fixed seed makes the split repeatable across runs; without one, every run
# produces a different partition and different downstream metrics.
(trainingData, testData) = td.randomSplit([0.7, 0.3], seed=42)
# Report the split sizes explicitly: a bare count() in the middle of a cell
# is computed but never displayed.
print("Training rows:", trainingData.count())
print("Test rows:", testData.count())

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


#Create the model
rmClassifer = RandomForestClassifier(labelCol="indexed", \
                featuresCol="pcaFeatures")
rmModel = rmClassifer.fit(trainingData)

#Predict on the test data
predictions = rmModel.transform(testData)
# Show a bounded sample instead of collecting every prediction to the driver
predictions.select("prediction","indexed","label","pcaFeatures").show(10)
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", \
                    labelCol="indexed",metricName="accuracy")
# Keep and print the metric: a bare evaluate() call in the middle of a cell
# computes the accuracy but never displays it.
accuracy = evaluator.evaluate(predictions)
print("Test accuracy:", accuracy)

#Draw a confusion matrix
predictions.groupBy("indexed","prediction").count().show()



+-----+-------------------------------------------------------------+
|label|pcaFeatures                                                  |
+-----+-------------------------------------------------------------+
|0.0  |[-1787.0188971973814,28.86209683775569,-0.06459982604876238] |
|1.0  |[-4789.020177138494,29.92256263634302,-0.9830243513096398]   |
|1.0  |[-1350.0222131632624,34.101108097967185,0.8951427168301703]  |
|1.0  |[-1476.018951718456,29.051333993597034,0.395272386802192]    |
|0.0  |[-0.037889185366468646,58.98971820001771,-0.7290792383661894]|
|1.0  |[-747.0223377634925,34.488291981817895,0.9045654956970107]   |
|1.0  |[-307.0230691022594,35.79985053965531,0.5170631523785959]    |
|0.0  |[-147.02501216176347,38.90107856650334,-0.8069627548799411]  |
|0.0  |[-221.02629853487875,40.853633675695,0.5373036365803205]     |
|1.0  |[87.97238687688711,43.06265944115104,-0.06701642871171584]   |
|0.0  |[-9374.023105550945,32.976458837994976,-0.9511484606914444]  |
|0.0  |[-264.0275573