In [None]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

In [None]:
#This is one time on your PC; you may need to run it everytime on colab and databrick notebooks
!pip install -U scikit-learn

In [None]:
from pyspark.sql import SQLContext
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from sklearn.metrics import confusion_matrix

In [None]:
# Create the Spark entry points, or reuse them if they already exist.
# Calling SparkContext(...) directly raises an error when a context is already
# running (e.g. when this cell is executed a second time); getOrCreate() makes
# the cell safe to re-run.
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
# Keep `sc` available under the same name: it is stopped at the end of the notebook.
sc = spark.sparkContext

### Linear Support Vector Machine with PySpark

Import data

In [None]:
# Load the semicolon-separated bank dataset: the first row is the header and
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True, sep=";")
    .csv('data/SparkData/bank.csv')
)
# Preview five rows without day/month/poutcome; `df` itself keeps all columns.
df.drop('day', 'month', 'poutcome').show(5)

In [None]:
# Show the column names and the types inferred while reading the CSV.
df.printSchema()

### Deal with categorical data and convert the data to a feature vector

In [None]:
# String-valued columns that are indexed and one-hot encoded below.
catcols = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'poutcome',
]
# Numeric columns that are passed to the feature vector unchanged.
num_cols = [
    'balance',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
# Name of the target column in the raw data.
labelCol = 'y'


### Process categorical columns

The following code does three things using a `Pipeline`:

StringIndexer all categorical columns

OneHotEncoder all categorical index columns

VectorAssembler all feature columns into one vector column

Categorical columns

In [None]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Categorical columns: an alias for `catcols`, used below to build one
# StringIndexer per column.
categorical_columns = catcols

In [None]:
# One StringIndexer per categorical column; each writes its result to
# "<column>_indexed".
indexers = [
    StringIndexer(
        inputCol=column_name,
        outputCol="{0}_indexed".format(column_name),
    )
    for column_name in categorical_columns
]

In [None]:
# One OneHotEncoder per indexed column; each reads "<column>_indexed" and
# writes "<column>_indexed_encoded".
encoders = [
    OneHotEncoder(
        inputCol=indexed_column,
        outputCol="{0}_encoded".format(indexed_column),
    )
    for indexed_column in (indexer.getOutputCol() for indexer in indexers)
]

In [None]:
# Combine every one-hot encoded column, followed by the numeric columns, into a
# single "features" vector column.
assembler = VectorAssembler(
    inputCols=[*(encoder.getOutputCol() for encoder in encoders), *num_cols],
    outputCol="features",
)

In [None]:
# Preprocessing pipeline: index -> one-hot encode -> assemble.
pipeline = Pipeline(stages=[*indexers, *encoders, assembler])
# Fit the preprocessing stages on the full dataset and apply them to it.
model = pipeline.fit(df)
# Keep only the assembled feature vector and the raw target, exposed as 'label'.
data = model.transform(df).select('features', col(labelCol).alias('label'))
data.show(5, truncate=False)

### The label is a string (`yes` or `no`), so we need to convert it to a numeric index

Build StringIndexer stages

In [None]:
# Fit a StringIndexer on the string 'label' column and add the numeric
# 'indexedLabel' column; the fitted model keeps the label strings, which are
# used later to convert predictions back.
labelIndexer = StringIndexer(
    inputCol='label',
    outputCol='indexedLabel',
).fit(data)
data = labelIndexer.transform(data)

In [None]:
# Preview five rows, now including the numeric 'indexedLabel' column.
data.show(5)

In [None]:
from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# maxCategories=3: features with > 3 distinct values are treated as continuous.
# The fitted model reads "features" and writes "indexedFeatures".
featureIndexer =VectorIndexer(inputCol="features", \
                                  outputCol="indexedFeatures", \
                                  maxCategories=3).fit(data)


In [None]:
# Add the 'indexedFeatures' column produced by the fitted VectorIndexer.
data = featureIndexer.transform(data)
data.show(5)

### Split the data into training and test data sets

In [None]:
# Split the data into training and test sets (40% held out for testing).
# A fixed seed makes the split reproducible; without one, every run produces
# different training/test sets and therefore different metrics.
SPLIT_SEED = 42
(trainingData, testData) = data.randomSplit([0.6, 0.4], seed=SPLIT_SEED)
trainingData.show(5, False)
testData.show(5, False)

### Build the linear SVM model

In [None]:
from pyspark.ml.classification import LinearSVC

# Linear SVM estimator trained on the indexed feature vector and the numeric
# label, limited to 50 iterations.
lsvc = LinearSVC(
    maxIter=50,
    labelCol="indexedLabel",
    featuresCol="indexedFeatures",
)

In [None]:
from pyspark.ml.feature import IndexToString

# Map the numeric 'prediction' column back to the original label strings,
# using the labels learned by the fitted labelIndexer.
labelConverter = IndexToString(
    labels=labelIndexer.labels,
    inputCol="prediction",
    outputCol="predictedLabel",
)


In [None]:
# Chain the LinearSVC estimator and the label converter in a Pipeline.
# Note: this rebinds `pipeline`, which previously held the preprocessing pipeline.
pipeline = Pipeline(stages=[lsvc,labelConverter])
# Train the model. The label/feature indexers are not stages of this pipeline;
# they were already applied to `data` before the train/test split.
lsvcModel = pipeline.fit(trainingData)

### Make predictions

In [None]:
# Apply the fitted pipeline (LinearSVC model + label converter) to the held-out test set.
predictions = lsvcModel.transform(testData)
# Display the first five rows of the result.
predictions.show(5)

### Evaluation

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the numeric prediction with the numeric true label and report
# accuracy together with its complement, the test error.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="indexedLabel",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy = {accuracy}")
print("Test Error = %g" % (1.0 - accuracy))


### Get confusion matrix

In [None]:
# Collect true labels and predictions in ONE pass so each prediction is
# guaranteed to be paired with its own label. Two separate collect() calls run
# two Spark jobs and rely on both returning rows in the same order.
label_prediction_rows = predictions.select("indexedLabel", "prediction").collect()
# Unpack the Row objects into flat lists of numeric values.
y_orig = [row["indexedLabel"] for row in label_prediction_rows]
y_pred = [row["prediction"] for row in label_prediction_rows]


In [None]:
# y_orig is passed as the true labels and y_pred as the predicted labels.
cm = confusion_matrix(y_orig, y_pred)
print("Confusion Matrix:", cm, sep="\n")

### Here are the coefficients (the normal vector) of the separating hyperplane

In [None]:
# stages[0] is the fitted LinearSVC model (the first stage of the model pipeline).
lsvcModel.stages[0].coefficients

### Here is the intercept of the hyperplane

In [None]:
# Intercept term of the fitted LinearSVC model (the first stage of the model pipeline).
lsvcModel.stages[0].intercept

### Tear down the machine learning pipeline

In [None]:
# Stop session 
sc.stop()  
