In [2]:
import numpy as np
import pandas as pd
import pyspark
import urllib

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.feature import *

In [3]:
# Start a Spark session named 'example', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('example')
    .getOrCreate()
)

In [3]:
# Download the raw Iris dataset from the UCI repository into the working directory.
# A bare `import urllib` does not load the `request` submodule, so it is imported
# explicitly here instead of relying on another package having loaded it.
import urllib.request

URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# The trailing semicolon keeps the (filename, headers) return value out of the cell output.
urllib.request.urlretrieve(URL, "iris.csv");

('iris.csv', <http.client.HTTPMessage at 0x40711a6aa0>)

In [4]:
# The downloaded file has no header row, so the column names are supplied here.
columns = [
    'sepal-length',
    'sepal-width',
    'petal-length',
    'petal-width',
    'class',
]

# inferSchema costs a second pass over the file, but lets Spark detect the numeric columns
data = spark.read.csv('iris.csv', header=False, inferSchema=True).toDF(*columns)
data.printSchema()

root
 |-- sepal-length: double (nullable = true)
 |-- sepal-width: double (nullable = true)
 |-- petal-length: double (nullable = true)
 |-- petal-width: double (nullable = true)
 |-- class: string (nullable = true)



In [None]:
# Alternative loading path: parse the CSV with pandas first, then hand the
# resulting pandas frame to Spark.
data = spark.createDataFrame(
    pd.read_csv(
        'iris.csv',
        header=None,
        names=['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class'],
    )
)
data.show(10)

## Linear Classifier

To train a model with Spark ML, you need to assemble the feature columns into a single vector column and convert the text target column into numeric label indices.


In [5]:
# Pack the measurement columns into a single vector column named 'features'.
# NOTE: this cell overwrites `data`; re-running it on its own would make the
# slice below pick up 'class' as a feature, so run it once after loading.

feature_cols = data.columns[:-1]  # every column except the last one ('class')
assembler = pyspark.ml.feature.VectorAssembler(
    inputCols=feature_cols,
    outputCol='features',
)
data = assembler.transform(data)
data.show(10)

+------------+-----------+------------+-----------+-----------+-----------------+
|sepal-length|sepal-width|petal-length|petal-width|      class|         features|
+------------+-----------+------------+-----------+-----------+-----------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|[5.1,3.5,1.4,0.2]|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|[4.9,3.0,1.4,0.2]|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|[4.7,3.2,1.3,0.2]|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|[4.6,3.1,1.5,0.2]|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|[5.0,3.6,1.4,0.2]|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|[5.4,3.9,1.7,0.4]|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|[4.6,3.4,1.4,0.3]|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|[5.0,3.4,1.5,0.2]|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|[4.4,2.9,1.4,0.2]|
|         4.9|  

In [13]:
# Map each species name in 'class' to a numeric index stored in a new 'label' column.

data = data.select('features', 'class')
label_indexer = pyspark.ml.feature.StringIndexer(
    inputCol='class',
    outputCol='label',
).fit(data)
data = label_indexer.transform(data)
data.show(10)

+-----------------+-----------+-----+
|         features|      class|label|
+-----------------+-----------+-----+
|[5.1,3.5,1.4,0.2]|Iris-setosa|  0.0|
|[4.9,3.0,1.4,0.2]|Iris-setosa|  0.0|
|[4.7,3.2,1.3,0.2]|Iris-setosa|  0.0|
|[4.6,3.1,1.5,0.2]|Iris-setosa|  0.0|
|[5.0,3.6,1.4,0.2]|Iris-setosa|  0.0|
|[5.4,3.9,1.7,0.4]|Iris-setosa|  0.0|
|[4.6,3.4,1.4,0.3]|Iris-setosa|  0.0|
|[5.0,3.4,1.5,0.2]|Iris-setosa|  0.0|
|[4.4,2.9,1.4,0.2]|Iris-setosa|  0.0|
|[4.9,3.1,1.5,0.1]|Iris-setosa|  0.0|
+-----------------+-----------+-----+
only showing top 10 rows



In [14]:
# Drop the original text 'class' column, keeping the feature vector and the numeric label.

data = data.select('features', 'label')
print("Reading for machine learning")
data.show(10)

Reading for machine learning
+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
+-----------------+-----+
only showing top 10 rows



In [16]:
# use Logistic Regression to train on the training set

# Hold out roughly 30% of the rows for testing. The seed pins the random split so
# the reported accuracy can be reproduced when the notebook is re-run.
train, test = data.randomSplit([0.70, 0.30], seed=42)
lr = pyspark.ml.classification.LogisticRegression(regParam=0.01)
model = lr.fit(train)

In [17]:
# Score the held-out test rows with the fitted model. As the output below shows,
# transform() appends rawPrediction, probability and prediction columns.

prediction = model.transform(test)
print("Prediction")
prediction.show(10)

Prediction
+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.3,3.0,1.1,0.1]|  0.0|[6.31969315070920...|[0.98476814297073...|       0.0|
|[4.4,3.2,1.3,0.2]|  0.0|[6.25702868169002...|[0.98681724592628...|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|[5.93084157354415...|[0.97943522015515...|       0.0|
|[4.6,3.4,1.4,0.3]|  0.0|[6.17685212018133...|[0.98755830862849...|       0.0|
|[4.8,3.0,1.4,0.1]|  0.0|[5.45569822648338...|[0.95223523625541...|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|[5.66984003271475...|[0.97459279502617...|       0.0|
|[4.9,3.0,1.4,0.2]|  0.0|[5.13373957470314...|[0.93814498244774...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[5.47077891765211...|[0.95357883239289...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[5.47077891765211...|[0.95357883239289...|       0.0|
|[5.0,2.0,3.5,1.0]|  1.0|[-0.945119291252

In [18]:
# Measure accuracy on the test set by comparing the 'prediction' column
# against the true 'label' column.

evaluator = pyspark.ml.evaluation.MulticlassClassificationEvaluator(
    labelCol='label',            # evaluator default, spelled out for the reader
    predictionCol='prediction',  # evaluator default, spelled out for the reader
    metricName='accuracy',
)
accuracy = evaluator.evaluate(prediction)
accuracy

0.94

## Clustering

The workflow is similar, except that clustering is unsupervised, so no target column is needed.

In [11]:
# Fit k-means with three clusters on the feature vectors only (no label column).
# The explicit seed fixes the randomized centroid initialisation, so the cluster
# ids assigned to each row stay the same when the notebook is re-run.
km = pyspark.ml.clustering.KMeans(k=3, seed=42)
xs = data.select(['features'])
clustering = km.fit(xs)
# transform() adds a 'prediction' column holding each row's cluster id.
labels = clustering.transform(xs)

In [12]:
# Preview the cluster id ('prediction') assigned to the first ten rows.
labels.show(10)

+-----------------+----------+
|         features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]|         1|
|[4.9,3.0,1.4,0.2]|         1|
|[4.7,3.2,1.3,0.2]|         1|
|[4.6,3.1,1.5,0.2]|         1|
|[5.0,3.6,1.4,0.2]|         1|
|[5.4,3.9,1.7,0.4]|         1|
|[4.6,3.4,1.4,0.3]|         1|
|[5.0,3.4,1.5,0.2]|         1|
|[4.4,2.9,1.4,0.2]|         1|
|[4.9,3.1,1.5,0.1]|         1|
+-----------------+----------+
only showing top 10 rows

