# Machine Learning Quick Start

In [None]:
import pyspark
import sys

In [None]:
import pyspark.sql.functions as fn

In [None]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Show the name of the running Spark application.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably the
# notebook runtime pre-defines it as the active SparkSession — confirm.
spark.sparkContext.appName

In [None]:
# Turn on Apache Arrow for Spark <-> pandas data conversions.
# NOTE(review): no cell visible in this notebook converts to or from pandas,
# so this setting looks unused here — confirm before relying on it.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [None]:
# Print runtime versions.
# Python interpreter version string (includes build and compiler details)
sys.version

In [None]:
# Version of Spark the current session is running on
spark.version

### Exploring Data

In [None]:
# Read data/iris.csv into a Spark DataFrame: the first line is used as the
# header row and column types are inferred from the data.
df = spark.read.options(header=True, inferSchema=True).csv('data/iris.csv')

In [None]:
# Preview the first 5 rows of the Iris dataset
df.show(n=5)

In [None]:
# Print each column's name and the data type inferred while loading the CSV
df.printSchema()

In [None]:
# Basic summary statistics (count, mean, stddev, min, max) for each column
df.describe().show()

In [None]:
# Count how many records each species has; show up to 10 groups without
# truncating long cell values.
df.groupBy('species').count().show(n=10, truncate=False)

### Feature Engineering

In [None]:
# Display all column names (used below to decide which columns are features)
df.columns

In [None]:
# Combine the numeric measurement columns into a single vector column named
# 'features', the input format expected by the Spark ML estimator used below.
#
# Feature columns are chosen by name rather than by position: the cell then
# does not depend on 'species' being the last column, and re-running it does
# not pick up 'species' or a 'features' column added by a previous run.
feature_cols = [c for c in df.columns if c not in ('species', 'features')]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# drop() is a no-op when 'features' is absent; on a re-run it removes the stale
# column so transform() does not fail because the output column already exists.
df = assembler.transform(df.drop('features'))

In [None]:
# Map each species name to a numeric index stored in a new 'label' column.
# Only the assembled feature vector and the species text are carried forward.
data = df.select('features', 'species')
# label_indexer is the fitted model, so it holds the species -> index mapping.
label_indexer = StringIndexer(inputCol='species', outputCol='label').fit(data)
data = label_indexer.transform(data)

In [None]:
# Keep just the two columns the model needs: the feature vector and the label
data = data.select('features', 'label')

In [None]:
# Ready for machine learning: preview the first 10 prepared rows
data.show(10)

In [None]:
# List the distinct label indices produced by the StringIndexer
data.select('label').distinct().show()

### Split Data - Train & Test sets

In [None]:
# Randomly split the data into roughly 70% training and 30% test rows;
# the fixed seed keeps the split reproducible across runs.
train, test = data.randomSplit([0.70, 0.30], seed=42)

### Build Logistic Regression Model

In [None]:
# Regularization strength, passed to LogisticRegression as regParam.
# Change it and you will likely get a different accuracy.
reg = 0.01

In [None]:
# Configure logistic regression on the 'features' / 'label' columns (spelled
# out here; they are also the estimator's defaults) and fit it on the
# training split.
lr = LogisticRegression(featuresCol='features', labelCol='label', regParam=reg)
model = lr.fit(train)

In [None]:
# Score the held-out test set with the trained model
prediction = model.transform(test)

In [None]:
# Preview the first 10 scored test rows
prediction.show(n=10)

### Evaluate Model

In [None]:
# Evaluator that compares the 'prediction' column against 'label' (spelled out
# here; they are also the evaluator's defaults) and reports accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
    metricName='accuracy',
)

In [None]:
# Compute the accuracy metric over the test-set predictions
accuracy = evaluator.evaluate(prediction)

In [None]:
# Display the test-set accuracy
accuracy