In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Read the modified Iris CSV into a Spark DataFrame. The first line of the
# file is treated as the header and column types are inferred from the data.
df = (
    spark.read
    .options(header="true", sep=",", inferSchema=True)
    .csv("/FileStore/tables/Iris_modified6_5mod.csv")
)

# Print the leading rows as a quick sanity check of the load.
df.show()

In [2]:
# The first CSV column holds the class to predict; name it "label" so the
# classifier configured later can find it.
first_column = df.columns[0]
df = df.withColumnRenamed(first_column, "label")

# Random 60/40 split into training and test data, with a fixed sampling seed.
training, testing = df.randomSplit(weights=[0.6, 0.4], seed=42)

# Preview the training portion.
training.show()


In [3]:
# Preview the test portion produced by the random split.
testing.show()

In [4]:
# Configure an ML pipeline with two stages: a feature assembler followed by a
# logistic-regression classifier.
#
# VectorAssembler merges the selected input columns into one vector column
# named "features".
#
# Author's note on feature choice: the first three columns must not be used as
# features because they lead to a prediction accuracy of 100%. Using
# training.columns[3:] is the "all remaining columns" variant; here we start at
# column 5 instead so that some test entries are classified wrongly.
assembler = VectorAssembler(inputCols=training.columns[5:], outputCol='features')

# Column names are spelled out (they match the estimator defaults) so the link
# to the assembler output and the renamed label column is explicit.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.01,
)
pipeline = Pipeline(stages=[assembler, lr])

# Fit on the training split, then predict on the test split.
model = pipeline.fit(training)
prediction = model.transform(testing)

# Keep only the columns needed to compare labels with predictions.
selected = prediction.select("features", "label", "probability", "prediction")

# DataFrame.show() prints the table itself and returns None, so it must not be
# passed to print() (that printed "Label vs. prediction None" after the table).
print("Label vs. prediction")
selected.show()

In [5]:
# Calculate prediction accuracy on the test split.
#
# The rows are collected once and reused for both the row count and the
# comparison loop; calling count() and collect() separately triggers two Spark
# actions on the (uncached) prediction DataFrame.
rows = selected.collect()
numRows = len(rows)
correct = 0.0
for row in rows:
    # Access by column name so the check does not depend on select() order.
    if row["label"] == row["prediction"]:
        correct = correct + 1
    else:
        # Print every misclassified row for inspection.
        print(row)

# An empty test split would otherwise raise ZeroDivisionError; report NaN instead.
accuracy = correct / numRows if numRows else float("nan")
print("Accuracy: ", accuracy)