In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('ML project').getOrCreate()

In [None]:
# load data 
df = spark.read.csv('iris.data', header = True, inferSchema = True)
df.printSchema()

root
 |-- 5.1: double (nullable = true)
 |-- 3.5: double (nullable = true)
 |-- 1.4: double (nullable = true)
 |-- 0.2: double (nullable = true)
 |-- Iris-setosa: string (nullable = true)



In [None]:
k=len(df.columns)
for i in range(k):
  old_name = df.columns[i]
  new_name = 'f'+str(i)
  print(old_name,new_name)
  df=df.withColumnRenamed(old_name,new_name)

5.1 f0
3.5 f1
1.4 f2
0.2 f3
Iris-setosa f4


In [None]:
df.show()

+---+---+---+---+-----------+
| f0| f1| f2| f3|         f4|
+---+---+---+---+-----------+
|4.9|3.0|1.4|0.2|Iris-setosa|
|4.7|3.2|1.3|0.2|Iris-setosa|
|4.6|3.1|1.5|0.2|Iris-setosa|
|5.0|3.6|1.4|0.2|Iris-setosa|
|5.4|3.9|1.7|0.4|Iris-setosa|
|4.6|3.4|1.4|0.3|Iris-setosa|
|5.0|3.4|1.5|0.2|Iris-setosa|
|4.4|2.9|1.4|0.2|Iris-setosa|
|4.9|3.1|1.5|0.1|Iris-setosa|
|5.4|3.7|1.5|0.2|Iris-setosa|
|4.8|3.4|1.6|0.2|Iris-setosa|
|4.8|3.0|1.4|0.1|Iris-setosa|
|4.3|3.0|1.1|0.1|Iris-setosa|
|5.8|4.0|1.2|0.2|Iris-setosa|
|5.7|4.4|1.5|0.4|Iris-setosa|
|5.4|3.9|1.3|0.4|Iris-setosa|
|5.1|3.5|1.4|0.3|Iris-setosa|
|5.7|3.8|1.7|0.3|Iris-setosa|
|5.1|3.8|1.5|0.3|Iris-setosa|
|5.4|3.4|1.7|0.2|Iris-setosa|
+---+---+---+---+-----------+
only showing top 20 rows



In [None]:
# Show Dataset in DataFrame
import pandas as pd
pd.DataFrame(df.take(5), columns=df.columns).transpose()

Unnamed: 0,0,1,2,3,4
f0,4.9,4.7,4.6,5,5.4
f1,3,3.2,3.1,3.6,3.9
f2,1.4,1.3,1.5,1.4,1.7
f3,0.2,0.2,0.2,0.2,0.4
f4,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa,Iris-setosa


In [None]:
# Data grouping by class 
class_name = df.columns[len(df.columns)-1]
df.groupby(class_name).count().toPandas()

Unnamed: 0,f4,count
0,Iris-virginica,50
1,Iris-setosa,49
2,Iris-versicolor,50


In [None]:
# convert string to numeric 
from pyspark.ml.feature import StringIndexer
indexer = StringIndexer()
indexer.setInputCol(class_name).setOutputCol("label")
df1 = indexer.fit(df).transform(df)


In [None]:
df1.show()

+---+---+---+---+-----------+-----+
| f0| f1| f2| f3|         f4|label|
+---+---+---+---+-----------+-----+
|4.9|3.0|1.4|0.2|Iris-setosa|  2.0|
|4.7|3.2|1.3|0.2|Iris-setosa|  2.0|
|4.6|3.1|1.5|0.2|Iris-setosa|  2.0|
|5.0|3.6|1.4|0.2|Iris-setosa|  2.0|
|5.4|3.9|1.7|0.4|Iris-setosa|  2.0|
|4.6|3.4|1.4|0.3|Iris-setosa|  2.0|
|5.0|3.4|1.5|0.2|Iris-setosa|  2.0|
|4.4|2.9|1.4|0.2|Iris-setosa|  2.0|
|4.9|3.1|1.5|0.1|Iris-setosa|  2.0|
|5.4|3.7|1.5|0.2|Iris-setosa|  2.0|
|4.8|3.4|1.6|0.2|Iris-setosa|  2.0|
|4.8|3.0|1.4|0.1|Iris-setosa|  2.0|
|4.3|3.0|1.1|0.1|Iris-setosa|  2.0|
|5.8|4.0|1.2|0.2|Iris-setosa|  2.0|
|5.7|4.4|1.5|0.4|Iris-setosa|  2.0|
|5.4|3.9|1.3|0.4|Iris-setosa|  2.0|
|5.1|3.5|1.4|0.3|Iris-setosa|  2.0|
|5.7|3.8|1.7|0.3|Iris-setosa|  2.0|
|5.1|3.8|1.5|0.3|Iris-setosa|  2.0|
|5.4|3.4|1.7|0.2|Iris-setosa|  2.0|
+---+---+---+---+-----------+-----+
only showing top 20 rows



In [None]:
# Split the data
(training_data, test_data) = df1.randomSplit([0.8,0.2])

In [None]:
training_data.toPandas()

Unnamed: 0,f0,f1,f2,f3,f4,label
0,4.3,3.0,1.1,0.1,Iris-setosa,2.0
1,4.4,3.0,1.3,0.2,Iris-setosa,2.0
2,4.4,3.2,1.3,0.2,Iris-setosa,2.0
3,4.5,2.3,1.3,0.3,Iris-setosa,2.0
4,4.6,3.4,1.4,0.3,Iris-setosa,2.0
...,...,...,...,...,...,...
111,7.3,2.9,6.3,1.8,Iris-virginica,1.0
112,7.6,3.0,6.6,2.1,Iris-virginica,1.0
113,7.7,2.6,6.9,2.3,Iris-virginica,1.0
114,7.7,3.0,6.1,2.3,Iris-virginica,1.0


In [None]:
# features and class 
class_name = 'label'
feature_names = df.columns[:-1]
print(class_name)
print(feature_names)

label
['f0', 'f1', 'f2', 'f3']


In [None]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler()
assembler.setInputCols(feature_names).setOutputCol('features')

transformed_data = assembler.transform(df1)

In [None]:
transformed_data.show()

+---+---+---+---+-----------+-----+-----------------+
| f0| f1| f2| f3|         f4|label|         features|
+---+---+---+---+-----------+-----+-----------------+
|4.9|3.0|1.4|0.2|Iris-setosa|  2.0|[4.9,3.0,1.4,0.2]|
|4.7|3.2|1.3|0.2|Iris-setosa|  2.0|[4.7,3.2,1.3,0.2]|
|4.6|3.1|1.5|0.2|Iris-setosa|  2.0|[4.6,3.1,1.5,0.2]|
|5.0|3.6|1.4|0.2|Iris-setosa|  2.0|[5.0,3.6,1.4,0.2]|
|5.4|3.9|1.7|0.4|Iris-setosa|  2.0|[5.4,3.9,1.7,0.4]|
|4.6|3.4|1.4|0.3|Iris-setosa|  2.0|[4.6,3.4,1.4,0.3]|
|5.0|3.4|1.5|0.2|Iris-setosa|  2.0|[5.0,3.4,1.5,0.2]|
|4.4|2.9|1.4|0.2|Iris-setosa|  2.0|[4.4,2.9,1.4,0.2]|
|4.9|3.1|1.5|0.1|Iris-setosa|  2.0|[4.9,3.1,1.5,0.1]|
|5.4|3.7|1.5|0.2|Iris-setosa|  2.0|[5.4,3.7,1.5,0.2]|
|4.8|3.4|1.6|0.2|Iris-setosa|  2.0|[4.8,3.4,1.6,0.2]|
|4.8|3.0|1.4|0.1|Iris-setosa|  2.0|[4.8,3.0,1.4,0.1]|
|4.3|3.0|1.1|0.1|Iris-setosa|  2.0|[4.3,3.0,1.1,0.1]|
|5.8|4.0|1.2|0.2|Iris-setosa|  2.0|[5.8,4.0,1.2,0.2]|
|5.7|4.4|1.5|0.4|Iris-setosa|  2.0|[5.7,4.4,1.5,0.4]|
|5.4|3.9|1.3|0.4|Iris-setosa

In [None]:
# Split the data
(training_data, test_data) = transformed_data.randomSplit([0.8,0.2])

In [None]:
from pyspark.ml.classification import LogisticRegression
model = LogisticRegression(featuresCol = 'features',labelCol=class_name, maxIter=30)

In [None]:
M = model.fit(training_data)

In [None]:
# Predict with the test dataset
predictions = M.transform(test_data)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

multi_evaluator = MulticlassClassificationEvaluator(labelCol = 'label', metricName = 'accuracy')
print('Logistic Regression Accuracy:', multi_evaluator.evaluate(predictions))


Logistic Regression Accuracy: 0.9583333333333334
