In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
sp=SparkSession.builder.appName('irisapp').getOrCreate()

In [4]:
diris=sp.read.csv('f:\\datasets\\iris.csv',header=True,inferSchema=True)
diris.show(5)
diris.count()

+-----------+----------+-----------+----------+-------+
|sepalLength|sepalWidth|petalLength|petalWidth|variety|
+-----------+----------+-----------+----------+-------+
|        5.1|       3.5|        1.4|       0.2| Setosa|
|        4.9|       3.0|        1.4|       0.2| Setosa|
|        4.7|       3.2|        1.3|       0.2| Setosa|
|        4.6|       3.1|        1.5|       0.2| Setosa|
|        5.0|       3.6|        1.4|       0.2| Setosa|
+-----------+----------+-----------+----------+-------+
only showing top 5 rows



150

In [5]:
diris.printSchema()
diris.select('variety').distinct().show()

root
 |-- sepalLength: double (nullable = true)
 |-- sepalWidth: double (nullable = true)
 |-- petalLength: double (nullable = true)
 |-- petalWidth: double (nullable = true)
 |-- variety: string (nullable = true)

+----------+
|   variety|
+----------+
| Virginica|
|    Setosa|
|Versicolor|
+----------+



In [6]:
diris.columns[:4]

['sepalLength', 'sepalWidth', 'petalLength', 'petalWidth']

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
va=VectorAssembler(inputCols=diris.columns[:4],outputCol='inputfeat')
df=va.transform(diris)
df.show(5)

+-----------+----------+-----------+----------+-------+-----------------+
|sepalLength|sepalWidth|petalLength|petalWidth|variety|        inputfeat|
+-----------+----------+-----------+----------+-------+-----------------+
|        5.1|       3.5|        1.4|       0.2| Setosa|[5.1,3.5,1.4,0.2]|
|        4.9|       3.0|        1.4|       0.2| Setosa|[4.9,3.0,1.4,0.2]|
|        4.7|       3.2|        1.3|       0.2| Setosa|[4.7,3.2,1.3,0.2]|
|        4.6|       3.1|        1.5|       0.2| Setosa|[4.6,3.1,1.5,0.2]|
|        5.0|       3.6|        1.4|       0.2| Setosa|[5.0,3.6,1.4,0.2]|
+-----------+----------+-----------+----------+-------+-----------------+
only showing top 5 rows



In [8]:
ind=StringIndexer(inputCol='variety',outputCol='Sp')
df1=ind.fit(df).transform(df)
df1.show()

+-----------+----------+-----------+----------+-------+-----------------+---+
|sepalLength|sepalWidth|petalLength|petalWidth|variety|        inputfeat| Sp|
+-----------+----------+-----------+----------+-------+-----------------+---+
|        5.1|       3.5|        1.4|       0.2| Setosa|[5.1,3.5,1.4,0.2]|0.0|
|        4.9|       3.0|        1.4|       0.2| Setosa|[4.9,3.0,1.4,0.2]|0.0|
|        4.7|       3.2|        1.3|       0.2| Setosa|[4.7,3.2,1.3,0.2]|0.0|
|        4.6|       3.1|        1.5|       0.2| Setosa|[4.6,3.1,1.5,0.2]|0.0|
|        5.0|       3.6|        1.4|       0.2| Setosa|[5.0,3.6,1.4,0.2]|0.0|
|        5.4|       3.9|        1.7|       0.4| Setosa|[5.4,3.9,1.7,0.4]|0.0|
|        4.6|       3.4|        1.4|       0.3| Setosa|[4.6,3.4,1.4,0.3]|0.0|
|        5.0|       3.4|        1.5|       0.2| Setosa|[5.0,3.4,1.5,0.2]|0.0|
|        4.4|       2.9|        1.4|       0.2| Setosa|[4.4,2.9,1.4,0.2]|0.0|
|        4.9|       3.1|        1.5|       0.1| Setosa|[4.9,3.1,

In [9]:
finaldata=df1.select('inputfeat','sp')
finaldata.show()

+-----------------+---+
|        inputfeat| sp|
+-----------------+---+
|[5.1,3.5,1.4,0.2]|0.0|
|[4.9,3.0,1.4,0.2]|0.0|
|[4.7,3.2,1.3,0.2]|0.0|
|[4.6,3.1,1.5,0.2]|0.0|
|[5.0,3.6,1.4,0.2]|0.0|
|[5.4,3.9,1.7,0.4]|0.0|
|[4.6,3.4,1.4,0.3]|0.0|
|[5.0,3.4,1.5,0.2]|0.0|
|[4.4,2.9,1.4,0.2]|0.0|
|[4.9,3.1,1.5,0.1]|0.0|
|[5.4,3.7,1.5,0.2]|0.0|
|[4.8,3.4,1.6,0.2]|0.0|
|[4.8,3.0,1.4,0.1]|0.0|
|[4.3,3.0,1.1,0.1]|0.0|
|[5.8,4.0,1.2,0.2]|0.0|
|[5.7,4.4,1.5,0.4]|0.0|
|[5.4,3.9,1.3,0.4]|0.0|
|[5.1,3.5,1.4,0.3]|0.0|
|[5.7,3.8,1.7,0.3]|0.0|
|[5.1,3.8,1.5,0.3]|0.0|
+-----------------+---+
only showing top 20 rows



In [10]:
train,test=finaldata.randomSplit([.70,.30])

In [11]:
from pyspark.ml.classification  import DecisionTreeClassifier
dtcmodel=DecisionTreeClassifier(labelCol='sp',featuresCol='inputfeat')

In [12]:
model=dtcmodel.fit(train)

In [13]:
model

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_37382ffb88e7, depth=4, numNodes=13, numClasses=3, numFeatures=4

In [14]:
pre_res=model.transform(test)

In [15]:
pre_res.select('inputfeat','sp','prediction').show()

+-----------------+---+----------+
|        inputfeat| sp|prediction|
+-----------------+---+----------+
|[4.3,3.0,1.1,0.1]|0.0|       0.0|
|[4.4,3.0,1.3,0.2]|0.0|       0.0|
|[4.6,3.2,1.4,0.2]|0.0|       0.0|
|[4.6,3.6,1.0,0.2]|0.0|       0.0|
|[4.7,3.2,1.3,0.2]|0.0|       0.0|
|[4.8,3.0,1.4,0.1]|0.0|       0.0|
|[4.8,3.0,1.4,0.3]|0.0|       0.0|
|[4.8,3.4,1.6,0.2]|0.0|       0.0|
|[4.9,2.4,3.3,1.0]|1.0|       1.0|
|[4.9,2.5,4.5,1.7]|2.0|       1.0|
|[4.9,3.0,1.4,0.2]|0.0|       0.0|
|[4.9,3.1,1.5,0.2]|0.0|       0.0|
|[5.0,2.3,3.3,1.0]|1.0|       1.0|
|[5.0,3.2,1.2,0.2]|0.0|       0.0|
|[5.1,2.5,3.0,1.1]|1.0|       1.0|
|[5.1,3.7,1.5,0.4]|0.0|       0.0|
|[5.1,3.8,1.5,0.3]|0.0|       0.0|
|[5.1,3.8,1.9,0.4]|0.0|       0.0|
|[5.4,3.0,4.5,1.5]|1.0|       1.0|
|[5.4,3.4,1.7,0.2]|0.0|       0.0|
+-----------------+---+----------+
only showing top 20 rows



In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [17]:
ev=MulticlassClassificationEvaluator(labelCol='sp',predictionCol='prediction')
acc=ev.evaluate(pre_res)
print('Accuaracy of model:',acc)
print('Test Error of model:',(1-acc))

Accuaracy of model: 0.9632900432900433
Test Error of model: 0.03670995670995669


In [18]:
#IndexToString converstion
from pyspark.ml.feature import IndexToString
itos=IndexToString(inputCol='Sp',outputCol='va_category')
b=itos.transform(df1)
b.select('Sp','va_category').distinct().show()

+---+-----------+
| Sp|va_category|
+---+-----------+
|1.0| Versicolor|
|0.0|     Setosa|
|2.0|  Virginica|
+---+-----------+



In [19]:
f=pre_res.select('inputfeat','Sp','prediction')
from pyspark.ml.feature import IndexToString
itos=IndexToString(inputCol='Sp',outputCol='va_category')
b=itos.transform(f)
b.show()

+-----------------+---+----------+-----------+
|        inputfeat| Sp|prediction|va_category|
+-----------------+---+----------+-----------+
|[4.3,3.0,1.1,0.1]|0.0|       0.0|     Setosa|
|[4.4,3.0,1.3,0.2]|0.0|       0.0|     Setosa|
|[4.6,3.2,1.4,0.2]|0.0|       0.0|     Setosa|
|[4.6,3.6,1.0,0.2]|0.0|       0.0|     Setosa|
|[4.7,3.2,1.3,0.2]|0.0|       0.0|     Setosa|
|[4.8,3.0,1.4,0.1]|0.0|       0.0|     Setosa|
|[4.8,3.0,1.4,0.3]|0.0|       0.0|     Setosa|
|[4.8,3.4,1.6,0.2]|0.0|       0.0|     Setosa|
|[4.9,2.4,3.3,1.0]|1.0|       1.0| Versicolor|
|[4.9,2.5,4.5,1.7]|2.0|       1.0|  Virginica|
|[4.9,3.0,1.4,0.2]|0.0|       0.0|     Setosa|
|[4.9,3.1,1.5,0.2]|0.0|       0.0|     Setosa|
|[5.0,2.3,3.3,1.0]|1.0|       1.0| Versicolor|
|[5.0,3.2,1.2,0.2]|0.0|       0.0|     Setosa|
|[5.1,2.5,3.0,1.1]|1.0|       1.0| Versicolor|
|[5.1,3.7,1.5,0.4]|0.0|       0.0|     Setosa|
|[5.1,3.8,1.5,0.3]|0.0|       0.0|     Setosa|
|[5.1,3.8,1.9,0.4]|0.0|       0.0|     Setosa|
|[5.4,3.0,4.5

In [20]:
from pyspark.ml.classification  import RandomForestClassifier
rfmodel=RandomForestClassifier(labelCol='sp',featuresCol='inputfeat')
model=rfmodel.fit(train)

In [21]:
predctres=model.transform(test)
predctres.show()

+-----------------+---+--------------+---------------+----------+
|        inputfeat| sp| rawPrediction|    probability|prediction|
+-----------------+---+--------------+---------------+----------+
|[4.3,3.0,1.1,0.1]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.4,3.0,1.3,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.6,3.2,1.4,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.6,3.6,1.0,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.7,3.2,1.3,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.8,3.0,1.4,0.1]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.8,3.0,1.4,0.3]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.8,3.4,1.6,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.9,2.4,3.3,1.0]|1.0|[0.0,20.0,0.0]|  [0.0,1.0,0.0]|       1.0|
|[4.9,2.5,4.5,1.7]|2.0|[0.0,12.0,8.0]|  [0.0,0.6,0.4]|       1.0|
|[4.9,3.0,1.4,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[4.9,3.1,1.5,0.2]|0.0|[20.0,0.0,0.0]|  [1.0,0.0,0.0]|       0.0|
|[5.0,2.3,

In [26]:
ev=MulticlassClassificationEvaluator(labelCol='sp',predictionCol='prediction')
acc=ev.evaluate(predctres)
print('Accuaracy of model:',acc)
print('Test Error of model:',(1-acc))

Accuaracy of model: 0.9445359772545017
Test Error of model: 0.05546402274549833
