In [1]:
import pandas as pd
import numpy as np
import findspark
# Raw string: in a plain literal '\s' is an invalid escape sequence, which
# newer Python versions warn about. findspark.init() makes the Spark install
# importable, so it is called before `import pyspark`.
findspark.init(r'C:\spark')
import pyspark

from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark import SparkConf
from pyspark.sql.functions import col

# Reuse a running context if one exists so re-running this cell is safe.
sc = SparkContext.getOrCreate()
# The builder is the documented entry point; it attaches to the context above
# and returns the already-active session on re-runs instead of a new wrapper.
spark = SparkSession.builder.getOrCreate()

In [11]:
# Load the CSV: the first row supplies the column names and Spark infers the
# column types from the data.
df = spark.read.csv(
    path="ten.csv",
    inferSchema=True,
    header=True,
)

In [12]:
# Print the loaded rows (defaults spelled out: at most 20 rows, long strings truncated).
df.show(n=20, truncate=True)

+---------+------+------+
|PulpColor| Taste|Edible|
+---------+------+------+
|      red|  sour|   yes|
|      red|  sour|    no|
|      red|bitter|    no|
|   yellow| sweet|   yes|
|   yellow|bitter|    no|
|    green|  sour|   yes|
|    green| sweet|   yes|
|    green|bitter|    no|
|   yellow|  sour|   yes|
+---------+------+------+



In [13]:
from pyspark.ml.feature import RFormula

# "Edible" is the label. "." expands to every other column (PulpColor and
# Taste), so naming those columns again after "." would add no features.
supervised = RFormula(formula="Edible ~ .")

In [14]:
# Fit the formula on the data, then append the "features" vector and the
# numeric "label" column to every row.
# In the output shown below the label is encoded as yes -> 0.0 and no -> 1.0.
fittedRF = supervised.fit(dataset=df)
preparedDF = fittedRF.transform(dataset=df)
preparedDF.show(n=20, truncate=True)

+---------+------+------+-----------------+-----+
|PulpColor| Taste|Edible|         features|label|
+---------+------+------+-----------------+-----+
|      red|  sour|   yes|[0.0,1.0,1.0,0.0]|  0.0|
|      red|  sour|    no|[0.0,1.0,1.0,0.0]|  1.0|
|      red|bitter|    no|[0.0,1.0,0.0,1.0]|  1.0|
|   yellow| sweet|   yes|        (4,[],[])|  0.0|
|   yellow|bitter|    no|    (4,[3],[1.0])|  1.0|
|    green|  sour|   yes|[1.0,0.0,1.0,0.0]|  0.0|
|    green| sweet|   yes|    (4,[0],[1.0])|  0.0|
|    green|bitter|    no|[1.0,0.0,0.0,1.0]|  1.0|
|   yellow|  sour|   yes|    (4,[2],[1.0])|  0.0|
+---------+------+------+-----------------+-----+



In [15]:
# Fix the seed so the 70/30 split, and every model fitted on it, is
# reproducible across kernel restarts. Without a seed each run draws a
# different partition of this very small dataset.
train, test = preparedDF.randomSplit([0.7, 0.3], seed=42)


In [16]:
from pyspark.ml.classification import LogisticRegression

# Both column names are passed explicitly, so explainParams() reports them as
# "current" values alongside the defaults.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
)
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [17]:
# Fit on the training split, then score those same rows to compare the
# predicted class with the true label.
fittedLR = lr.fit(train)
(
    fittedLR.transform(train)
    .select("PulpColor", "Taste", "Edible", "label", "prediction")
    .show()
)

+---------+------+------+-----+----------+
|PulpColor| Taste|Edible|label|prediction|
+---------+------+------+-----+----------+
|    green|bitter|    no|  1.0|       1.0|
|      red|bitter|    no|  1.0|       1.0|
|      red|  sour|    no|  1.0|       0.0|
|      red|  sour|   yes|  0.0|       0.0|
|   yellow|bitter|    no|  1.0|       1.0|
|   yellow|  sour|   yes|  0.0|       0.0|
+---------+------+------+-----+----------+



In [18]:
# Score the held-out split with the fitted model and show the predicted class
# next to the true label.
predictions = fittedLR.transform(test)
(
    predictions
    .select("PulpColor", "Taste", "Edible", "label", "prediction")
    .show()
)

+---------+-----+------+-----+----------+
|PulpColor|Taste|Edible|label|prediction|
+---------+-----+------+-----+----------+
|    green| sour|   yes|  0.0|       0.0|
|    green|sweet|   yes|  0.0|       1.0|
|   yellow|sweet|   yes|  0.0|       1.0|
+---------+-----+------+-----+----------+

