In [None]:
import findspark
findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [None]:
# Obtain the SparkContext / SparkSession through getOrCreate so that re-running
# this cell in a live kernel reuses the running context; constructing
# SparkContext(...) directly a second time raises because only one context may
# be active at once.
sc = SparkContext.getOrCreate(conf=SparkConf())
# The builder attaches to the context obtained above rather than starting another.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the LIBSVM-formatted sample file into a DataFrame.
data = spark.read.load(
    "data/SparkData/sample_multiclass_classification_data.txt",
    format="libsvm",
)

In [None]:
# List the distinct values found in the `label` column.
(
    data
    .select('label')
    .distinct()
    .show()
)

In [None]:
# Print a preview of the loaded rows to check that the file parsed as expected.
data.show()

In [None]:
# Split the data roughly 80/20 into training and test sets.
# A fixed seed keeps the split (and therefore every metric computed below)
# repeatable across Restart-and-Run-All executions.
trainingData, testData = data.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Configure the logistic-regression estimator: at most 10 iterations,
# regularisation parameter 0.3 and elastic-net mixing parameter 0.8,
# reading inputs from the `features` column and targets from `label`.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [None]:
# Fit the configured estimator on the training split; the resulting `model`
# is what the following cells use for predictions, coefficients and metrics.
model=lr.fit(trainingData)

In [None]:
# Apply the fitted model to the test split.
predictions = model.transform(testData)
# Display the first five resulting rows.
predictions.show(n=5)

In [None]:
# Inspect the model outputs for up to five rows whose label is 2.0,
# with column truncation disabled so the vector columns print in full.
(
    predictions
    .select('label', 'rawPrediction', 'probability', 'prediction')
    .filter('label = 2.0')
    .show(n=5, truncate=False)
)

In [None]:
# Report the fitted coefficient matrix and intercept vector of the
# multinomial logistic regression model, one per line.
print(
    "Coefficients: {}".format(model.coefficientMatrix),
    "Intercepts: {}".format(model.interceptVector),
    sep="\n",
)

In [None]:
# Keep a handle on the fitted model's summary object; the metric cells below
# read their per-label and weighted values from it.
trainingSummary = model.summary

In [None]:
# For multiclass models the summary exposes metrics on a per-label basis.
# The per-label values are fetched once and walked with enumerate(), rather
# than re-reading the summary property on every iteration; the printed label
# is the position of the value in that sequence.
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label {}:{}".format(i, rate))

In [None]:
# Per-label true positive rate from the training summary. The values are
# fetched once and iterated with enumerate() instead of indexing the summary
# property repeatedly; the printed label is the position in that sequence.
print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label {}:{}".format(i, rate))


In [None]:
# Per-label precision from the training summary. The values are fetched once
# and iterated with enumerate() instead of indexing the summary property
# repeatedly; the printed label is the position in that sequence.
print("Precision by label:")
for i, value in enumerate(trainingSummary.precisionByLabel):
    print("label {}:{}".format(i, value))


In [None]:
# Per-label recall from the training summary. The values are fetched once and
# iterated with enumerate() instead of indexing the summary property
# repeatedly; the printed label is the position in that sequence.
print("Recall by label:")
for i, value in enumerate(trainingSummary.recallByLabel):
    print("label {}:{}".format(i, value))


In [None]:
# Per-label F-measure from the training summary. fMeasureByLabel() is a method
# call, so it is invoked exactly once here and its result iterated, instead of
# being re-invoked in the loop header and again on every iteration.
print("F-measure by label:")
for i, value in enumerate(trainingSummary.fMeasureByLabel()):
    print("label {}:{}".format(i, value))

In [None]:
# Overall accuracy as reported by the model's training summary.
accuracy = trainingSummary.accuracy

In [None]:
# Print a one-shot report of the aggregate metrics held by the training summary.
# Accuracy is re-read from the summary here so the report never picks up the
# global `accuracy` after the evaluator cell further down has rebound it to the
# test-set accuracy (which would silently mislabel the number on a re-run).
accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
# The last field label previously read "Rec all"; it is "Recall".
print("Accuracy: {0}\nFPR: {1}\nTPR: {2}\nF-measure: {3}\nPrecision: {4}\nRecall: {5}".format(accuracy,falsePositiveRate,truePositiveRate,fMeasure,precision,recall))


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Compare the `prediction` column with the `label` column of the scored test
# rows using the accuracy metric, then report the complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
testError = 1.0 - accuracy
print("Test Error = %g" % testError)


### The model fits 3 linear functions, one for each of the 3 classes (3 predicted values); hence there are 3 sets of slopes and 3 intercepts.

In [None]:
# Display the fitted coefficient matrix converted to its dense representation.
model.coefficientMatrix.toDense()

In [None]:
# Display the fitted intercept vector.
model.interceptVector