In [1]:
# import findspark
# findspark.init()
# from pyspark.sql import SparkSession

# spark = SparkSession \
#     .builder \
#     .appName("Python Spark LR Classifier") \
#     .getOrCreate()

# sc=spark.sparkContext

In [1]:
from pyspark.ml.classification import LogisticRegression

In [2]:
#load and parse the data file,converitn it to a DataFrame
#path='hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/'
path='../Data/'
training_data=spark.read.csv(path+'Train-label-28x28.csv', header=False, inferSchema="true").withColumnRenamed('_c0','label')
testing_data=spark.read.csv(path+'Test-label-28x28.csv',header=False, inferSchema="true").withColumnRenamed('_c0','label')

In [50]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import monotonically_increasing_id

#Train Data
assembler=VectorAssembler(inputCols=training_data.columns[1:],outputCol='features')
newdata=assembler.transform(training_data)
train_data=newdata.select('label','features')
train_id = train_data.withColumn(
        '{}_id'.format(train_data), monotonically_increasing_id())
training=train_id.withColumnRenamed('DataFrame[label: int, features: vector]_id','ID')
training.show(5)

+-----+--------------------+---+
|label|            features| ID|
+-----+--------------------+---+
|    5|(784,[152,153,154...|  0|
|    0|(784,[127,128,129...|  1|
|    4|(784,[160,161,162...|  2|
|    1|(784,[158,159,160...|  3|
|    9|(784,[208,209,210...|  4|
+-----+--------------------+---+
only showing top 5 rows



In [51]:
# Test Data
assembler_test=VectorAssembler(inputCols=testing_data.columns[1:],outputCol='features')
newdata_test=assembler_test.transform(testing_data)
test_data=newdata_test.select('label','features')
test_id = test_data.withColumn(
        '{}_id'.format(test_data), monotonically_increasing_id())
testing=test_id.withColumnRenamed('DataFrame[label: int, features: vector]_id','ID')
testing.show(5)

+-----+--------------------+---+
|label|            features| ID|
+-----+--------------------+---+
|    7|(784,[202,203,204...|  0|
|    2|(784,[94,95,96,97...|  1|
|    1|(784,[128,129,130...|  2|
|    0|(784,[124,125,126...|  3|
|    4|(784,[150,151,159...|  4|
+-----+--------------------+---+
only showing top 5 rows



In [35]:
#apply PCA
from pyspark.ml.feature import PCA


pca = PCA(k=50, inputCol="features", outputCol="feature")
pca_train = pca.fit(training)

#Apply PCA to train / test features
train_pca = pca_train.transform(training).select("label","feature","ID")
test_pca = pca_train.transform(testing).select("label","feature","ID")
train_pca.show(5)

+-----+--------------------+---+
|label|             feature| ID|
+-----+--------------------+---+
|    5|[880.731433034386...|  0|
|    0|[1768.51722024166...|  1|
|    4|[704.949236329314...|  2|
|    1|[-42.328192193772...|  3|
|    9|[374.043902028333...|  4|
+-----+--------------------+---+
only showing top 5 rows



In [52]:
#define the LR model
lr = LogisticRegression(featuresCol="features",family="multinomial")

In [53]:
%%time
# Fit the model
lrModel = lr.fit(training)

CPU times: user 12.4 ms, sys: 4.51 ms, total: 16.9 ms
Wall time: 22 s


In [54]:
%%time
#train the model
prediction=lrModel.transform(testing)

CPU times: user 8.47 ms, sys: 2.74 ms, total: 11.2 ms
Wall time: 60.2 ms


In [56]:
#select columns
prediction.select('label','features','ID','probability','prediction').show(1)

+-----+--------------------+---+--------------------+----------+
|label|            features| ID|         probability|prediction|
+-----+--------------------+---+--------------------+----------+
|    7|(784,[202,203,204...|  0|[9.64530396645497...|       7.0|
+-----+--------------------+---+--------------------+----------+
only showing top 1 row



In [57]:
%%time
#Select (prediction, true label) and compute test error
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))

Accuracy = 0.9269
Test Error = 0.0731
CPU times: user 11.4 ms, sys: 7.38 ms, total: 18.8 ms
Wall time: 2.18 s


In [46]:
# Print the coefficients and intercept for multinomial logistic regression
# print("Coefficients: \n" + str(lrModel.coefficientMatrix))
# print("Intercept: " + str(lrModel.interceptVector))

In [60]:
from pyspark.mllib.evaluation import MulticlassMetrics
def prepare_data(actual_data, prediction_data, on='ID'):
    return actual_data.join(prediction_data, on).rdd \
        .map(lambda x: (float(x.prediction), float(x.label)))


def overall_report(actual_data, prediction_data):
    # Calculate actual / predicted labels in rdd from
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate actual / predicted labels in rdd from
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    # return sc.parallelize([(Vectors.dense(metrics.accuracy),
    #                         Vectors.dense(metrics.precision()),
    #                         Vectors.dense(metrics.recall()),
    #                         Vectors.dense(metrics.fMeasure()))]).toDF(['Accuracy', 'Precision', 'Recall', 'F - Score'])
    print('Accuracy\tPrecision\tRecall\tF-Score')
    print('{}\t{}\t{}\t{}'.format(metrics.accuracy, metrics.precision(),
                                  metrics.recall(), metrics.fMeasure()))


def classification_report(actual_data, prediction_data):
    # Calculate actual / predicted labels in rdd from
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate calculate class level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: float(x.label)).collect())
    print('Class\tPrecision\tRecall\tF-Score')
    for c in sorted(classes):
        print('{}\t{}\t{}\t{}'.format(c,
                                      round(metrics.precision(c), 3),
                                      round(metrics.recall(c), 3),
                                      round(metrics.fMeasure(c), 3)))

In [58]:
#print out precision, recall, f1-score
overall_report(test_pca, prediction)

Accuracy	Precision	Recall	F-Score
0.9269	0.9269	0.9269	0.9269


In [61]:
# print out precision, recall, f1-score for each class
classification_report = classification_report(test_pca, prediction)

Class	Precision	Recall	F-Score
0.0	0.957	0.973	0.965
1.0	0.964	0.98	0.972
2.0	0.934	0.892	0.913
3.0	0.898	0.924	0.911
4.0	0.938	0.934	0.936
5.0	0.914	0.869	0.891
6.0	0.94	0.955	0.948
7.0	0.927	0.925	0.926
8.0	0.878	0.888	0.883
9.0	0.913	0.918	0.915
