In [1]:
# import findspark
# findspark.init()
# from pyspark.sql import SparkSession

# spark = SparkSession \
#     .builder \
#     .appName("Python Spark RF Classifier") \
#     .getOrCreate()

# sc=spark.sparkContext

In [8]:
#from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
#from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [30]:
#load and parse the data file,converitn it to a DataFrame
#path='hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/'
path='../Data/'
training_data=spark.read.csv(path+'Train-label-28x28.csv', header=False, inferSchema="true").withColumnRenamed('_c0','label')
testing_data=spark.read.csv(path+'Test-label-28x28.csv',header=False, inferSchema="true").withColumnRenamed('_c0','label')

In [31]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import monotonically_increasing_id

#Train Data
assembler=VectorAssembler(inputCols=training_data.columns[1:],outputCol='features')
newdata=assembler.transform(training_data)
train_data=newdata.select('label','features')
train_id = train_data.withColumn(
        '{}_id'.format(train_data), monotonically_increasing_id())
training=train_id.withColumnRenamed('DataFrame[label: int, features: vector]_id','ID')
training.show(5), training.count()

+-----+--------------------+---+
|label|            features| ID|
+-----+--------------------+---+
|    5|(784,[152,153,154...|  0|
|    0|(784,[127,128,129...|  1|
|    4|(784,[160,161,162...|  2|
|    1|(784,[158,159,160...|  3|
|    9|(784,[208,209,210...|  4|
+-----+--------------------+---+
only showing top 5 rows



(None, 60000)

In [49]:
# Test Data
assembler_test=VectorAssembler(inputCols=testing_data.columns[1:],outputCol='features')
newdata_test=assembler_test.transform(testing_data)
test_data=newdata_test.select('label','features')
test_id = test_data.withColumn(
        '{}_id'.format(test_data), monotonically_increasing_id())
testing=test_id.withColumnRenamed('DataFrame[label: int, features: vector]_id','ID')
testing.show(5), testing.count()

+-----+--------------------+---+
|label|            features| ID|
+-----+--------------------+---+
|    7|(784,[202,203,204...|  0|
|    2|(784,[94,95,96,97...|  1|
|    1|(784,[128,129,130...|  2|
|    0|(784,[124,125,126...|  3|
|    4|(784,[150,151,159...|  4|
+-----+--------------------+---+
only showing top 5 rows



(None, 10000)

In [62]:
#apply PCA
from pyspark.ml.feature import PCA


pca = PCA(k=100, inputCol="features", outputCol="feature")
pca_train = pca.fit(training)

#Apply PCA to train / test features
train_pca = pca_train.transform(training).select("label","feature","ID")
test_pca = pca_train.transform(testing).select("label","feature","ID")
train_pca.show(5)

+-----+--------------------+---+
|label|             feature| ID|
+-----+--------------------+---+
|    5|[880.731433034386...|  0|
|    0|[1768.51722024166...|  1|
|    4|[704.949236329314...|  2|
|    1|[-42.328192193772...|  3|
|    9|[374.043902028332...|  4|
+-----+--------------------+---+
only showing top 5 rows



In [93]:
%%time
#build up random forest model and train model
rf = RandomForestClassifier(labelCol="label", featuresCol="feature", numTrees=10)
model=rf.fit(train_pca)

CPU times: user 14.9 ms, sys: 5.38 ms, total: 20.3 ms
Wall time: 14.8 s


In [94]:
%%time
# Make predictions.
predictions = model.transform(test_pca)

CPU times: user 12.8 ms, sys: 4.82 ms, total: 17.6 ms
Wall time: 70 ms


In [95]:
# Select example rows to display.
predictions.select('label','feature','ID','probability','prediction').show()

+-----+--------------------+---+--------------------+----------+
|label|             feature| ID|         probability|prediction|
+-----+--------------------+---+--------------------+----------+
|    7|[424.527675108320...|  0|[0.05015182897710...|       7.0|
|    2|[777.495806467954...|  1|[0.05624565191437...|       6.0|
|    1|[-189.22802355912...|  2|[0.08565436069946...|       1.0|
|    0|[1990.70583089721...|  3|[0.27157347072812...|       0.0|
|    4|[946.077017433915...|  4|[0.03172361638453...|       4.0|
|    1|[-264.46945985278...|  5|[0.09749006756645...|       1.0|
|    4|[502.087335041735...|  6|[0.05443274314664...|       4.0|
|    9|[438.703086461166...|  7|[0.05737224402671...|       8.0|
|    5|[1019.13380549721...|  8|[0.05695701216925...|       2.0|
|    9|[725.804171346720...|  9|[0.02989357614245...|       9.0|
|    0|[1882.34258754238...| 10|[0.50891900760012...|       0.0|
|    6|[1091.09610327598...| 11|[0.08565696476486...|       6.0|
|    9|[587.861814723699.

In [96]:
%%time
#Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)
print("Test Error = %g" % (1.0 - accuracy))


# rfModel = model.stages[2]
# print(rfModel)  # summary only

Accuracy = 0.7649
Test Error = 0.2351
CPU times: user 4.9 ms, sys: 3.37 ms, total: 8.27 ms
Wall time: 2.07 s


In [97]:
train_pca.join(predictions, 'ID').head(1)

[Row(ID=0, label=5, feature=DenseVector([880.7314, 329.9508, 197.6063, 1022.2037, 893.9353, 129.5378, 739.8278, 117.8132, -63.2179, -279.9267, 300.5174, -83.6115, 103.9882, 847.0763, 125.9195, 61.0834, -186.1493, 143.2418, -329.0186, 155.5136, 18.0902, -49.9165, 190.4536, 210.3358, 72.0857, -122.5584, 344.546, -162.1976, -131.0344, -132.7735, -82.9295, -46.1306, 120.4762, -33.7134, -142.8575, 179.0475, -62.1566, 203.2188, 152.7653, 344.4394, 203.9445, 161.9635, -8.4977, -91.7776, -141.3525, 15.1394, 134.7349, 67.0336, 14.7887, 37.6015, -97.6008, -127.4233, 122.8006, 42.2424, -13.1368, 176.1595, -144.1529, -45.6747, 154.3456, -122.5431, -83.4824, -4.0127, -86.0823, 89.5746, -175.2869, 140.876, 200.0559, -40.1264, -60.6752, -18.5383, -94.0646, 122.8287, -55.0722, 2.6127, -107.8599, 66.5914, -56.8551, -35.9199, 31.5544, -41.7844, -91.704, -26.7733, 26.6456, 55.6154, -34.5123, 10.4143, -164.1281, -39.9939, -47.0297, -20.3422, 30.9545, 39.0317, 58.4083, -78.3126, 20.5267, 60.4895, 49.6808, 

In [98]:
from pyspark.mllib.evaluation import MulticlassMetrics
def prepare_data(actual_data, prediction_data, on='ID'):
    return actual_data.join(prediction_data, on).rdd \
        .map(lambda x: (float(x.prediction), float(x.label)))


def overall_report(actual_data, prediction_data):
    # Calculate actual / predicted labels in rdd from
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate actual / predicted labels in rdd from
    metrics = MulticlassMetrics(prediction_and_labels)

    # Calculate overall level metrics
    # print('Precision:', metrics.precision(), type(metrics.precision()))
    # return sc.parallelize([(Vectors.dense(metrics.accuracy),
    #                         Vectors.dense(metrics.precision()),
    #                         Vectors.dense(metrics.recall()),
    #                         Vectors.dense(metrics.fMeasure()))]).toDF(['Accuracy', 'Precision', 'Recall', 'F - Score'])
    print('Accuracy\tPrecision\tRecall\tF-Score')
    print('{}\t{}\t{}\t{}'.format(metrics.accuracy, metrics.precision(),
                                  metrics.recall(), metrics.fMeasure()))


def classification_report(actual_data, prediction_data):
    # Calculate actual / predicted labels in rdd from
    prediction_and_labels = prepare_data(actual_data, prediction_data)

    # Calculate calculate class level metrics
    metrics = MulticlassMetrics(prediction_and_labels)
    classes = set(actual_data.rdd.map(lambda x: float(x.label)).collect())
    print('Class\tPrecision\tRecall\tF-Score')
    for c in sorted(classes):
        print('{}\t{}\t{}\t{}'.format(c,
                                      round(metrics.precision(c), 3),
                                      round(metrics.recall(c), 3),
                                      round(metrics.fMeasure(c), 3)))

In [99]:
#print out precision, recall, f1-score
overall_report(test_pca, predictions)

Accuracy	Precision	Recall	F-Score
0.7649	0.7649	0.7649	0.7649


In [100]:
# print out precision, recall, f1-score for each class
classification_report = classification_report(test_pca, predictions)

Class	Precision	Recall	F-Score
0.0	0.841	0.74	0.787
1.0	0.863	0.952	0.906
2.0	0.746	0.764	0.755
3.0	0.68	0.794	0.733
4.0	0.797	0.69	0.74
5.0	0.784	0.525	0.629
6.0	0.783	0.909	0.842
7.0	0.765	0.813	0.788
8.0	0.657	0.73	0.692
9.0	0.752	0.683	0.716
