In [27]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import monotonically_increasing_id 
from math import sqrt
from pyspark.sql.functions import sqrt

# Input CSV paths. Despite "label" in the names, each file is read below as a
# label column (_c0) followed by the feature columns.
train_data_labels = 'test/Train-label-28x28-2.csv'
test_data_labels = 'test/Test-label-28x28-2.csv'

def get_vector(data, col_name):
    """Pack every column of `data` into a single vector column.

    Args:
        data: Spark DataFrame whose columns are all assembled into the vector.
        col_name: Name given to the resulting vector column.

    Returns:
        A DataFrame containing only the assembled `col_name` column.
    """
    source_columns = data.columns
    packer = VectorAssembler(inputCols=source_columns, outputCol=col_name)
    assembled = packer.transform(data)
    return assembled.select(col_name)

# Train Data
# The file is read without a header row; the first column (_c0) is used as the
# label and every remaining column is assembled into the feature vector.
train = spark.read.csv(train_data_labels, header=False, inferSchema="true")
train_labels = get_vector(train.select('_c0'), 'train_labels')
train_features = get_vector(train.drop('_c0'), 'features')

# Test Data
# Same layout as the training file. The feature column is also named
# 'features' so the PCA model fitted on the training data can be applied to it.
test = spark.read.csv(test_data_labels, header=False, inferSchema="true")
test_labels = get_vector(test.select('_c0'), 'test_labels')
test_features = get_vector(test.drop('_c0'), 'features')

In [28]:
from pyspark.ml.feature import PCA

# Fit a 50-component PCA on the training features only; the same fitted model
# is then used to project both the training and the test features.
pca = PCA(k=50, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(train_features)

# Apply PCA to train / test features, keeping only the projected column
train_features_pca = pca_model.transform(train_features).select("pca_features")
test_features_pca = pca_model.transform(test_features).select("pca_features")

In [29]:
# Rename the PCA output column (the column name, not its values) so the train
# and test frames carry distinct column names: train_features / test_features.
train_features_pca = train_features_pca.withColumnRenamed("pca_features", "train_features")
test_features_pca = test_features_pca.withColumnRenamed("pca_features", "test_features")

In [30]:
# Develop a combined dataframe for all data

def combine_features_labels(feature_vector, label_vector, kind='train'):
    """Pair each feature row with the label row at the same position.

    Both inputs receive a positional id column named '<kind>_id' and are then
    joined on it.

    The ids come from RDD.zipWithIndex(), which numbers rows consecutively
    (0..n-1) in partition order. monotonically_increasing_id() is deliberately
    avoided: its values encode the partition layout and are only guaranteed to
    be unique and increasing, so ids generated independently on two DataFrames
    need not line up and an inner join on them can silently drop or mismatch
    rows.

    Note: rows are matched purely by position, so both inputs must hold their
    rows in the same order (here both are derived from the same source file).

    Args:
        feature_vector: DataFrame holding the feature column(s).
        label_vector: DataFrame holding the label column(s).
        kind: Prefix for the id column name, e.g. 'train' -> 'train_id'.

    Returns:
        DataFrame with columns ['<kind>_id', <feature columns>, <label columns>].
    """
    id_col = '{}_id'.format(kind)

    def _with_row_index(frame):
        # Prepend the positional index to every row, keeping the original
        # column names after the id column.
        column_names = [id_col] + frame.columns
        return (frame.rdd
                .zipWithIndex()
                .map(lambda pair: (pair[1],) + tuple(pair[0]))
                .toDF(column_names))

    features = _with_row_index(feature_vector)
    labels = _with_row_index(label_vector)
    return features.join(labels, id_col)

# Create combined train / test data: one row per example holding the id, the
# PCA-projected feature vector, and the label vector.
train_data = combine_features_labels(train_features_pca, train_labels, 'train')
test_data = combine_features_labels(test_features_pca, test_labels, 'test')

In [31]:
# Preview four rows of each combined frame and report the row counts.
# show() prints the table and returns None, which is why the echoed tuple
# starts with (None, None, ...) before the two counts.
train_data.show(4), test_data.show(4), train_data.count(), test_data.count()

+--------+--------------------+------------+
|train_id|      train_features|train_labels|
+--------+--------------------+------------+
|       0|[850.672141291628...|       [5.0]|
|       1|[1699.40798562470...|       [0.0]|
|       2|[632.202682471733...|       [4.0]|
|       3|[-166.42740423231...|       [1.0]|
+--------+--------------------+------------+
only showing top 4 rows

+-------+--------------------+-----------+
|test_id|       test_features|test_labels|
+-------+--------------------+-----------+
|      0|[361.419311328079...|      [7.0]|
|      1|[873.283433767456...|      [2.0]|
|      2|[-197.20339527162...|      [1.0]|
|      3|[1906.37377339178...|      [0.0]|
+-------+--------------------+-----------+
only showing top 4 rows



(None, None, 600, 100)

In [32]:
# test_data.rdd.map(lambda x: x.test_features).collect()

In [33]:
# Collect the training rows ONCE and split them on the driver. train_data is
# the result of a join, and two separate collect() actions are not guaranteed
# to return its rows in the same order, which could misalign features and
# labels. A single collect keeps index i of both lists on the same example and
# also avoids evaluating the join twice.
_train_rows = train_data.select('train_features', 'train_labels').collect()

# Broadcast both lists so every executor gets a read-only copy for the KNN scan.
# Note: these names now hold Broadcast objects (use .value), not DataFrames.
train_features = sc.broadcast([row.train_features for row in _train_rows])
train_labels = sc.broadcast([row.train_labels for row in _train_rows])

In [196]:
# Number of training examples held in the broadcast label list; compute_label
# iterates over this many training rows for every test row.
num_examples = len(train_labels.value)

In [71]:
from pyspark.mllib.linalg import Vectors
import numpy as np
import operator

# Number of nearest neighbours (highest cosine similarity) that vote on each
# prediction in compute_label.
K = 10

def cosine_distance(vec_one, vec_two):
    """Return the cosine similarity between two vectors.

    Despite the name, the value is a similarity, not a distance: 1.0 means the
    vectors point in the same direction, 0.0 means they are orthogonal, and
    -1.0 means they point in opposite directions. Larger is closer.

    Args:
        vec_one: Vector-like object exposing `.dot(other)`.
        vec_two: Vector-like object exposing `.dot(other)`.

    Returns:
        The cosine similarity as a Python float. If either vector has zero
        length the similarity is undefined; 0.0 is returned in that case so a
        NaN never reaches the neighbour sort.
    """
    norm_one = np.sqrt(vec_one.dot(vec_one))
    norm_two = np.sqrt(vec_two.dot(vec_two))
    denominator = norm_one * norm_two

    # Guard against 0/0: an all-zero vector has no direction.
    if denominator == 0:
        return 0.0

    return float(vec_one.dot(vec_two) / denominator)

def majority_vote(sorted_dist):
    """Pick a label from the nearest neighbours by majority vote.

    Args:
        sorted_dist: Non-empty sequence of (score, label) entries ordered from
            the closest neighbour to the farthest.

    Returns:
        The most frequent label as a float. When no label occurs more than
        once, the label of the closest neighbour is returned instead.
    """
    nearest_label = sorted_dist[0][1]

    # Tally the votes in neighbour order; with insertion-ordered dicts a tie
    # goes to the label that appeared first (i.e. nearest).
    tally = {}
    for entry in sorted_dist:
        vote = entry[1]
        tally[vote] = tally.get(vote, 0) + 1

    winner, votes = max(tally.items(), key=lambda item: item[1])
    if votes <= 1:
        return float(nearest_label)
    return float(winner)
    
def compute_label(test_id, test_label, test_feature):
    """Predict the label of one test example with a brute-force KNN scan.

    The test feature is scored against every broadcast training example, the K
    highest-scoring neighbours are kept, and majority_vote picks the label.

    Args:
        test_id: Identifier of the test row; passed through to the result.
        test_label: True label of the test row (not used by the computation).
        test_feature: Feature vector of the test row.

    Returns:
        Tuple (test_id, prediction) where prediction is a one-element dense
        vector holding the predicted label.
    """
    known_features = train_features.value
    known_labels = train_labels.value

    # (similarity, label) for every training example
    scored = [
        (cosine_distance(known_features[idx], test_feature), known_labels[idx][0])
        for idx in range(num_examples)
    ]

    # Highest similarity first, then keep only the K nearest
    scored.sort(reverse=True)
    nearest = scored[:K]

    return (test_id, Vectors.dense(majority_vote(nearest)))

# Compute KNN predictions: compute_label runs once per test row, and the
# (test_id, prediction vector) tuples are collected back to the driver.
result = test_data.rdd.map(lambda x: compute_label(x.test_id, x.test_labels, x.test_features)).collect()

In [72]:
# Create a DataFrame of prediction results, keyed by test_id so it can be
# joined back to test_data.
prediction_data = sc.parallelize(result).toDF(['test_id', 'prediction'])
prediction_data.show(3)

In [190]:
# Input dataframes to calculate summary statistics.
# show() prints each table and returns None, hence the echoed (None, None).
test_data.show(3), prediction_data.show(3)

+-------+--------------------+-----------+
|test_id|       test_features|test_labels|
+-------+--------------------+-----------+
|      0|[361.419311328079...|      [7.0]|
|      1|[873.283433767456...|      [2.0]|
|      2|[-197.20339527162...|      [1.0]|
+-------+--------------------+-----------+
only showing top 3 rows

+-------+----------+
|test_id|prediction|
+-------+----------+
|      0|     [7.0]|
|      1|     [2.0]|
|      2|     [1.0]|
+-------+----------+
only showing top 3 rows



(None, None)

In [198]:
from pyspark.mllib.evaluation import MulticlassMetrics

# Calculation of statistics

def summary_statistics(metrics):
    """Print the overall precision, recall and F-score as a tab-separated table.

    Each metric is fetched once and reused in the output, rather than being
    computed into unused locals and then requested again for printing.

    Args:
        metrics: Object exposing no-argument `precision()`, `recall()` and
            `fMeasure()` methods (a MulticlassMetrics instance in this notebook).
            NOTE(review): the no-argument forms depend on the installed Spark
            version - confirm before upgrading.

    Returns:
        None. The table is written to stdout.
    """
    # Overall statistics
    precision = metrics.precision()
    recall = metrics.recall()
    f1_score = metrics.fMeasure()

    print('Precision\tRecall\tF-Score')
    print('{}\t{}\t{}'.format(precision, recall, f1_score))
    

def label_statistics(metrics, labels):
    """Print per-class precision, recall and F-score as a tab-separated table.

    Args:
        metrics: Object exposing `precision(label)`, `recall(label)` and
            `fMeasure(label)`.
        labels: Iterable of class labels; rows are printed in sorted order.

    Returns:
        None. The table is written to stdout, with each score rounded to three
        decimal places.
    """
    row_template = '{}\t{}\t{}\t{}'
    print('Class\tPrecision\tRecall\tF-Score')
    for cls in sorted(labels):
        # Same evaluation order as the columns: precision, recall, F-score
        scorers = (metrics.precision, metrics.recall, metrics.fMeasure)
        scores = [round(scorer(cls), 3) for scorer in scorers]
        print(row_template.format(cls, *scores))

        
def statistics(test_data, prediction_data):
    """Print overall and per-class evaluation metrics for the predictions.

    Args:
        test_data: DataFrame with a 'test_id' column and a 'test_labels' vector
            column holding the true label in position 0.
        prediction_data: DataFrame with a 'test_id' column and a 'prediction'
            vector column holding the predicted label in position 0.

    Returns:
        None. The summaries are written to stdout.
    """
    # Compute raw scores on the test set as (prediction, true label) pairs
    prediction_and_labels = test_data.join(prediction_data, 'test_id').rdd \
                            .map(lambda x: (float(x.prediction[0]), float(x.test_labels[0])))

    # Instantiate metrics object
    metrics = MulticlassMetrics(prediction_and_labels)

    # Derive the set of classes from the true labels. Previously `labels` was
    # read from an undefined name, which only worked when a variable happened
    # to be left over in the kernel and raised NameError on a fresh run.
    labels = prediction_and_labels.map(lambda pair: pair[1]).distinct().collect()

    # Overall statistics
    print("Summary Statistics\n")
    summary_statistics(metrics)

    # Statistics by class
    print("\nClass Summary Statistics\n")
    label_statistics(metrics, labels)
      

# Evaluate the KNN predictions against the true test labels and print the report.
statistics(test_data, prediction_data)

Summary Statistics

Precision	Recall	F-Score
0.85	0.85	0.85

Class Summary Statistics

Class	Precision	Recall	F-Score
0.0	0.8	1.0	0.889
1.0	0.933	1.0	0.966
2.0	0.778	0.875	0.824
3.0	1.0	0.909	0.952
4.0	1.0	0.714	0.833
5.0	0.833	0.714	0.769
6.0	0.889	0.8	0.842
7.0	0.8	0.8	0.8
8.0	1.0	0.5	0.667
9.0	0.667	0.909	0.769
