## Spark KNN (Testing 10-05)

Visit PySparkShell at: <a>http://localhost:4040/jobs/</a>

In [None]:
# Set up spark context

# import findspark
# findspark.init()
# from pyspark.sql import SparkSession

# spark = SparkSession \
#     .builder \
#     .appName("Python Spark KNN Test") \
#     .getOrCreate()

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import monotonically_increasing_id 
from math import sqrt
# from pyspark.sql.functions import sqrt

# Data
train_features = spark.read.csv('../Data/Train-28x28.csv', header=False, inferSchema="true")
test_features = spark.read.csv('../Data/Test-28x28.csv', header=False, inferSchema="true")
train_labels = spark.read.csv('../Data/Train-label.csv', header=False, inferSchema="true")
test_labels = spark.read.csv('../Data/Test-label.csv', header=False, inferSchema="true")

def get_vector(data, col_name):
    assembler = VectorAssembler(inputCols=data.columns, outputCol=col_name)
    return assembler.transform(data).select(col_name)    

# Vectors
train_features = get_vector(train_features, 'features')
test_features = get_vector(test_features, 'features')
train_labels = get_vector(train_labels, 'train_labels')
test_labels = get_vector(test_labels, 'test_labels')

In [15]:
from pyspark.ml.feature import PCA

pca = PCA(k=153, inputCol="features", outputCol="pca_features")
pca_model = pca.fit(train_features)

# Apply PCA to train / test features
train_features_pca = pca_model.transform(train_features).select("pca_features")
test_features_pca = pca_model.transform(test_features).select("pca_features")

In [16]:
# Rename pca feature column values
train_features_pca = train_features_pca.withColumnRenamed("pca_features", "train_features")
test_features_pca = test_features_pca.withColumnRenamed("pca_features", "test_features")

In [17]:
# Develop a combined dataframe for all data

def combine_features_labels(feature_vector, label_vector, kind='train'):
    features = feature_vector.withColumn('{}_id'.format(kind), monotonically_increasing_id())
    labels = label_vector.withColumn('{}_id'.format(kind), monotonically_increasing_id())
    data = features.join(labels, '{}_id'.format(kind))
    return data

# Create combined train / test data
train_data = combine_features_labels(train_features_pca, train_labels, 'train')
test_data = combine_features_labels(test_features_pca, test_labels, 'test')

In [18]:
train_data.show(5), test_data.show(5), train_data.count(), test_data.count()

+--------+--------------------+------------+
|train_id|      train_features|train_labels|
+--------+--------------------+------------+
|       0|[880.731433034388...|       [5.0]|
|       1|[1768.51722024166...|       [0.0]|
|       2|[704.949236329314...|       [4.0]|
|       3|[-42.328192193770...|       [1.0]|
|       4|[374.043902028336...|       [9.0]|
+--------+--------------------+------------+
only showing top 5 rows

+-------+--------------------+-----------+
|test_id|       test_features|test_labels|
+-------+--------------------+-----------+
|      0|[424.527675108323...|      [7.0]|
|      1|[777.495806467955...|      [2.0]|
|      2|[-189.22802355912...|      [1.0]|
|      3|[1990.70583089721...|      [0.0]|
|      4|[946.077017433917...|      [4.0]|
+-------+--------------------+-----------+
only showing top 5 rows



(None, None, 15558, 3105)

In [19]:
# Generate cross-join => every training example is repeated for every test example; 
# e.g. 4 train examples and 2 test examples produce 4 * 2 = 8 new rows
cross = test_data.crossJoin(train_data)
cross.show(5), cross.count()

+-------+--------------------+-----------+--------+--------------------+------------+
|test_id|       test_features|test_labels|train_id|      train_features|train_labels|
+-------+--------------------+-----------+--------+--------------------+------------+
|      0|[424.527675108323...|      [7.0]|       0|[880.731433034388...|       [5.0]|
|      0|[424.527675108323...|      [7.0]|       1|[1768.51722024166...|       [0.0]|
|      0|[424.527675108323...|      [7.0]|       2|[704.949236329314...|       [4.0]|
|      0|[424.527675108323...|      [7.0]|       3|[-42.328192193770...|       [1.0]|
|      0|[424.527675108323...|      [7.0]|       4|[374.043902028336...|       [9.0]|
+-------+--------------------+-----------+--------+--------------------+------------+
only showing top 5 rows



(None, 48307590)

In [20]:
from math import sqrt

def cosine_distance(vector_one, vector_two):
    dot_product = vector_one.dot(vector_two)
    sum_sq_vector_one = sqrt(vector_one.dot(vector_one))
    sum_sq_vector_two = sqrt(vector_two.dot(vector_two))
    return float(dot_product / (sum_sq_vector_one * sum_sq_vector_two))

# find cosine distance between each train and test features set in each row
distance = cross.rdd \
            .map(lambda x: (x.test_id, (cosine_distance(x.train_features, x.test_features), x.train_labels[0]))) \
#             .collect()

# Output in the form (test_id, (distance, train_label))

# Output looks like:
#
# [(0, (0.4354039832219311, 5.0)),
#  (1, (0.350220604260178, 5.0)),
#  (2, (0.36093373903217113, 5.0)),
#  (3, (0.5668560199651423, 5.0)), 
#  ...]

In [None]:
from collections import Counter

# Group all values by key ie. the test_id and take the K nearest neighbors

K = 15

def majority_vote(neighbors):
    closest_instance = neighbors[0][1]
    votes = [n[1] for n in neighbors]
    counted_votes = Counter(votes)
    most_common_vote = counted_votes.most_common(1)[0]
    if most_common_vote[1] > 1:
        return float(most_common_vote[0])
    else:
        return float(closest_instance)

predictions = distance \
                .groupByKey() \
                .map(lambda x: (x[0], majority_vote(sorted(list(x[1]))[:K]))) \
                .collect()
            
# Output in the form (test_id, prediction)

# Output looks like:
#
# [(0, 1.0),
#  (1, 4.0),
#  (2, 4.0),
#  (3, 1.0),
#  (4, 1.0),
#  (5, 4.0),
#  (6, 1.0), ...]

In [None]:
# Convert predictions to dataframe

prediction_data = sc.parallelize(predictions).toDF(['test_id', 'prediction'])
prediction_data.show(5), prediction_data.count()

In [None]:
# Find accuracy

accuracy = test_data.join(prediction_data, 'test_id').rdd \
            .map(lambda x: x.test_labels[0] == x.prediction) \
            .filter(lambda x : x == True).count() \
            / test_data.count()

In [None]:
print('Accuracy: {}%'.format(accuracy))