In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [3]:
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
import numpy
spark = pyspark.sql.SparkSession(sc)

Run pip install numpy in pyspark environment, restart this notebook and rerun

In [4]:
dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),),
         (3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),),]
dfA = spark.createDataFrame(dataA, ["id", "features"])
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=3)
model = mh.fit(dfA)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

The hashed dataset where hashed values are stored in the column 'hashes':
+---+--------------------+--------------------+
| id|            features|              hashes|
+---+--------------------+--------------------+
|  0|(6,[0,1,2],[1.0,1...|[[4.42744551E8], ...|
|  1|(6,[2,3,4],[1.0,1...|[[4.48942786E8], ...|
|  2|(6,[0,2,4],[1.0,1...|[[1.458682805E9],...|
|  3|(6,[1,3,5],[1.0,1...|[[4.42744551E8], ...|
|  4|(6,[2,3,5],[1.0,1...|[[4.48942786E8], ...|
|  5|(6,[1,2,4],[1.0,1...|[[4.42744551E8], ...|
+---+--------------------+--------------------+



In [5]:
final = model.approxSimilarityJoin(dfA, dfA, 1.0, distCol="distance")
final.createOrReplaceTempView("final")
final.printSchema()

root
 |-- datasetA: struct (nullable = false)
 |    |-- id: long (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- datasetB: struct (nullable = false)
 |    |-- id: long (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- distance: double (nullable = false)



## Important


In [8]:
sql = """
select datasetA.id, datasetA.features, datasetB.id, datasetB.features, distance
from final where datasetA.id != datasetB.id order by datasetA.id, datasetB.id"""
spark.sql(sql).show()

+---+--------------------+---+--------------------+--------+
| id|            features| id|            features|distance|
+---+--------------------+---+--------------------+--------+
|  0|(6,[0,1,2],[1.0,1...|  2|(6,[0,2,4],[1.0,1...|     0.5|
|  0|(6,[0,1,2],[1.0,1...|  3|(6,[1,3,5],[1.0,1...|     0.8|
|  0|(6,[0,1,2],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.5|
|  1|(6,[2,3,4],[1.0,1...|  2|(6,[0,2,4],[1.0,1...|     0.5|
|  1|(6,[2,3,4],[1.0,1...|  4|(6,[2,3,5],[1.0,1...|     0.5|
|  1|(6,[2,3,4],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.5|
|  2|(6,[0,2,4],[1.0,1...|  0|(6,[0,1,2],[1.0,1...|     0.5|
|  2|(6,[0,2,4],[1.0,1...|  1|(6,[2,3,4],[1.0,1...|     0.5|
|  2|(6,[0,2,4],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.5|
|  3|(6,[1,3,5],[1.0,1...|  0|(6,[0,1,2],[1.0,1...|     0.8|
|  3|(6,[1,3,5],[1.0,1...|  4|(6,[2,3,5],[1.0,1...|     0.5|
|  3|(6,[1,3,5],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.8|
|  4|(6,[2,3,5],[1.0,1...|  1|(6,[2,3,4],[1.0,1...|     0.5|
|  4|(6,[2,3,5],[1.0,1..

In [31]:
dataB = [('C1', Vectors.sparse(7, [0, 1, 5, 6], [1.0, 1.0, 1.0, 1.0])),
         ('C2', Vectors.sparse(7, [2, 3, 4], [1.0, 1.0, 1.0])),
         ('C3', Vectors.sparse(7, [0, 5, 6], [1.0, 1.0, 1.0])),
         ('C4', Vectors.sparse(7, [1, 2, 3, 4], [1.0, 1.0, 1.0, 1.0]))]

## Note:
Linux environment with true Hadoop support wholeTextFiles, but we will try a different way:
    raw_data = sc.wholeTextFile().cache()

In [32]:
import glob
data_path = "/users/trush/CSC496/LocalitySensitiveHashing/data"
all_files = glob.glob("/users/trush/CSC496/LocalitySensitiveHashing/data/*.txt")

file_names = sc.parallelize(all_files)
file_names.collect()

['/users/trush/CSC496/LocalitySensitiveHashing/data/57_obama_2013.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/46_nixon_1969.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/27_cleveland_1893.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/32_wilson_1913.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/15_polk_1845.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/34_harding_1921.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/40_roosevelt_franklin_1945.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/4_jefferson_1801.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/48_carter_1977.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/41_truman_1949.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/6_madison_1809.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/14_harrison_1841.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/24_garfield_1881.txt',
 '/users/trush/CSC496/LocalitySen