In [1]:
import os
import sys
spark_path = os.environ['SPARK_HOME']
sys.path.append(spark_path + "/bin")
sys.path.append(spark_path + "/python")
sys.path.append(spark_path + "/python/pyspark/")
sys.path.append(spark_path + "/python/lib")
sys.path.append(spark_path + "/python/lib/pyspark.zip")
sys.path.append(spark_path + "/python/lib/py4j-0.10.9-src.zip")

import findspark
findspark.init()
import pyspark

In [2]:
number_cores = 6
memory_gb = 16
conf = (pyspark.SparkConf().setMaster('local[{}]'.format(number_cores)).set('spark.driver.memory', '{}g'.format(memory_gb)))
sc = pyspark.SparkContext(conf=conf)

In [3]:
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
import numpy
spark = pyspark.sql.SparkSession(sc)

Run pip install numpy in pyspark environment, restart this notebook and rerun

In [4]:
dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
         (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
         (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),),
         (3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
         (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
         (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),),]
dfA = spark.createDataFrame(dataA, ["id", "features"])
mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=3)
model = mh.fit(dfA)
print("The hashed dataset where hashed values are stored in the column 'hashes':")
model.transform(dfA).show()

The hashed dataset where hashed values are stored in the column 'hashes':
+---+--------------------+--------------------+
| id|            features|              hashes|
+---+--------------------+--------------------+
|  0|(6,[0,1,2],[1.0,1...|[[2.24502626E8], ...|
|  1|(6,[2,3,4],[1.0,1...|[[1.19110179E8], ...|
|  2|(6,[0,2,4],[1.0,1...|[[1.19110179E8], ...|
|  3|(6,[1,3,5],[1.0,1...|[[6.02280753E8], ...|
|  4|(6,[2,3,5],[1.0,1...|[[6.02280753E8], ...|
|  5|(6,[1,2,4],[1.0,1...|[[1.19110179E8], ...|
+---+--------------------+--------------------+



In [5]:
final = model.approxSimilarityJoin(dfA, dfA, 1.0, distCol="distance")
final.createOrReplaceTempView("final")
final.printSchema()

root
 |-- datasetA: struct (nullable = false)
 |    |-- id: long (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- datasetB: struct (nullable = false)
 |    |-- id: long (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- distance: double (nullable = false)



## Important


In [6]:
sql = """
select datasetA.id, datasetA.features, datasetB.id, datasetB.features, distance
from final where datasetA.id != datasetB.id order by datasetA.id, datasetB.id"""
spark.sql(sql).show()

+---+--------------------+---+--------------------+--------+
| id|            features| id|            features|distance|
+---+--------------------+---+--------------------+--------+
|  0|(6,[0,1,2],[1.0,1...|  1|(6,[2,3,4],[1.0,1...|     0.8|
|  0|(6,[0,1,2],[1.0,1...|  2|(6,[0,2,4],[1.0,1...|     0.5|
|  0|(6,[0,1,2],[1.0,1...|  4|(6,[2,3,5],[1.0,1...|     0.8|
|  0|(6,[0,1,2],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.5|
|  1|(6,[2,3,4],[1.0,1...|  0|(6,[0,1,2],[1.0,1...|     0.8|
|  1|(6,[2,3,4],[1.0,1...|  2|(6,[0,2,4],[1.0,1...|     0.5|
|  1|(6,[2,3,4],[1.0,1...|  4|(6,[2,3,5],[1.0,1...|     0.5|
|  1|(6,[2,3,4],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.5|
|  2|(6,[0,2,4],[1.0,1...|  0|(6,[0,1,2],[1.0,1...|     0.5|
|  2|(6,[0,2,4],[1.0,1...|  1|(6,[2,3,4],[1.0,1...|     0.5|
|  2|(6,[0,2,4],[1.0,1...|  4|(6,[2,3,5],[1.0,1...|     0.8|
|  2|(6,[0,2,4],[1.0,1...|  5|(6,[1,2,4],[1.0,1...|     0.5|
|  3|(6,[1,3,5],[1.0,1...|  4|(6,[2,3,5],[1.0,1...|     0.5|
|  4|(6,[2,3,5],[1.0,1..

In [7]:
dataB = [('C1', Vectors.sparse(7, [0, 1, 5, 6], [1.0, 1.0, 1.0, 1.0])),
         ('C2', Vectors.sparse(7, [2, 3, 4], [1.0, 1.0, 1.0])),
         ('C3', Vectors.sparse(7, [0, 5, 6], [1.0, 1.0, 1.0])),
         ('C4', Vectors.sparse(7, [1, 2, 3, 4], [1.0, 1.0, 1.0, 1.0]))]

## Note:
Linux environment with true Hadoop support wholeTextFiles, but we will try a different way:
    raw_data = sc.wholeTextFile().cache()

In [8]:
import glob
data_path = "/users/trush/CSC496/LocalitySensitiveHashing/data"
all_files = glob.glob("/users/trush/CSC496/LocalitySensitiveHashing/data/*.txt")

file_names = sc.parallelize(all_files)
file_names.collect()

['/users/trush/CSC496/LocalitySensitiveHashing/data/57_obama_2013.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/46_nixon_1969.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/27_cleveland_1893.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/32_wilson_1913.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/15_polk_1845.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/34_harding_1921.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/40_roosevelt_franklin_1945.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/4_jefferson_1801.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/48_carter_1977.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/41_truman_1949.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/6_madison_1809.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/14_harrison_1841.txt',
 '/users/trush/CSC496/LocalitySensitiveHashing/data/24_garfield_1881.txt',
 '/users/trush/CSC496/LocalitySen

In [9]:
raw_data = file_names.map(lambda f: (f.split("data")[1][1:], open(f,mode='r').read()))
raw_data.take(5)

[('57_obama_2013.txt',
  "Barack Obama\t1/21/2013\tThank you. Thank you so much. Vice President Biden, Mr. Chief Justice, Members of the United States Congress, distinguished guests, and fellow citizens: Each time we gather to inaugurate a President we bear witness to the enduring strength of our Constitution. We affirm the promise of our democracy. We recall that what binds this Nation together is not the colors of our skin or the tenets of our faith or the origins of our names. What makes us exceptional--; what makes us American--; is our allegiance to an idea articulated in a declaration made more than two centuries ago: We hold these truths to be self-evident, that all men are created equal; that they are endowed by their Creator with certain unalienable rights; that among these are life, liberty, and the pursuit of happiness. Today we continue a never-ending journey to bridge the meaning of those words with the realities of our time. For history tells us that while these truths ma

In [10]:
%%time
raw_data.count()

CPU times: user 15.9 ms, sys: 1.46 ms, total: 17.4 ms
Wall time: 156 ms


57

In [11]:
raw_data.take(5)

[('57_obama_2013.txt',
  "Barack Obama\t1/21/2013\tThank you. Thank you so much. Vice President Biden, Mr. Chief Justice, Members of the United States Congress, distinguished guests, and fellow citizens: Each time we gather to inaugurate a President we bear witness to the enduring strength of our Constitution. We affirm the promise of our democracy. We recall that what binds this Nation together is not the colors of our skin or the tenets of our faith or the origins of our names. What makes us exceptional--; what makes us American--; is our allegiance to an idea articulated in a declaration made more than two centuries ago: We hold these truths to be self-evident, that all men are created equal; that they are endowed by their Creator with certain unalienable rights; that among these are life, liberty, and the pursuit of happiness. Today we continue a never-ending journey to bridge the meaning of those words with the realities of our time. For history tells us that while these truths ma

In [81]:
# unique words in speech
import string
translator = str.maketrans('', '', string.punctuation)

tokenized_data = raw_data.map(lambda p: (p[0], sorted(list(set((build_shingle(p[1])))))))
tokenized_data.take(5)

[('57_obama_2013.txt',
  ['\t1/2',
   '\tTha',
   "'lln",
   "'sco",
   "'sho",
   "'sil",
   "'spo",
   "'spr",
   "'sta",
   "'svi",
   "'swh",
   "'swo",
   "'ven",
   ',Mem',
   ',Mr.',
   ',ago',
   ',and',
   ',any',
   ',are',
   ',asc',
   ',aso',
   ',bec',
   ',but',
   ',com',
   ',dec',
   ',dis',
   ',eac',
   ',ent',
   ',for',
   ',fro',
   ',hav',
   ',hop',
   ',hum',
   ',itm',
   ',kno',
   ',lea',
   ',let',
   ',lib',
   ',lik',
   ',not',
   ',now',
   ',orb',
   ',orh',
   ',our',
   ',rea',
   ',ref',
   ',rev',
   ',sch',
   ',sea',
   ',som',
   ',sti',
   ',sun',
   ',tem',
   ',tha',
   ',the',
   ',toh',
   ',tot',
   ',und',
   ',was',
   ',wea',
   ',wed',
   ',weh',
   ',wel',
   ',wem',
   ',wer',
   ',whe',
   ',who',
   ',wit',
   '--;a',
   '--;f',
   '--;i',
   '--;n',
   '--;o',
   '--;s',
   '--;t',
   '--;u',
   '--;w',
   '-;an',
   '-;fo',
   '-;is',
   '-;no',
   '-;ou',
   '-;so',
   '-;th',
   '-;to',
   '-;un',
   '-;wh',
   '-cal',
   '-ca

In [78]:
s = 'l and commerce, schools and colleges to train our workers. Together, we discovered that'
shingling(s)
def build_shingle(s):
    s = s.replace(" ", "")
    shingle = []
    for i in range(4, len(s)):
        shingle.append(s[i-4:i])
    return shingle

build_shingle(s)

['land',
 'andc',
 'ndco',
 'dcom',
 'comm',
 'omme',
 'mmer',
 'merc',
 'erce',
 'rce,',
 'ce,s',
 'e,sc',
 ',sch',
 'scho',
 'choo',
 'hool',
 'ools',
 'olsa',
 'lsan',
 'sand',
 'andc',
 'ndco',
 'dcol',
 'coll',
 'olle',
 'lleg',
 'lege',
 'eges',
 'gest',
 'esto',
 'stot',
 'totr',
 'otra',
 'trai',
 'rain',
 'aino',
 'inou',
 'nour',
 'ourw',
 'urwo',
 'rwor',
 'work',
 'orke',
 'rker',
 'kers',
 'ers.',
 'rs.T',
 's.To',
 '.Tog',
 'Toge',
 'oget',
 'geth',
 'ethe',
 'ther',
 'her,',
 'er,w',
 'r,we',
 ',wed',
 'wedi',
 'edis',
 'disc',
 'isco',
 'scov',
 'cove',
 'over',
 'vere',
 'ered',
 'redt',
 'edth',
 'dtha']

In [38]:
all_words = tokenized_data.flatMap(lambda p: p[1]) \
                                   .map(lambda w: (w, 1)) \
                                   .reduceByKey(lambda x, y: x+y) \
                                   .map(lambda p: p[0]).collect()
all_words = sorted(all_words)

In [40]:
# shingles
print(all_words[0:20])
print(len(all_words))

['', '03', '04', '04\tabout', '04\tcalled', '04\tcitizens', '04\tfellow', '04\tfriends', '04\ti', '04\tin', '04\tmy', '04\tproceeding', '04\tthe', '04\tunwilling', '04\twhen', '05\telected', '05\tfellow', '1', '100000000', '120000000']
9347


In [24]:
total_words = len(all_words)
def buildVector(words):
    indexList = []
    checkList = []
    
    for w in words:
        indexList.append(all_words.index(w))
        checkList.append(1.0)
    return Vectors.sparse(total_words, indexList, checkList)

In [47]:
# The hard part is getting to this cell
dataC = tokenized_data.map(lambda p: (p[0], buildVector(p[1])))
#dataC.take(5)

In [45]:
dfC = spark.createDataFrame(dataC, ["id", "features"])
mhC = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
modelC = mhC.fit(dfC)

print("The hashed dataset where hashed values are stored in the column 'hashes': ")
modelC.transform(dfC).show()

The hashed dataset where hashed values are stored in the column 'hashes': 
+--------------------+--------------------+--------------------+
|                  id|            features|              hashes|
+--------------------+--------------------+--------------------+
|   57_obama_2013.txt|(9347,[0,27,67,81...|[[895923.0], [413...|
|   46_nixon_1969.txt|(9347,[0,66,69,10...|[[634434.0], [525...|
|27_cleveland_1893...|(9347,[0,1,10,100...|[[3297551.0], [22...|
|  32_wilson_1913.txt|(9347,[100,113,12...|[[34027.0], [9770...|
|    15_polk_1845.txt|(9347,[0,1,6,35,1...|[[634434.0], [181...|
| 34_harding_1921.txt|(9347,[0,100,111,...|[[634434.0], [376...|
|40_roosevelt_fran...|(9347,[30,59,100,...|[[1.1393533E7], [...|
|4_jefferson_1801.txt|(9347,[0,1,7,100,...|[[895923.0], [149...|
|  48_carter_1977.txt|(9347,[0,31,100,1...|[[4420936.0], [16...|
|  41_truman_1949.txt|(9347,[0,25,100,1...|[[895923.0], [319...|
|  6_madison_1809.txt|(9347,[0,1,13,100...|[[973352.0], [790...|
|14_harrison_18

In [46]:
finalC = modelC.approxSimilarityJoin(dfC, dfC, 1.0, distCol="distance")
finalC.createOrReplaceTempView("final")
finalC.printSchema()

root
 |-- datasetA: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- datasetB: struct (nullable = false)
 |    |-- id: string (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- distance: double (nullable = false)

