In [3]:
# References
# http://mccormickml.com/2015/06/12/minhash-tutorial-with-python-code/
# https://mattilyra.github.io/2017/05/23/document-deduplication-with-lsh.html

import findspark
import os
os.environ['PATH'].split(';')
findspark.init()
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import *
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf

# Executor memory must be part of the configuration the session is built with.
# SparkContext.setSystemProperty only takes effect when it is invoked before
# the SparkContext is instantiated, so calling it after getOrCreate() (as this
# cell previously did) never applied the 8g setting.
# NOTE(review): if a session already exists in this kernel, getOrCreate()
# returns that session and its executor memory cannot be changed any more --
# restart the kernel before re-running this cell.
spark = (
    SparkSession.builder
    .appName("midterm")
    .config("spark.executor.memory", "8g")
    .getOrCreate()
)

# wholeTextFiles returns one (file path, full file text) pair per file found
# in the "cookbook_text" directory.
data = spark.sparkContext.wholeTextFiles("cookbook_text")
data_new = data.map(lambda x: Row(name=x[0], content=x[1].strip()))

# Two nullable string columns: the file URI and the whitespace-stripped text.
schemaString = "name content"
fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
schema = StructType(fields)
df1 = spark.createDataFrame(data_new, schema)
df1.show()

+--------------------+--------------------+
|                name|             content|
+--------------------+--------------------+
|file:/C:/Users/va...|The American Matr...|
|file:/C:/Users/va...|The American Woma...|
|file:/C:/Users/va...|Manual For Army C...|
|file:/C:/Users/va...|"Aunt Babette's" ...|
|file:/C:/Users/va...|THE 

IDEAL BAR...|
|file:/C:/Users/va...|A bookplate illus...|
|file:/C:/Users/va...|THE BLUE GRASS 
...|
|file:/C:/Users/va...|THE  BOSTON COOKI...|
|file:/C:/Users/va...|Breakfast, Lunche...|
|file:/C:/Users/va...|Practical  Housek...|
|file:/C:/Users/va...|Cooking in old Cr...|
|file:/C:/Users/va...|Dr. Chase's Recip...|
|file:/C:/Users/va...|Chinese-Japanese ...|
|file:/C:/Users/va...|Chocolate and Coc...|
|file:/C:/Users/va...|Common Sense in T...|
|file:/C:/Users/va...|The Complete Conf...|
|file:/C:/Users/va...|The Cook's Own Bo...|
|file:/C:/Users/va...|La Cuisine Creole...|
|file:/C:/Users/va...|Directions for Co...|
|file:/C:/Users/va...|Dishes &am

In [4]:
def get_shingles(text, char_ngram=5):
    """Create the distinct overlapping character n-grams (shingles) of a text.

    Only full length character n-grams are created: the first n-gram is the
    first `char_ngram` characters of `text`, the last n-gram is the final
    `char_ngram` characters, and no padding is applied. Consecutive n-grams
    start exactly one character apart. Duplicates are removed.

    Parameters
    ----------

    text: str or None
        The string from which the character n-grams are created. ``None``
        (e.g. a null column value passed in by a UDF) yields an empty list.

    char_ngram: int (default 5)
        Length of each character n-gram. Expected to be a positive integer.

    Returns
    -------

    list of str
        The distinct n-grams in arbitrary order (they pass through a set).
        Empty when `text` is ``None`` or shorter than `char_ngram`.
    """
    if text is None:
        return []
    # Index of the last position at which a full-length n-gram still fits.
    # The range below must include it (hence the +1); stopping one short would
    # drop the final shingle and return nothing for len(text) == char_ngram.
    last_start = len(text) - char_ngram
    return list({text[head:head + char_ngram] for head in range(last_start + 1)})

def myudf(y):
    """Return the character shingles of one document's text (default n-gram length)."""
    return get_shingles(y)


# Wrap the Python function as a Spark UDF that returns an array of strings.
myUDF = udf(myudf, ArrayType(StringType()))

# Add the shingle column computed from the text, then discard the raw text so
# only the file name and its shingles remain.
newDF = (
    df1
    .withColumn("shingles", myUDF(df1["content"]))
    .drop("content")
)
newDF.show()

+--------------------+--------------------+
|                name|            shingles|
+--------------------+--------------------+
|file:/C:/Users/va...|[ Or, ,  befo, un...|
|file:/C:/Users/va...|[ted m, k, bo, ct...|
|file:/C:/Users/va...|[ted m,  Litt, rt...|
|file:/C:/Users/va...|[ted m, Glitt, gn...|
|file:/C:/Users/va...|[ors t, msen , te...|
|file:/C:/Users/va...|[ted m,  Litt, s ...|
|file:/C:/Users/va...|[ Litt, ets c, 
T...|
|file:/C:/Users/va...|[ted m, k, bo, us...|
|file:/C:/Users/va...|[ted m, ct ev, y,...|
|file:/C:/Users/va...|[k, bo, ted m, ct...|
|file:/C:/Users/va...|[k, bo, vin d, RN...|
|file:/C:/Users/va...|[ted m, k, bo, gn...|
|file:/C:/Users/va...|[led  , If to, hi...|
|file:/C:/Users/va...|[ted m,  MILL,  L...|
|file:/C:/Users/va...|[ted m, k, bo, y,...|
|file:/C:/Users/va...|[ted m,  Litt, y ...|
|file:/C:/Users/va...|[ted m, k, bo, et...|
|file:/C:/Users/va...|[ Litt, S-- U, y ...|
|file:/C:/Users/va...|[ted m,  Litt, y ...|
|file:/C:/Users/va...|[ted m,  L

In [5]:
from pyspark.ml.feature import CountVectorizer,MinHashLSH
# Turn each document's shingle list into a sparse term-count vector.
# vocabSize caps the vocabulary at 100000 entries; minDF=10.0 keeps only
# shingles that occur in at least 10 documents.
cv = CountVectorizer(inputCol="shingles", outputCol="features", vocabSize=100000, minDF=10.0)
# The fitted vocabulary model gets its own name so it is not overwritten by the
# LSH model below and stays available for inspection (e.g. cv_model.vocabulary).
cv_model = cv.fit(newDF)
result = cv_model.transform(newDF)

# MinHash the feature vectors; the fixed seed makes the hash values repeatable.
# NOTE(review): numHashTables is left at its default, so each document gets a
# single hash value -- matching hashes are only a coarse similarity signal.
mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345)
# `model` ends up bound to the fitted MinHashLSH model, as before.
model = mh.fit(result)
resultDF = model.transform(result)
print("Showing files with their hashes, the ones with same hash value have almost similar content")
resultDF.show()

Showing files with their hashes, the ones with same hash value have almost similar content
+--------------------+--------------------+--------------------+---------------+
|                name|            shingles|            features|         hashes|
+--------------------+--------------------+--------------------+---------------+
|file:/C:/Users/va...|[ Or, ,  befo, un...|(65009,[0,1,2,3,4...|[[2.6514721E7]]|
|file:/C:/Users/va...|[ted m, k, bo, ct...|(65009,[0,1,2,3,4...|    [[32321.0]]|
|file:/C:/Users/va...|[ted m,  Litt, rt...|(65009,[0,1,2,3,4...|    [[32321.0]]|
|file:/C:/Users/va...|[ted m, Glitt, gn...|(65009,[0,1,2,3,4...|    [[32321.0]]|
|file:/C:/Users/va...|[ors t, msen , te...|(65009,[0,1,2,3,4...|   [[482803.0]]|
|file:/C:/Users/va...|[ted m,  Litt, s ...|(65009,[0,1,2,3,4...|    [[32321.0]]|
|file:/C:/Users/va...|[ Litt, ets c, 
T...|(65009,[0,1,2,3,4...|    [[72152.0]]|
|file:/C:/Users/va...|[ted m, k, bo, us...|(65009,[0,1,2,3,4...|    [[32321.0]]|
|file:/C:/Users/va