# Find Anagrams in Sherlock Holmes

In [None]:
from pyspark.sql import SparkSession

# obtain the Spark session for this notebook (getOrCreate returns an existing one if present)
spark = (
    SparkSession.builder
    .appName("Python Pair RDD Functions")
    .getOrCreate()
)

# load the text of the book as an RDD of lines
file = "/home/jovyan/Resources/sherlock-holmes.txt"
lines = spark.sparkContext.textFile(file)

import re

# split every line into words on runs of spaces/punctuation and lower-case them
# NOTE: the delimiter class is quantified with `+`, not `*` -- a pattern that
# can match the empty string makes re.split (Python 3.7+) cut between every
# single character, which would yield letters instead of words
# the pattern is a raw string so the backslash escapes reach the regex engine as written
words = (
    lines
    .flatMap(lambda line: re.split(r'[ \],.:;?!\-@#()\\*"/]+', line))
    .map(lambda word: word.lower())
)

# keep only the words with more than 6 characters, then remove duplicates
dstnc = (
    words
    .filter(lambda word: len(word) > 6)
    .distinct()
)

# build a pair RDD keyed by each word's letters in sorted order; the value
# starts out as a one-element list holding the word itself
# words that share a key consist of the same letters, i.e. they are anagrams

pairs = dstnc.map(
    lambda word: (''.join(sorted(word)), [word])
)

# merge the word lists of equal keys by concatenating them, discard the keys
# as they're no longer needed, and keep only the lists holding more than
# one word (true anagrams)

angrs = (
    pairs
    .reduceByKey(lambda left, right: left + right)
    .values()
    .filter(lambda group: len(group) > 1)
)

# fetch the anagram groups to the driver and print one group per line
for group in angrs.collect():
    print(group)