# Find Anagrams in Sherlock Holmes

- #### Find all anagrams of words longer than 6 letters 
- #### Find the longest list of anagrams for a word
- #### Find the longest word with anagrams

In [None]:
import org.apache.spark.sql.SparkSession

// Obtain (or reuse) the Spark session for this notebook.
val spark = SparkSession.builder().appName("Scala Pair RDD Functions").getOrCreate()

// One RDD element per line of the input text file.
val lines = spark.sparkContext.textFile("/home/jovyan/Resources/sherlock-holmes.txt")

// A word is a maximal run of lower-case ASCII letters.
val pattern = "[a-z]+".r

// Lower-case every line, then use the regular expression to extract its words.
val words = lines.map(l => l.toLowerCase).flatMap(l => pattern.findAllIn(l))

// Keep only words longer than 6 letters and drop duplicates.
val dstnc = words.filter(w => w.length > 6).distinct()

// Build a pair RDD where the key is the word with its letters sorted
// and the initial value is a single-element array holding the word itself.
// Any two words sharing the same sorted form are anagrams of each other.
val pairs = dstnc.map(w => (w.sorted, Array(w)))

// reduceByKey concatenates the arrays of words that share a key,
// values drops the keys as they are no longer needed,
// filter keeps only the arrays with more than one word (true anagrams).
// The result is cached because several actions below consume it; without
// persistence each action would recompute the lineage.
val angrs = pairs
  .reduceByKey((w1, w2) => w1 ++ w2)
  .values
  .filter(group => group.length > 1)
  .cache()

// Collect every anagram group to the driver and print one group per line.
println("All anagrams of words longer than 6 letters\n")
angrs.collect().foreach { group =>
  // Every word is followed by a single space; the line ends after the last word.
  println(group.map(word => word + " ").mkString)
}

// Pair every anagram group with its number of words and keep the largest group.
// fold with an empty-group zero value is used instead of reduce so that an RDD
// without any anagram groups yields (empty array, 0) rather than throwing.
// Real groups always hold at least two words, so the zero value never wins
// against them; on equal sizes the right-hand group is kept.
val maxList = angrs
  .map(a => (a, a.size))
  .fold((Array.empty[String], 0))((r, c) => if (r._2 > c._2) r else c)
println("\nThe longest list of anagrams for word\n")
// Print the words of the winning group on one line, each followed by a space.
maxList._1.foreach(a => print(a + " "))
println()

// Pair every anagram group with the length of its first word (all words in a
// group are anagrams of each other, hence equally long) and keep the group
// with the longest words.
// fold with an empty-group zero value is used instead of reduce so that an RDD
// without any anagram groups yields (empty array, 0) rather than throwing.
// On equal lengths the right-hand group is kept.
val maxSize = angrs
  .map(a => (a, a(0).length))
  .fold((Array.empty[String], 0))((r, c) => if (r._2 > c._2) r else c)
println("\nThe longest word with anagrams\n")
// Print the words of the winning group on one line, each followed by a space.
maxSize._1.foreach(a => print(a + " "))
println()
