In [1]:
#  Root of the blob container holding the data set.  Point it at your own container and
#  storage account:  "wasbs://<container>@<storage account>.blob.core.windows.net/"
pathPrefix = "wasbs://marvel@vpldb.blob.core.windows.net/"
#  Build the full location of porgat.txt, then load it from the storage account
porgatPath = pathPrefix + "porgat.txt"
file = sc.textFile(porgatPath)

In [2]:
#  Highest id denoting a character ; ids above it denote publications.
#  The value is part of the initial header of the file ; we hardcode it here.
LAST_CHARACTER_ID = 6486

#  Remove the headers from the file:  lines starting with a star (empty lines are dropped too)
noHeaders = file.filter(lambda line: len(line) > 0 and line[0] != '*')
#  Extract a pair from each line:  the leading integer and a string for the rest of the line.
#  str.partition always returns a 3-tuple (head, separator, tail) ; lines where the head or the
#  tail is empty are discarded.
paired = (noHeaders
          .map(lambda line: line.partition(' '))
          .filter(lambda parts: len(parts[0]) > 0 and len(parts[2]) > 0)
          .map(lambda parts: (int(parts[0]), parts[2])))
#  Keep relationships (their text does not start with a quote), then split the integer list.
#  split() without argument tolerates repeated or trailing whitespace, which would otherwise
#  produce empty tokens and make int() raise a ValueError.
#  Lambdas index into the (id, text) pair instead of using tuple parameters, which Python 3 removed.
scatteredRelationships = (paired
                          .filter(lambda kv: kv[1][0] != '"')
                          .map(lambda kv: (kv[0], [int(token) for token in kv[1].split()])))
#  Relationships for the same character id sometimes span more than one line in the file,
#  so let's concatenate their publication lists per character id
relationships = scatteredRelationships.reduceByKey(lambda pubList1, pubList2: pubList1 + pubList2)
#  Keep non-relationships (their text starts with a quote) ; drop the first and last character
#  of the text (the surrounding quotes) and trim whitespace
nonRelationships = (paired
                    .filter(lambda kv: kv[1][0] == '"')
                    .map(lambda kv: (kv[0], kv[1][1:-1].strip())))
#  Characters are the entries with an id up to LAST_CHARACTER_ID
characters = nonRelationships.filter(lambda kv: kv[0] <= LAST_CHARACTER_ID)
#  Publications start after the characters
publications = nonRelationships.filter(lambda kv: kv[0] > LAST_CHARACTER_ID)

In [3]:
#  Let's find the characters appearing together most often

def _isOrderedPair(pair):
    """Return True when the first character id of the pair is smaller than the second one.

    pair is ((charId1, pubList1), (charId2, pubList2)), as produced by the cartesian product.
    """
    (charId1, _pubList1), (charId2, _pubList2) = pair
    return charId1 < charId2

def _sharedPublications(pair):
    """Turn ((charId1, pubList1), (charId2, pubList2)) into ((charId1, charId2), common publications)."""
    (charId1, pubList1), (charId2, pubList2) = pair
    # Python sets give the intersection of the two publication lists
    return ((charId1, charId2), list(set(pubList1) & set(pubList2)))

#  Let's take the relationship RDD and do a cartesian product with itself to get all possible duos ;
#  we repartition to be able to scale
product = relationships.repartition(100).cartesian(relationships)
#  Let's eliminate duplicates first (each duo appears in both orders, plus each character paired
#  with itself) so that the intersection below is only computed for the duos we keep
orderedPairs = product.filter(_isOrderedPair)
#  Let's then remap to have the character ids together and intersect their publications
noDoublons = orderedPairs.map(_sharedPublications)
#  Let's remove empty publication lists
noEmptyPublications = noDoublons.filter(lambda entry: len(entry[1]) > 0)
#  Let's flip the mapping in order to sort by number of common publications (descending)
#  & drop the publication lists themselves.
#  The result is not named "sorted" so that Python's builtin sorted() stays usable in the notebook.
sortedByCount = noEmptyPublications.map(lambda entry: (len(entry[1]), entry[0])).sortByKey(False)
#  Action:  let's output the first 10 results
top10 = sortedByCount.take(10)

In [4]:
#  The helpers below unpack their argument in the function body:  tuple parameters in lambdas
#  are not valid syntax in Python 3.

def _keyByFirstCharacter(row):
    """(pubCount, (charId1, charId2)) -> (charId1, (charId2, pubCount))"""
    pubCount, (charId1, charId2) = row
    return (charId1, (charId2, pubCount))

def _keyBySecondCharacter(row):
    """(charId1, ((charId2, pubCount), name1)) -> (charId2, (name1, charId1, pubCount))"""
    charId1, ((charId2, pubCount), firstName) = row
    return (charId2, (firstName, charId1, pubCount))

def _formatRow(row):
    """(charId2, ((name1, charId1, pubCount), name2)) -> (pubCount, (name1, charId1, name2, charId2))"""
    charId2, ((firstName, charId1, pubCount), secondName) = row
    return (pubCount, (firstName, charId1, secondName, charId2))

#  Join once for the first character ; we first need to flip the RDD to have charId1 as the key
name1 = sc.parallelize(top10).map(_keyByFirstCharacter).join(characters)
#  Let's perform a similar join on the second character
name2 = name1.map(_keyBySecondCharacter).join(characters)
#  Let's format the RDD a bit
formattedTop10 = name2.map(_formatRow)

#  We need to sort the results again:  when we parallelized the top10 it got partitioned and each partition moved independently
formattedTop10.sortByKey(False).collect()