In [1]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth
import math

# Executor memory has to be configured before a SparkContext is instantiated;
# if the kernel already created one, this setting does not affect it.
SparkContext.setSystemProperty('spark.executor.memory', '5g')

# Define `sc` explicitly so the notebook does not depend on a kernel that
# injects it: getOrCreate() reuses an active context or starts a new one.
sc = SparkContext.getOrCreate()

# Build one transaction per "#@" line of publications.txt:
#   * keep only lines that start with "#@" and have content after the marker,
#   * drop the 2-character marker, split on "," and de-duplicate through a set
#     (presumably each "#@" line is the author list of one paper - verify
#     against the dataset description),
#   * persist the RDD because it is reused by every FP-Growth run.
authors = (
    sc.textFile("publications.txt")
    .filter(lambda line: line.startswith("#@") and len(line.strip()) > 2)
    .map(lambda line: list(set(line[2:].strip().split(","))))
    .persist()
)

In [2]:
def getFrequentItems(authors, support):
    """Run FP-Growth over ``authors`` and return the frequent itemsets.

    Args:
        authors: the transactions handed to ``FPGrowth.train`` (in this
            notebook, the persisted RDD of per-line author lists).
        support: minimum support threshold, forwarded as ``minSupport``.

    Returns:
        The collected (driver-side) result of ``model.freqItemsets()``.
    """
    # Training is always requested with a single partition.
    fp_model = FPGrowth.train(authors, minSupport=support, numPartitions=1)
    return fp_model.freqItemsets().collect()

In [3]:
# Mine frequent itemsets with a minimum support of 1e-4 and report how many
# were found. Note that `result` is overwritten by every mining cell.
result = getFrequentItems(authors, support=1e-4)
print("Number of frequent itemsets for min support of 1e-4 is ", len(result))

Number of frequent itemsets for min support of 1e-4 is  665


In [4]:
# Mine frequent itemsets with a minimum support of 1e-5 and report how many
# were found. Note that `result` is overwritten by every mining cell.
result = getFrequentItems(authors, support=1e-5)
print("Number of frequent itemsets for min support of 1e-5 is ", len(result))

Number of frequent itemsets for min support of 1e-5 is  58955


In [5]:
# Mine frequent itemsets with a minimum support of 0.5e-5 (= 5e-6) and report
# how many were found. Note that `result` is overwritten by every mining cell.
result = getFrequentItems(authors, support=5e-6)
print("Number of frequent itemsets for min support of 0.5e-5 is ", len(result))

Number of frequent itemsets for min support of 0.5e-5 is  168269


In [None]:
# Mine frequent itemsets with a minimum support of 1e-6.
# NOTE: this cell carries no execution count and no recorded output; the notes
# that follow it report that this threshold exhausts the heap. If the call
# fails, `result` keeps the value assigned by the previously executed cell.
result = getFrequentItems(authors, support=1e-6)
print("Number of frequent itemsets for min support of 1e-6 is ", len(result))

<b>As we successively decrease the support threshold from 1e-4 to 1e-6, the FP-Tree grows rapidly and the number of frequent itemsets computed by the FP-Growth algorithm grows almost exponentially.
<br>

Threshold  #itemsets <br>
1e-4    =>  665<br>
1e-5    =>  58955<br>
0.5e-5  =>  168269<br>
1e-6    =>  ??<br>
<br>

When the threshold is dropped as low as 1e-6, the FP-Tree fills the entire heap memory and an out-of-memory exception is thrown.</b>

In [7]:
import itertools
import operator

# prints a list of the top co-authors (5 by default) for a given author
def getTopCoAuthors(result, name, top_n=5):
    """Print the highest-scoring co-authors of ``name``, one per line.

    Every itemset in ``result`` that contains ``name`` adds its ``freq`` to
    the score of each other member of that itemset. Co-authors are then
    ranked by descending score, with ties broken by ascending name.

    NOTE(review): itemsets of every size contribute, so a collaboration that
    appears as {A, B} and again inside {A, B, C} is added twice to B's score.
    Confirm this weighting is intended rather than using pair itemsets only.

    Args:
        result: iterable of frequent itemsets; each element must expose
            ``items`` (the members) and ``freq`` (a numeric count).
        name: the author whose co-authors are ranked.
        top_n: maximum number of names to print (default 5). Fewer are
            printed when fewer co-authors were found; values below 1 print
            nothing.

    Returns:
        None. The names are written to standard output.
    """
    coAuthors = {}
    for fi in result:
        if name not in fi.items:
            continue
        for coAuth in fi.items:
            if coAuth != name:
                coAuthors[coAuth] = coAuthors.get(coAuth, 0) + fi.freq

    # One sort on (-score, name) gives the same order as sorting by name and
    # then stably re-sorting by score in descending order.
    ranked = sorted(coAuthors.items(), key=lambda pair: (-pair[1], pair[0]))
    for coAuth, _score in ranked[:max(top_n, 0)]:
        print(coAuth)

In [8]:
# Five highest-ranked co-authors of Rakesh Agrawal, computed from whichever
# `result` the most recently executed mining cell left behind.
name = "Rakesh Agrawal"
getTopCoAuthors(result=result, name=name)

Ramakrishnan Srikant
Jerry Kiernan
H. V. Jagadish
Michael J. Carey
Roberto J. Bayardo Jr.


In [9]:
# Five highest-ranked co-authors of Jiawei Han, computed from whichever
# `result` the most recently executed mining cell left behind.
name = "Jiawei Han"
getTopCoAuthors(result=result, name=name)

Xifeng Yan
Philip S. Yu
Deng Cai
Xiaofei He
Hong Cheng


In [10]:
# Highest-ranked co-authors (at most five) of Zoubin Ghahramani. Fewer than
# five names are printed when fewer co-authors share a frequent itemset with
# this author in the current `result`.
name = "Zoubin Ghahramani"
getTopCoAuthors(result=result, name=name)

David L. Wild
Katherine A. Heller
Michael I. Jordan


In [11]:
# Five highest-ranked co-authors of Christos Faloutsos, computed from whichever
# `result` the most recently executed mining cell left behind.
name = "Christos Faloutsos"
getTopCoAuthors(result=result, name=name)

Agma J. M. Traina
Caetano Traina Jr.
Hanghang Tong
Spiros Papadimitriou
Jimeng Sun
