In [1]:
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
from pyspark import SparkContext
from pyspark.mllib.fpm import FPGrowth
import math

SparkContext.setSystemProperty('spark.executor.memory', '5g')

In [2]:
filePath = "publications.txt"
# filePath = "sample-data.txt"
file = open(filePath,'r')

authorVenues = {}   #{Author, list(Venues)}

for line in file:      
         
    # get list for authors for each publication    
    if line.startswith("#@"):
        line = line[2:].strip()
        if line != "":
            validAuthors = line.split(",")

    # construct list of venues for each author
    if line.startswith("#c"):
        venue = line[2:].strip()
        if venue != "":        
            for author in validAuthors:
                if authorVenues.get(author) is None:
                    authorVenues[author] = [venue]
                else:
                    authorVenues[author] = authorVenues.get(author) + [venue]
                authorVenues[author] = list(set(authorVenues.get(author)))                        

In [3]:
venues = sc.parallelize(list(authorVenues.values()))

In [6]:
support = 0.001
model = FPGrowth.train(venues, minSupport=support, numPartitions=1)
result = model.freqItemsets().collect()
print(len(result))

992


In [None]:
support = 0.0004 # or 0.00001
model = FPGrowth.train(venues, minSupport=support, numPartitions=1)
result = model.freqItemsets().collect()
print(len(result))

<b>On my system, when support is decreased to 0.4e-3 an 1e-4, FP-growth doesn't train the models, even when 5g of memory is assigned to Spark! It might be because, the size of FP-Tree grows exponentially along with the count of possible frequent itemsets. Thereby, consuming all the heap memory!<br>
It throws the following error:<br>
<h3>Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:46314)</h3>


Therefore, I gradually increased the threshold from 0.0004 and noticed that FP-Growth is able to train the models at threashold = 0.0006. So for the rest of the question (and better results), I have used the min-support as 0.0006.</b>

In [4]:
support = 0.0006
model = FPGrowth.train(venues, minSupport=support, numPartitions=1)
result = model.freqItemsets().collect()
print(len(result))

2176


In [5]:
import itertools
import operator

# prints a list of top 10 frequent venues for given seed conference
def getTopVenues(result, name):
    coVenues = {}
    for fi in result:
        if name in fi.items:
            for venue in fi.items:
                if venue!=name:
                    coVenues[venue] = fi.freq if coVenues.get(venue) is None else coVenues.get(venue)+fi.freq
#     print(coVenues)        
    sortedVenues = sorted(sorted(coVenues.items(), key=operator.itemgetter(0)), key=operator.itemgetter(1), reverse=True)
    top10 = itertools.islice(sortedVenues, 10)
    for item in list(top10): 
        print(item[0])  
    
# for some seed conferences like VLDB, we also have "VLDB Surveys" do wee need to consider them
# also "VLDB J."

In [6]:
name = "NIPS"
getTopVenues(result, name)

CoRR
ICML
Neural Computation


In [7]:
name = "KDD"
getTopVenues(result, name)

CoRR
ICDM


In [8]:
name = "VLDB"
getTopVenues(result, name)

ICDE
SIGMOD Conference
CoRR
IEEE Trans. Knowl. Data Eng.


In [9]:
name = "INFOCOM"
getTopVenues(result, name)

GLOBECOM
ICC
IEEE Journal on Selected Areas in Communications
IEEE/ACM Trans. Netw.
CoRR
Computer Networks
Computer Communications
ICDCS
WCNC
IEEE Trans. Parallel Distrib. Syst.


In [10]:
name = "ACL"
getTopVenues(result, name)

COLING
CoRR
LREC
