In [1]:
import os
from pyspark import SparkContext, SparkConf

# Put the warcbase fat jar on the Spark classpath. This is set before the
# SparkContext is created so that the JVM it launches picks up the --jars option.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars /warcbase-core/target/warcbase-core-0.1.0-SNAPSHOT-fatjar.jar pyspark-shell'

conf = SparkConf().setAppName("warcbase-py")
# getOrCreate() reuses a context that is already running in this kernel, so
# re-executing this cell does not fail with a "multiple SparkContexts" error.
sc = SparkContext.getOrCreate(conf=conf)

print("READY")

In [48]:
from urlparse import urlparse
from operator import add

# Location of the example ARC archive to read.
path = "/warcbase-core/src/test/resources/arc/example.arc.gz"

# Hook in the warcbase archive file reader. Each element of the resulting RDD
# is a (key, record) pair; the value converter turns the record into an object
# that the code below indexes by 'url', 'date' and 'mime-type'.
rdd = sc.newAPIHadoopFile(
    path,
    inputFormatClass="org.warcbase.mapreduce.WacArcInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.warcbase.io.ArcRecordWritable",
    valueConverter="org.warcbase.spark.pythonconverters.ArcRecordWritableToHtmlConverter",
    conf=None,
)

# Note that we do not support the built-in keepValidPages as that's in the Scala extensions
# So, we re-implement here:
def keepValidPages(r):
    """Decide whether an archive record should be kept as a valid HTML page.

    Args:
        r: A record indexable by 'url', 'date' and 'mime-type', or None.

    Returns:
        True when the record has a date and a URL, is not a robots.txt file,
        and either has an HTML/XHTML mime-type or a URL ending in "htm" or
        "html"; False otherwise.
    """
    # Strip out empty records and ones without dates:
    if r is None or r['date'] is None:
        return False
    url = r['url']
    # A record without a URL cannot be matched against any pattern, so reject
    # it rather than failing on None.endswith(); also drop robots.txt files:
    if url is None or url.endswith("/robots.txt"):
        return False
    # Filter in based on allowed mime-types and URL patterns:
    if r['mime-type'] in ("text/html", "application/xhtml+xml"):
        return True
    return url.endswith(("htm", "html"))

# Filter the pages and extract the hostnames:
# Keep only the (key, record) pairs whose record is a valid page:
valid_pairs = rdd.filter(lambda pair: keepValidPages(pair[1]))

# Reduce each remaining record to the hostname of its URL:
hostnames = valid_pairs.map(lambda pair: urlparse(pair[1]['url']).hostname)

# Pair every hostname with a 1 and sum per hostname to get its frequency:
r = hostnames.map(lambda host: (host, 1)).reduceByKey(add)

# Bring the (hostname, count) pairs back to the driver:
results = r.collect()

print(results)

[(u'deadlists.com', 2), (u'www.archive.org', 132), (u'www.hideout.com.br', 1)]


In [50]:
!pip install plotly


Collecting plotly
  Downloading plotly-2.0.7.tar.gz (923kB)
[K    100% |████████████████████████████████| 931kB 365kB/s ta 0:00:01
Building wheels for collected packages: plotly
  Running setup.py bdist_wheel for plotly ... [?25ldone
[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/83/86/c0/37a6cf53adca47bb03240f069c4348cea99f9e6f159948ee91
Successfully built plotly
Installing collected packages: plotly
Successfully installed plotly-2.0.7


In [58]:
import plotly.offline as py
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode before drawing anything.
py.init_notebook_mode(connected=True)

# Split the collected (host, count) pairs into parallel label/value lists.
labels = [host for host, count in results]
values = [count for host, count in results]

# One pie slice per hostname, sized by its page count.
trace = go.Pie(labels=labels, values=values)

py.iplot([trace], filename="temp")
