In [1]:
import os
# Tell the PySpark launcher to pull in the GraphFrames package (coordinates
# graphframes:graphframes:0.6.0-spark2.3-s_2.11) when it starts the shell.
# The variable is set here, ahead of the pyspark import and the SparkContext
# creation in the next cell, so it is already in the environment at that point.
# NOTE(review): the coordinate suffix presumably has to match the installed
# Spark / Scala versions -- confirm when upgrading.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11 pyspark-shell'

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import udf

In [2]:
# Build (or reuse) a local Spark session. Going through the builder's
# getOrCreate() keeps this cell idempotent: constructing SparkContext directly
# raises "Cannot run multiple SparkContexts at once" when the cell is re-run
# or a context already exists in the kernel.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('notebook')
    .getOrCreate()
)
# Expose the underlying SparkContext under the name `sc`, as before.
sc = spark.sparkContext

In [3]:
from graphframes import *
import hashlib

### Load sample webgraph

In [4]:
# Load the sample outlink data from the parquet part files under data/outlinks_pq/.
# This dataset is already filtered to include only links between TLDs, not within TLDs.
# I also filtered out common sites and resources for a cleaner sample graph.
raw_data = spark.read.parquet("data/outlinks_pq/*.snappy.parquet")
# Number of raw link rows; displayed as the cell output.
raw_data.count()

18399

In [5]:
# Replace the positional column names (_c0 .. _c3) with descriptive ones and
# keep only rows where both the parent and the child TLD are present.
df = (
    raw_data
    .withColumnRenamed("_c0", "parent")
    .withColumnRenamed("_c1", "parentTLD")
    .withColumnRenamed("_c2", "childTLD")
    .withColumnRenamed("_c3", "child")
    .filter("parentTLD is not null and childTLD is not null")
)

In [6]:
# Preview the first rows to check the renamed columns.
df.show(5)

+--------------+---------+----------------+--------------------+
|        parent|parentTLD|        childTLD|               child|
+--------------+---------+----------------+--------------------+
|http://msn.com|  msn.com|tradedoubler.com|https://clk.trade...|
|http://msn.com|  msn.com|   microsoft.com|https://go.micros...|
|http://msn.com|  msn.com|     outlook.com|http://www.outloo...|
|http://msn.com|  msn.com|   microsoft.com|https://advertisi...|
|http://msn.com|  msn.com|tradedoubler.com|https://clk.trade...|
+--------------+---------+----------------+--------------------+
only showing top 5 rows



In [7]:
# The graph's nodes are the TLDs. Flatten the (parentTLD, childTLD) pairs into
# a single RDD of names and de-duplicate it, so each node can be given one id.
aggcodes = (
    df.select("parentTLD", "childTLD")
    .rdd
    .flatMap(lambda row: row)
    .distinct()
)
# Number of distinct nodes; displayed as the cell output.
aggcodes.count()

4613

In [8]:
def hashnode(x, length=8):
    """Return a short, deterministic hexadecimal id for a node name.

    The id is the first ``length`` characters of the SHA-1 hex digest of the
    UTF-8 encoded name, so the same name always maps to the same id.

    Args:
        x: Node name as a string. ``None`` is passed through as ``None`` so
            that a null value yields a null id instead of raising
            ``AttributeError``.
        length: Number of leading hex characters to keep. The default of 8
            (32 bits) matches the ids used throughout this notebook; a SHA-1
            hex digest has 40 characters, and keeping more of them lowers the
            chance that two different names share an id.

    Returns:
        The truncated hex digest as a string, or ``None`` if ``x`` is ``None``.
    """
    if x is None:
        return None
    return hashlib.sha1(x.encode("UTF-8")).hexdigest()[:length]

# Wrap hashnode as a Spark UDF so it can be applied to DataFrame columns.
# No return type is passed, so PySpark's default (StringType) is used.
hashnode_udf = udf(hashnode)

In [9]:
# Vertex table: one row per distinct TLD, keyed by its hashed id.
vertices = aggcodes.map(lambda tld: (hashnode(tld), tld)).toDF(["id", "name"])

# hashnode() keeps only a prefix of the digest, so two different TLDs could in
# principle end up with the same id. Duplicate ids would silently merge nodes
# in the graph and duplicate rows in later joins, so fail loudly instead.
n_vertices = vertices.count()
n_ids = vertices.select("id").distinct().count()
assert n_ids == n_vertices, (
    f"hash collision: {n_vertices} TLDs map to only {n_ids} distinct ids"
)

vertices.show(5)

+--------+----------------+
|      id|            name|
+--------+----------------+
|000db143|         msn.com|
|51a48ea2|tradedoubler.com|
|31312317|   microsoft.com|
|a45016f2|     outlook.com|
|2f5bf4c8|        bing.com|
+--------+----------------+
only showing top 5 rows



In [10]:
# Edge table: every parent -> child TLD link expressed as a (src, dst) pair of
# hashed ids, computed with the same hash function as the vertex ids.
edges = df.select(
    hashnode_udf("parentTLD").alias("src"),
    hashnode_udf("childTLD").alias("dst"),
)

edges.show(5)

+--------+--------+
|     src|     dst|
+--------+--------+
|000db143|51a48ea2|
|000db143|31312317|
|000db143|a45016f2|
|000db143|31312317|
|000db143|51a48ea2|
+--------+--------+
only showing top 5 rows



In [15]:
# Create the GraphFrame from the vertex table (columns: id, name) and the
# edge table (columns: src, dst), both built in the cells above.
graph = GraphFrame(vertices, edges)

In [17]:
# Display the GraphFrame repr to check the vertex and edge schemas.
graph

GraphFrame(v:[id: string, name: string], e:[src: string, dst: string])

### Label Propagation Algorithm

In [None]:
# Run LPA (label propagation) on the graph, capped at 5 iterations.
# The result is kept in `communities`; a later cell reads its "label" column.
communities = graph.labelPropagation(maxIter=5)

In [None]:
# Mark the LPA result for caching (it is queried again in the next cell),
# then preview the first 10 rows.
communities.persist().show(10)

In [None]:
# Count the distinct values of the "label" column and report the total.
community_count = communities.select('label').distinct().count()
print(f"There are {community_count} communities in sample graph.")

### Degrees and PageRank

In [16]:
# Rank nodes by in-degree: join the per-node in-degree with the vertex table
# to attach names, then show the 10 highest.
(
    graph.inDegrees
    .join(vertices, on="id")
    .orderBy("inDegree", ascending=False)
    .show(10)
)

+--------+--------+-------------+
|      id|inDegree|         name|
+--------+--------+-------------+
|465806fb|    1223|  twitter.com|
|b7c70898|    1154| facebook.com|
|baea954b|     584|   google.com|
|d84f4904|     374|pinterest.com|
|366b6783|     358|       sky.it|
|06252e37|     295|instagram.com|
|d7e222c8|     266|  youtube.com|
|1a8028a6|     172|        po.st|
|cd0cf82b|     167|    zoznam.sk|
|de740f90|     148|    apple.com|
+--------+--------+-------------+
only showing top 10 rows



In [17]:
# Run PageRank for a fixed 20 iterations, then list the 10 highest-ranked
# nodes together with their names.
# NOTE(review): resetProbability=0.01 is well below the commonly used 0.15 --
# confirm this is intentional.
results = graph.pageRank(resetProbability=0.01, maxIter=20)
(
    results.vertices
    .select("id", "pagerank")
    .join(vertices, on="id")
    .orderBy("pagerank", ascending=False)
    .show(10)
)

+--------+------------------+-------------+
|      id|          pagerank|         name|
+--------+------------------+-------------+
|15a17dcc| 72.02193053242661|messenger.com|
|b7c70898| 71.82925336390569| facebook.com|
|465806fb| 62.38821611443578|  twitter.com|
|d7e222c8| 47.78395449650086|  youtube.com|
|baea954b|29.768891546015652|   google.com|
|06252e37|21.365246391507494|instagram.com|
|d84f4904|13.619116925738885|pinterest.com|
|de740f90|10.538651654351058|    apple.com|
|1b274516| 4.731072973007598| linkedin.com|
|f0a549e2|4.4062056573357875|        ds.tl|
+--------+------------------+-------------+
only showing top 10 rows

