In [0]:
from pyspark import SparkFiles
from graphframes import GraphFrame
from pyspark.sql.functions import count, desc, asc

class Graphframes:
    def __init__(self, inp):
        
        spark.sparkContext.addFile(inp)
        self.reddit = spark.read.csv("file://"+SparkFiles.get("soc-redditHyperlinks-body.tsv"), sep=r'\t', header = True)
        
        self.createGraph()
        self.solutions()
    def createGraph(self):
        src = self.reddit.select("SOURCE_SUBREDDIT").distinct()
        dst = self.reddit.select("TARGET_SUBREDDIT").distinct()
        # spark dataframe union retains duplicates, hence we use distinct
        self.vertices = src.union(dst).distinct().withColumnRenamed("SOURCE_SUBREDDIT", "id")
        
        self.edges = self.reddit.select("SOURCE_SUBREDDIT", "TARGET_SUBREDDIT", "LINK_SENTIMENT")\
        .withColumnRenamed("SOURCE_SUBREDDIT", "src")\
        .withColumnRenamed("TARGET_SUBREDDIT", "dst")\
        .withColumnRenamed("LINK_SENTIMENT", "sentiment")
        
    def solutions(self):
        g = GraphFrame(self.vertices, self.edges) # step that created the graph
        g.cache()
                
        #Find the top 5 nodes with the highest outdegree and find the count of the number of outgoing edges in each
        display(g.outDegrees.orderBy(desc("outDegree")).head(5))
        
        #Find the top 5 nodes with the highest indegree and find the count of the number of incoming edges in each
        display(g.inDegrees.orderBy(desc("inDegree")).head(5))
        
        #Calculate PageRank for each of the nodes and output the top 5 nodes with the highest PageRank values
        ranks = g.pageRank(maxIter=10)
        ranks.cache()
        display(ranks.vertices.orderBy(desc("pagerank")).head(5))
        
        
        !mkdir /databricks/driver/checkpoints
        # we need checkpointing to save temperory data used in big processes like finding connected components
        sc.setCheckpointDir('/databricks/driver/checkpoints')
        
        # Run the connected components algorithm on it and find the top 5 components with the largest number of nodes.
        display(g.connectedComponents().orderBy(desc('component')).head(5))
        
        # Run the triangle counts algorithm on each of the vertices and output the top 5 vertices with the largest triangle count
        display(g.triangleCount().orderBy(desc("count")).head(5))

Graphframes("https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv")
        

id,outDegree
subredditdrama,4665
circlebroke,2358
shitliberalssay,1968
outoftheloop,1958
copypasta,1824


id,inDegree
askreddit,7329
iama,3694
pics,2779
writingprompts,2490
videos,2446


id,pagerank
askreddit,592.0040787061722
iama,484.09984860204776
videos,312.0979902738428
pics,242.5159664733212
leagueoflegends,189.4874381045425


id,component
stephaniemichelle,1692217114753
ultimatepatreon,1692217114753
challenger,1632087572613
srt,1632087572613
lifepluslair,1614907703381


count,id
31967,askreddit
26072,subredditdrama
24581,iama
15898,outoftheloop
11938,videos


Out[14]: <__main__.Graphframes at 0x7f8a9704f0d0>