In [81]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import col, lit, sum as spark_sum

In [82]:
import os
import json
from dotenv import dotenv_values

import pandas as pd

In [83]:
config = dotenv_values("../.env")

In [84]:
config.items()

odict_items([('SPLIT_DATASET_DIR', '/dataset/splitupDataset'), ('LOG_DIR', '/Crawler/logs'), ('OUTPUT_DIR', '../Crawler/output'), ('SCRAPED_DATA', '../Crawler/output')])

In [85]:
spark = SparkSession.builder.appName("PageRank").getOrCreate()

In [86]:
class PageNode:
    def __init__(self, url, forwardLinks):
        self.url = url
        self.forwardLinks = forwardLinks
    def getUrl(self):
        return self.url
    def getOutGoingLinks(self):
        return self.forwardLinks
    
    def __str__(self):
        return f"{self.url}\t=>\t{self.forwardLinks}\n"
    def __repr__(self):
        return f"{self.url}\t=>\t{self.forwardLinks}\n"

In [102]:
pageNodes = {}

dataFiles = os.listdir(config["SCRAPED_DATA"])
for file in (dataFiles):
    with open(os.path.join(config["SCRAPED_DATA"], file)) as f:
        edgeData = json.load(f)
        for url in edgeData['forwardLinks']:
            pageNodes[url] = []
        pageNodes[edgeData['url']]=edgeData['forwardLinks']
        
print(pageNodes)
print(len(pageNodes))

{'https://wpvip.com': [], 'https://apps.wordpress.com': [], 'https://www.bluehost.com': [], 'https://wordpress.com': [], 'https://123moviesfree.app': [], 'https://www.blogger.com': [], 'https://10wontips.blogspot.com': ['https://www.blogger.com'], 'https://18jp.fun': [], 'https://hinkhoj.com': [], 'http://twitter.com': [], 'http://facebook.com': [], 'http://instagram.com': [], 'http://www.pinterest.com': [], 'https://dribbble.com': ['http://twitter.com', 'http://facebook.com', 'http://instagram.com', 'http://www.pinterest.com'], 'https://www.facebook.com': [], 'https://www.instagram.com': [], 'https://1917.co.nz': ['https://www.facebook.com', 'https://www.instagram.com'], 'https://123moviesplay.cc': [], 'https://123moviesfree.icu': ['https://123moviesplay.cc'], 'https://1push.io': [], 'https://guanliqi.zhiouwang.com': [], 'https://vr.zhiouwang.com': [], 'https://zhichan.zhiouwang.com': [], 'https://wpa1.qq.com': [], 'https://1miba.com': ['https://guanliqi.zhiouwang.com', 'https://vr.zh

In [103]:
vertexGraph = spark.createDataFrame(
    [(value,) for value in pageNodes.keys()]
,["vertices"])
vertexGraph.show()
# vertexGraph = spark.createDataFrame(
#     pd.DataFrame({
#         "vertices":['a','b','c','d']
#     })
# )
# vertexGraph.show()

+--------------------+
|            vertices|
+--------------------+
|   https://wpvip.com|
|https://apps.word...|
|https://www.blueh...|
|https://wordpress...|
|https://123movies...|
|https://www.blogg...|
|https://10wontips...|
|    https://18jp.fun|
| https://hinkhoj.com|
|  http://twitter.com|
| http://facebook.com|
|http://instagram.com|
|http://www.pinter...|
|https://dribbble.com|
|https://www.faceb...|
|https://www.insta...|
|  https://1917.co.nz|
|https://123movies...|
|https://123movies...|
|    https://1push.io|
+--------------------+
only showing top 20 rows



In [105]:
edgeData = []
for key, item in pageNodes.items():
    for it in item:
        edgeData.append(Row(source=key, dest=it))

edgesSchema = StructType([StructField("source", StringType(), True),
                     StructField("dest", StringType(), True),]
                     )
edgeGraph = spark.createDataFrame(edgeData, schema=edgesSchema)
print(edgeGraph.count())
edgeGraph.show()

# edgeGraph = spark.createDataFrame(
#     pd.DataFrame({
#     "source":['a','b','b','c','d'],
#     "dest":['b','c','d','a','b']
#     })
# )
# edgeGraph.show()

1557
+--------------------+--------------------+
|              source|                dest|
+--------------------+--------------------+
|https://10wontips...|https://www.blogg...|
|https://dribbble.com|  http://twitter.com|
|https://dribbble.com| http://facebook.com|
|https://dribbble.com|http://instagram.com|
|https://dribbble.com|http://www.pinter...|
|  https://1917.co.nz|https://www.faceb...|
|  https://1917.co.nz|https://www.insta...|
|https://123movies...|https://123movies...|
|   https://1miba.com|https://guanliqi....|
|   https://1miba.com|https://vr.zhiouw...|
|   https://1miba.com|https://zhichan.z...|
|   https://1miba.com| https://wpa1.qq.com|
|https://theguardi...|https://www.wordi...|
|   https://12edit.ru|    https://youtu.be|
|   https://12edit.ru|https://www.youtu...|
|https://sportskee...|https://play.goog...|
|   https://2067.info|https://www.plesk...|
|   https://2067.info|https://docs.ples...|
|   https://2067.info|https://support.p...|
|   https://2067.info|https

In [91]:
initial_rank = 1.0 / vertexGraph.count()
ranks = vertexGraph.withColumn("rank", lit(initial_rank)).withColumnRenamed("vertices", "page")
print(ranks.count())
ranks.show()

1085
+--------------------+--------------------+
|                page|                rank|
+--------------------+--------------------+
|https://wordpress...|9.216589861751152E-4|
|https://123movies...|9.216589861751152E-4|
|https://10wontips...|9.216589861751152E-4|
|    https://18jp.fun|9.216589861751152E-4|
| https://hinkhoj.com|9.216589861751152E-4|
|https://dribbble.com|9.216589861751152E-4|
|  https://1917.co.nz|9.216589861751152E-4|
|https://123movies...|9.216589861751152E-4|
|    https://1push.io|9.216589861751152E-4|
|   https://1miba.com|9.216589861751152E-4|
|https://012mobile...|9.216589861751152E-4|
|https://123movies...|9.216589861751152E-4|
|   https://issuu.com|9.216589861751152E-4|
| https://1campus.net|9.216589861751152E-4|
|https://0xprocess...|9.216589861751152E-4|
|     https://1337.no|9.216589861751152E-4|
|https://123clickz...|9.216589861751152E-4|
|https://144tehran...|9.216589861751152E-4|
|  https://113366.com|9.216589861751152E-4|
|     https://1024.no|9.216

In [92]:
ranks.filter(ranks.page == "https://28chan.org").show()

+------------------+--------------------+
|              page|                rank|
+------------------+--------------------+
|https://28chan.org|9.216589861751152E-4|
+------------------+--------------------+



In [94]:
print(ranks.count())
ranks.show()

1085
+--------------------+--------------------+
|                page|                rank|
+--------------------+--------------------+
|https://wordpress...|9.216589861751152E-4|
|https://123movies...|9.216589861751152E-4|
|https://10wontips...|9.216589861751152E-4|
|    https://18jp.fun|9.216589861751152E-4|
| https://hinkhoj.com|9.216589861751152E-4|
|https://dribbble.com|9.216589861751152E-4|
|  https://1917.co.nz|9.216589861751152E-4|
|https://123movies...|9.216589861751152E-4|
|    https://1push.io|9.216589861751152E-4|
|   https://1miba.com|9.216589861751152E-4|
|https://012mobile...|9.216589861751152E-4|
|https://123movies...|9.216589861751152E-4|
|   https://issuu.com|9.216589861751152E-4|
| https://1campus.net|9.216589861751152E-4|
|https://0xprocess...|9.216589861751152E-4|
|     https://1337.no|9.216589861751152E-4|
|https://123clickz...|9.216589861751152E-4|
|https://144tehran...|9.216589861751152E-4|
|  https://113366.com|9.216589861751152E-4|
|     https://1024.no|9.216

In [95]:
cont = edgeGraph.join(ranks, [(edgeGraph.source == ranks.page) | (edgeGraph.dest == ranks.page)] ).drop("page")
cont.show()

+--------------------+--------------------+--------------------+
|              source|                dest|                rank|
+--------------------+--------------------+--------------------+
|https://wordpress...|   https://wpvip.com|9.216589861751152E-4|
|https://wordpress...|https://apps.word...|9.216589861751152E-4|
|https://wordpress...|https://www.blueh...|9.216589861751152E-4|
|https://10wontips...|https://www.blogg...|9.216589861751152E-4|
|https://dribbble.com|  http://twitter.com|9.216589861751152E-4|
|https://dribbble.com| http://facebook.com|9.216589861751152E-4|
|https://dribbble.com|http://instagram.com|9.216589861751152E-4|
|https://dribbble.com|http://www.pinter...|9.216589861751152E-4|
|  https://1917.co.nz|https://www.faceb...|9.216589861751152E-4|
|  https://1917.co.nz|https://www.insta...|9.216589861751152E-4|
|https://123movies...|https://123movies...|9.216589861751152E-4|
|   https://1miba.com|https://guanliqi....|9.216589861751152E-4|
|   https://1miba.com|htt

In [96]:
counts = spark.createDataFrame(cont.groupBy("source").count().collect())
cont = cont.join(counts, ["source"])
cont.show()



+--------------------+--------------------+--------------------+-----+
|              source|                dest|                rank|count|
+--------------------+--------------------+--------------------+-----+
| https://state.mn.us|https://www.sos.s...|9.216589861751152E-4|    4|
| https://state.mn.us|https://www.reven...|9.216589861751152E-4|    4|
| https://state.mn.us|https://drive.mn.gov|9.216589861751152E-4|    4|
| https://state.mn.us|      https://mn.gov|9.216589861751152E-4|    4|
|    https://1728.org|https://www.nist.gov|9.216589861751152E-4|    7|
|    https://1728.org| http://www.nist.gov|9.216589861751152E-4|    7|
|    https://1728.org|http://www.timean...|9.216589861751152E-4|    7|
|    https://1728.org|https://www.faceb...|9.216589861751152E-4|    7|
|    https://1728.org|https://www.twitt...|9.216589861751152E-4|    7|
|    https://1728.org|https://www.freef...|9.216589861751152E-4|    7|
|    https://1728.org|http://www.standu...|9.216589861751152E-4|    7|
| http

                                                                                

In [97]:
cont = cont.withColumn("Conts", col("rank")/col("count"))
cont.show()



+--------------------+--------------------+--------------------+-----+--------------------+
|              source|                dest|                rank|count|               Conts|
+--------------------+--------------------+--------------------+-----+--------------------+
| https://state.mn.us|https://www.sos.s...|9.216589861751152E-4|    4|2.304147465437788E-4|
| https://state.mn.us|https://www.reven...|9.216589861751152E-4|    4|2.304147465437788E-4|
| https://state.mn.us|https://drive.mn.gov|9.216589861751152E-4|    4|2.304147465437788E-4|
| https://state.mn.us|      https://mn.gov|9.216589861751152E-4|    4|2.304147465437788E-4|
|    https://1728.org|https://www.nist.gov|9.216589861751152E-4|    7|1.316655694535878...|
|    https://1728.org| http://www.nist.gov|9.216589861751152E-4|    7|1.316655694535878...|
|    https://1728.org|http://www.timean...|9.216589861751152E-4|    7|1.316655694535878...|
|    https://1728.org|https://www.faceb...|9.216589861751152E-4|    7|1.31665569

                                                                                

In [98]:
contA = cont.groupBy("dest").agg(spark_sum("Conts").alias("sumCont"))
contA.show()



+--------------------+--------------------+
|                dest|             sumCont|
+--------------------+--------------------+
|https://fandom.ze...|2.633311389071757...|
|https://www.lineb...|2.710761724044456...|
|https://www.pinte...|6.583278472679394E-5|
|https://xmljatsre...|3.072196620583717...|
|https://www.netvi...|4.608294930875576E-4|
|https://scienceso...|3.686635944700461E-5|
| https://mochajs.org| 5.76036866359447E-5|
| http://facebook.com|3.328213005632360...|
|https://bigzip.11...|1.152073732718894E-4|
|https://api.stack...|2.880184331797235E-5|
|     https://qiwa.sa|2.633311389071757...|
|https://unabridge...|1.536098310291858...|
|https://stu.17zwd...|1.024065540194572...|
|http://www.histor...|1.335737661123355...|
|http://www.kavame...|9.216589861751152E-4|
|https://www.nxt.s...|9.216589861751152E-5|
|https://help.gymg...|1.024065540194572...|
|https://saudiares...|1.843317972350230...|
|https://desktop.l...|2.710761724044456...|
|http://www.uclh.n...|6.98226504

                                                                                

In [99]:
ranks = contA.withColumn("rank", lit(0.2/4)+0.8*col("sumCont")).select(col("dest").alias("page"), "rank")
ranks.show()

                                                                                

+--------------------+--------------------+
|                page|                rank|
+--------------------+--------------------+
|https://fandom.ze...| 0.05002106649111258|
|https://www.lineb...| 0.05002168609379236|
|https://www.pinte...|0.050052666227781435|
|https://xmljatsre...|  0.0502457757296467|
|https://www.netvi...| 0.05036866359447005|
|https://scienceso...| 0.05002949308755761|
| https://mochajs.org|0.050046082949308755|
| http://facebook.com| 0.05026625704045059|
|https://bigzip.11...|0.050092165898617515|
|https://api.stack...| 0.05002304147465438|
|     https://qiwa.sa| 0.05002106649111258|
|https://unabridge...| 0.05012288786482335|
|https://stu.17zwd...| 0.05008192524321557|
|http://www.histor...| 0.05001068590128899|
|http://www.kavame...|  0.0507373271889401|
|https://www.nxt.s...| 0.05007373271889401|
|https://help.gymg...| 0.05008192524321557|
|https://saudiares...| 0.05014746543778802|
|https://desktop.l...| 0.05002168609379236|
|http://www.uclh.n...| 0.0500055