In [None]:
!pip install pyspark

In [1]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Web Content Indexing")
    .getOrCreate()
)

# Load the JSON input file; no explicit schema is supplied here.
json_df = spark.read.format("json").load("part-r-00000")

# Preview the loaded records.
json_df.show()

+--------------------+--------------------+--------------------+-----------------+-----------------+--------------------+------------+--------------------+----------+-------------+--------------------+
|           fetchTime|            metadata|        modifiedTime|retriesSinceFetch|retryIntervalDays|retryIntervalSeconds|       score|           signature|statusCode|   statusName|                 url|
+--------------------+--------------------+--------------------+-----------------+-----------------+--------------------+------------+--------------------+----------+-------------+--------------------+
|Mon Nov 13 21:20:...|{NULL, NULL, NULL...|Thu Jan 01 07:30:...|                0|               30|             2592000|4.0064103E-4|                null|         1| db_unfetched|  http://www.3u.com/|
|Wed Dec 13 21:00:...|{text/html, temp_...|Mon Nov 13 21:00:...|                0|               30|             2592000| 0.106155306|                null|         4|db_redir_temp|   https://a

In [2]:
from pyspark.ml.feature import Tokenizer
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

# Clean the raw records: remove exact duplicate rows, then rows containing any null.
# Each stage gets its own name (json_df is left untouched) so this cell can be
# re-run safely; reassigning json_df made a second run fail because the
# "keywords" output column already existed on the input frame.
cleaned_json_df = json_df.dropDuplicates().na.drop()

# NOTE(review): Tokenizer lowercases and splits on whitespace, so a URL without
# spaces stays a single token (see the sample output further down). Consider
# RegexTokenizer if per-word keywords are intended — confirm the desired behaviour.
tokenizer = Tokenizer(inputCol="url", outputCol="keywords")
tokenized_json_df = tokenizer.transform(cleaned_json_df)

# Rank all pages globally by descending score; tied scores share the same rank.
# No partitionBy is used, so the window spans the whole dataset.
windowSpec = Window.orderBy(col("score").desc())
ranked_json_df = tokenized_json_df.withColumn("rank", rank().over(windowSpec))

# Keep only the index columns and persist them. mode("overwrite") replaces any
# earlier result, so re-running the notebook no longer fails because the
# "output" path already exists.
indexed_json_df = ranked_json_df.select("url", "keywords", "rank")
indexed_json_df.write.mode("overwrite").parquet("output")



In [3]:
# Read the whole "output" dataset instead of one part file: part-file names
# contain a run-specific ID, so a hard-coded name breaks as soon as the data is
# rewritten, and a single file may not hold every row.
parquet_df = spark.read.parquet("output")

# Row order is not guaranteed when reading Parquet, so sort explicitly for display.
parquet_df.orderBy("rank").show()

+--------------------+--------------------+----+
|                 url|            keywords|rank|
+--------------------+--------------------+----+
|   https://amanz.my/| [https://amanz.my/]|   1|
|https://techlagi....|[https://techlagi...|   2|
|https://fonts.goo...|[https://fonts.go...|   3|
|https://techlagi....|[https://techlagi...|   4|
|   https://amanz.me/| [https://amanz.me/]|   5|
|https://www.googl...|[https://www.goog...|   6|
|https://techlagi....|[https://techlagi...|   7|
|https://techlagi....|[https://techlagi...|   8|
|https://techlagi.my/|[https://techlagi...|   9|
|https://techlagi....|[https://techlagi...|  10|
|https://techlagi....|[https://techlagi...|  11|
|https://amanz.my/...|[https://amanz.my...|  12|
|https://stats.wp....|[https://stats.wp...|  12|
|https://amanz.my/...|[https://amanz.my...|  12|
|https://amanz.my/...|[https://amanz.my...|  12|
|https://fonts.gst...|[https://fonts.gs...|  12|
|https://static.cl...|[https://static.c...|  12|
|https://amanz.my/..

In [4]:
# Shut down the Spark session created at the top of the notebook now that the
# index has been written and inspected.
spark.stop()