In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_set, explode, lower, sort_array, split

# Build an inverted index (word -> ids of the documents containing it)
# over a small in-memory corpus using the Spark DataFrame API.
spark = SparkSession.builder.appName("InvertedIndex").getOrCreate()

# Sample corpus: (doc_id, text) pairs.
documents = [
    (1, "To be, or not to be, that is the question."),
    (2, "All the world's a stage, and all the men and women merely players."),
    (3, "The fault, dear Brutus, is not in our stars, but in ourselves, that we are underlings."),
]

df = spark.createDataFrame(documents, ["doc_id", "text"])

# Tokenize: lowercase the text, split on runs of non-word characters, and
# emit one row per token. The split can yield empty strings (e.g. after
# trailing punctuation), so those are dropped.
words_df = (
    df.withColumn("word", explode(split(lower(col("text")), r"\W+")))
    .filter(col("word") != "")
)

# Group by token and gather the distinct document ids. collect_set makes no
# ordering guarantee (it depends on how rows arrive after the shuffle), so
# sort_array is applied to keep each posting list in stable ascending order.
inverted_index_df = words_df.groupBy("word").agg(
    sort_array(collect_set("doc_id")).alias("doc_ids")
)

# Prints the first 20 rows (the default limit of show()).
inverted_index_df.show()


+---------+---------+
|     word|  doc_ids|
+---------+---------+
|      not|   [1, 3]|
|       be|      [1]|
|       is|   [1, 3]|
|      the|[1, 2, 3]|
| question|      [1]|
|     that|   [1, 3]|
|       or|      [1]|
|       to|      [1]|
|      men|      [2]|
|    stars|      [3]|
|       in|      [3]|
|   merely|      [2]|
|      but|      [3]|
|  players|      [2]|
|      our|      [3]|
|    women|      [2]|
|ourselves|      [3]|
|     dear|      [3]|
|      and|      [2]|
|      are|      [3]|
+---------+---------+
only showing top 20 rows

