In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import format_number
from pyspark.sql.functions import col, lit, when, udf, avg, mean, sum, max, min, count, countDistinct, desc, asc, round
from pyspark.sql.types import StringType
from pyspark.sql.functions import first, last


In [2]:

# Build (or reuse) a single-threaded local SparkSession for the examples below.
# The executor memory property must be spelled "spark.executor.memory": the
# builder accepts any key, so a misspelled one is stored but has no effect.
spark = (
    SparkSession.builder
    .master("local")
    .appName("dataframe_app")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

# Expose the underlying SparkContext and display it as the cell's output.
sc = spark.sparkContext
sc

## 1. Window Functions

## 2. PySpark Window Ranking functions

In [3]:
# Sample rows: one (language, framework, users) tuple per framework.
data = [
    ("Python", "Django", 20000),
    ("Python", "FastAPI", 20000),
    ("JavaScript", "AngularJS", 5000),
    ("JavaScript", "ReactJS", 7000),
    ("Python", "Flask", 9000),
]
column_names = ["language", "framework", "users"]

# Column types are inferred from the tuples; only the names are supplied.
df = spark.createDataFrame(data, schema=column_names)
df.show()

+----------+---------+-----+
|  language|framework|users|
+----------+---------+-----+
|    Python|   Django|20000|
|    Python|  FastAPI|20000|
|JavaScript|AngularJS| 5000|
|JavaScript|  ReactJS| 7000|
|    Python|    Flask| 9000|
+----------+---------+-----+



### Define Window

In [None]:
from pyspark.sql.window import Window
import pyspark.sql.functions as F

# Window used by the ranking examples below: rows are grouped per language
# and ordered inside each group from the highest to the lowest user count.
window_spec = (
    Window
    .partitionBy("language")
    .orderBy(F.col("users").desc())
)

### Window Function row_number()

In [7]:
# row_number(): consecutive 1, 2, 3, ... inside each partition; ties still
# receive distinct numbers.
df_new = df.select(
    "*",
    F.row_number().over(window_spec).alias("row_number"),
)
df_new.show()

+----------+---------+-----+----------+
|  language|framework|users|row_number|
+----------+---------+-----+----------+
|JavaScript|  ReactJS| 7000|         1|
|JavaScript|AngularJS| 5000|         2|
|    Python|   Django|20000|         1|
|    Python|  FastAPI|20000|         2|
|    Python|    Flask| 9000|         3|
+----------+---------+-----+----------+



### Window Function rank()

In [12]:
# rank(): tied rows share a rank and the following rank is skipped
# (1, 1, 3 for the Python partition).
df_new = df.withColumn(
    colName="rank",
    col=F.rank().over(window_spec),
)
df_new.show()

+----------+---------+-----+----+
|  language|framework|users|rank|
+----------+---------+-----+----+
|JavaScript|  ReactJS| 7000|   1|
|JavaScript|AngularJS| 5000|   2|
|    Python|   Django|20000|   1|
|    Python|  FastAPI|20000|   1|
|    Python|    Flask| 9000|   3|
+----------+---------+-----+----+



### Window Function dense_rank()

In [13]:
# dense_rank(): tied rows share a rank and no rank is skipped afterwards
# (1, 1, 2 for the Python partition).
# NOTE: the result column is labelled "rank" even though it holds
# dense_rank values.
df_new = df.select(
    "*",
    F.dense_rank().over(window_spec).alias("rank"),
)
df_new.show()

+----------+---------+-----+----+
|  language|framework|users|rank|
+----------+---------+-----+----+
|JavaScript|  ReactJS| 7000|   1|
|JavaScript|AngularJS| 5000|   2|
|    Python|   Django|20000|   1|
|    Python|  FastAPI|20000|   1|
|    Python|    Flask| 9000|   2|
+----------+---------+-----+----+



### percent_rank Window Function

In [14]:
# percent_rank(): relative rank of each row within its partition, from 0.0
# for the first row(s) up to 1.0 for the last.
df_new = df.withColumn(
    colName="percent_rank",
    col=F.percent_rank().over(window_spec),
)
df_new.show()

+----------+---------+-----+------------+
|  language|framework|users|percent_rank|
+----------+---------+-----+------------+
|JavaScript|  ReactJS| 7000|         0.0|
|JavaScript|AngularJS| 5000|         1.0|
|    Python|   Django|20000|         0.0|
|    Python|  FastAPI|20000|         0.0|
|    Python|    Flask| 9000|         1.0|
+----------+---------+-----+------------+



### ntile Window Function

In [15]:
# ntile(4): assign each row of a partition to one of four ordered buckets.
# The partitions here have fewer than four rows, so each row gets its own bucket.
df_new = df.select(
    "*",
    F.ntile(4).over(window_spec).alias("ntile"),
)
df_new.show()

+----------+---------+-----+-----+
|  language|framework|users|ntile|
+----------+---------+-----+-----+
|JavaScript|  ReactJS| 7000|    1|
|JavaScript|AngularJS| 5000|    2|
|    Python|   Django|20000|    1|
|    Python|  FastAPI|20000|    2|
|    Python|    Flask| 9000|    3|
+----------+---------+-----+-----+



# Broadcast

## Create Broadcast Variable

In [None]:
# Languages to keep when filtering the DataFrame
languages = ["Python", "Java"]

# The SparkContext is reached through the SparkSession
sc = spark.sparkContext

# Wrap the list in a broadcast variable
broadcasted_languages = sc.broadcast(languages)

## Use Broadcast Variable

In [None]:
# Keep only rows whose language appears in the broadcast list.
# `.value` unwraps the broadcast back into the plain Python list, which
# `isin` then uses as its set of allowed values.
filtered_df = df.where(
    col("language").isin(broadcasted_languages.value)
)

# Display the rows that passed the filter
filtered_df.show()

+--------+---------+-----+
|language|framework|users|
+--------+---------+-----+
|  Python|   Django|20000|
|  Python|  FastAPI|20000|
|  Python|    Flask| 9000|
+--------+---------+-----+

