As Window Functions operam sobre um grupo de linhas (a janela) e retornam um valor para cada linha de entrada. O PySpark oferece suporte a 3 tipos de Window Functions:

- Ranking functions
- Analytic functions
- Aggregate functions

Documentação:

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [0]:
# Sample employee rows; each tuple follows the column order listed in `schema`.
dados = [
    ("Anderson", "Vendas", "SP", 1500.0, 34, 1000.0),
    ("Kennedy", "Vendas", "CE", 1200.0, 56, 2000.0),
    ("Bruno", "Vendas", "SP", 1100.0, 30, 2300.0),
    ("Maria", "Financas", "CE", 3600.0, 24, 2300.0),
    ("Eduardo", "Financas", "CE", 4500.0, 40, 2400.0),
    ("Maria Eduarda", "Financas", "CE", 4500.0, 40, 2400.0),
    ("Mendes", "Financas", "RS", 8000.0, 36, 1900.0),
    ("Kethlyn", "Financas", "RS", 1200.0, 53, 1500.0),
    ("Thiago", "Marketing", "GO", 1100.0, 25, 1800.0),
    ("Carla", "Marketing", "GO", 2600.0, 50, 2100.0),
]

# Only column names are given here; no column types are declared.
schema = ["nome", "departamento", "estado", "salario", "idade", "bonus"]

# NOTE(review): `spark` is not created in this notebook; presumably it is the
# session object injected by the notebook runtime — confirm.
df = spark.createDataFrame(dados, schema)

# Show the resulting schema, then the rows themselves.
df.printSchema()
df.display()

nome,departamento,estado,salario,idade,bonus
Anderson,Vendas,SP,1500.0,34,1000.0
Kennedy,Vendas,CE,1200.0,56,2000.0
Bruno,Vendas,SP,1100.0,30,2300.0
Maria,Financas,CE,3600.0,24,2300.0
Eduardo,Financas,CE,4500.0,40,2400.0
Maria Eduarda,Financas,CE,4500.0,40,2400.0
Mendes,Financas,RS,8000.0,36,1900.0
Kethlyn,Financas,RS,1200.0,53,1500.0
Thiago,Marketing,GO,1100.0,25,1800.0
Carla,Marketing,GO,2600.0,50,2100.0


In [0]:
# Window specification reused by the cells below: rows are grouped by
# "departamento" and sorted by "salario" inside each group.
w0 = Window.partitionBy("departamento").orderBy(F.col("salario"))

# row_number() returns the position of each row inside its partition;
# withColumn adds that value to the DataFrame as a new column.
(
    df
    .withColumn("row_number", F.row_number().over(w0))
    .display()
)

nome,departamento,estado,salario,idade,bonus,row_number
Kethlyn,Financas,RS,1200.0,53,1500.0,1
Maria,Financas,CE,3600.0,24,2300.0,2
Eduardo,Financas,CE,4500.0,40,2400.0,3
Maria Eduarda,Financas,CE,4500.0,40,2400.0,4
Mendes,Financas,RS,8000.0,36,1900.0,5
Thiago,Marketing,GO,1100.0,25,1800.0,1
Carla,Marketing,GO,2600.0,50,2100.0,2
Bruno,Vendas,SP,1100.0,30,2300.0,1
Kennedy,Vendas,CE,1200.0,56,2000.0,2
Anderson,Vendas,SP,1500.0,34,1000.0,3


In [0]:
# rank() over w0: rows tied on "salario" share the same rank, and the rank
# that follows a tie is skipped (3, 3, 5 in the output below).
(
    df
    .withColumn("rank", F.rank().over(w0))
    .display()
)

nome,departamento,estado,salario,idade,bonus,rank
Kethlyn,Financas,RS,1200.0,53,1500.0,1
Maria,Financas,CE,3600.0,24,2300.0,2
Eduardo,Financas,CE,4500.0,40,2400.0,3
Maria Eduarda,Financas,CE,4500.0,40,2400.0,3
Mendes,Financas,RS,8000.0,36,1900.0,5
Thiago,Marketing,GO,1100.0,25,1800.0,1
Carla,Marketing,GO,2600.0,50,2100.0,2
Bruno,Vendas,SP,1100.0,30,2300.0,1
Kennedy,Vendas,CE,1200.0,56,2000.0,2
Anderson,Vendas,SP,1500.0,34,1000.0,3


In [0]:
# dense_rank() over w0: tied rows share a rank and no rank is skipped
# afterwards (3, 3, 4 in the output below).
(
    df
    .withColumn("dense_rank", F.dense_rank().over(w0))
    .display()
)

nome,departamento,estado,salario,idade,bonus,dense_rank
Kethlyn,Financas,RS,1200.0,53,1500.0,1
Maria,Financas,CE,3600.0,24,2300.0,2
Eduardo,Financas,CE,4500.0,40,2400.0,3
Maria Eduarda,Financas,CE,4500.0,40,2400.0,3
Mendes,Financas,RS,8000.0,36,1900.0,4
Thiago,Marketing,GO,1100.0,25,1800.0,1
Carla,Marketing,GO,2600.0,50,2100.0,2
Bruno,Vendas,SP,1100.0,30,2300.0,1
Kennedy,Vendas,CE,1200.0,56,2000.0,2
Anderson,Vendas,SP,1500.0,34,1000.0,3


In [0]:
# percent_rank() over w0: relative rank of each row inside its partition,
# ranging from 0.0 for the first row to 1.0 for the last in the output below.
(
    df
    .withColumn("percent_rank", F.percent_rank().over(w0))
    .display()
)

nome,departamento,estado,salario,idade,bonus,percent_rank
Kethlyn,Financas,RS,1200.0,53,1500.0,0.0
Maria,Financas,CE,3600.0,24,2300.0,0.25
Eduardo,Financas,CE,4500.0,40,2400.0,0.5
Maria Eduarda,Financas,CE,4500.0,40,2400.0,0.5
Mendes,Financas,RS,8000.0,36,1900.0,1.0
Thiago,Marketing,GO,1100.0,25,1800.0,0.0
Carla,Marketing,GO,2600.0,50,2100.0,1.0
Bruno,Vendas,SP,1100.0,30,2300.0,0.0
Kennedy,Vendas,CE,1200.0,56,2000.0,0.5
Anderson,Vendas,SP,1500.0,34,1000.0,1.0


In [0]:
# lag("salario", 2) over w0: the "salario" found two rows earlier in the same
# partition; the first two rows of each partition have no such row and stay empty.
(
    df
    .withColumn("lag", F.lag("salario", 2).over(w0))
    .display()
)

nome,departamento,estado,salario,idade,bonus,lag
Kethlyn,Financas,RS,1200.0,53,1500.0,
Maria,Financas,CE,3600.0,24,2300.0,
Eduardo,Financas,CE,4500.0,40,2400.0,1200.0
Maria Eduarda,Financas,CE,4500.0,40,2400.0,3600.0
Mendes,Financas,RS,8000.0,36,1900.0,4500.0
Thiago,Marketing,GO,1100.0,25,1800.0,
Carla,Marketing,GO,2600.0,50,2100.0,
Bruno,Vendas,SP,1100.0,30,2300.0,
Kennedy,Vendas,CE,1200.0,56,2000.0,
Anderson,Vendas,SP,1500.0,34,1000.0,1100.0


In [0]:
  # lead("salario", 2) over w0: the "salario" found two rows later in the same
  # partition; the last two rows of each partition have no such row and stay empty.
  # NOTE(review): the two-space indent of this cell is kept as in the original;
  # consider removing it, since plain Python rejects an indented top-level statement.
  (
      df
      .withColumn("lead", F.lead("salario", 2).over(w0))
      .display()
  )

nome,departamento,estado,salario,idade,bonus,lead
Kethlyn,Financas,RS,1200.0,53,1500.0,4500.0
Maria,Financas,CE,3600.0,24,2300.0,4500.0
Eduardo,Financas,CE,4500.0,40,2400.0,8000.0
Maria Eduarda,Financas,CE,4500.0,40,2400.0,
Mendes,Financas,RS,8000.0,36,1900.0,
Thiago,Marketing,GO,1100.0,25,1800.0,
Carla,Marketing,GO,2600.0,50,2100.0,
Bruno,Vendas,SP,1100.0,30,2300.0,1500.0
Kennedy,Vendas,CE,1200.0,56,2000.0,
Anderson,Vendas,SP,1500.0,34,1000.0,


In [0]:
# Row number plus avg / sum / min / max of "salario", all evaluated over w0.
# As the output below shows, the aggregates accumulate row by row within each
# departamento, and rows tied on "salario" receive the same values.
(
    df
    .select(
        F.row_number().over(w0).alias("row"),
        "departamento",
        F.avg("salario").over(w0).alias("avg"),
        F.sum("salario").over(w0).alias("sum"),
        F.min("salario").over(w0).alias("min"),
        F.max("salario").over(w0).alias("max"),
    )
    .display()
)

row,departamento,avg,sum,min,max
1,Financas,1200.0,1200.0,1200.0,1200.0
2,Financas,2400.0,4800.0,1200.0,3600.0
3,Financas,3450.0,13800.0,1200.0,4500.0
4,Financas,3450.0,13800.0,1200.0,4500.0
5,Financas,4360.0,21800.0,1200.0,8000.0
1,Marketing,1100.0,1100.0,1100.0,1100.0
2,Marketing,1850.0,3700.0,1100.0,2600.0
1,Vendas,1100.0,1100.0,1100.0,1100.0
2,Vendas,1150.0,2300.0,1100.0,1200.0
3,Vendas,1266.6666666666667,3800.0,1100.0,1500.0
