In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *

In [2]:
# Obtain the notebook's Spark session (application name "test") via the
# builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

In [3]:
# Player lookup table: one [player_id, player_name] row per player.
player_columns = ["player_id", "player_name"]
data = [[1, "Nadal"], [2, "Federer"], [3, "Novak"]]

players_df = spark.createDataFrame(data, schema=player_columns)
players_df.show()

+---------+-----------+
|player_id|player_name|
+---------+-----------+
|        1|      Nadal|
|        2|    Federer|
|        3|      Novak|
+---------+-----------+



In [4]:
# One row per year. Each tournament column stores an id that is matched
# against players_df.player_id in the final join of this notebook.
championship_columns = ["year", "wimbledon", "fr_open", "us_open", "au_open"]
data = [
    [2017, 2, 1, 1, 2],
    [2018, 3, 1, 3, 2],
    [2019, 3, 1, 1, 3],
]

championship_df = spark.createDataFrame(data, schema=championship_columns)
championship_df.show()

+----+---------+-------+-------+-------+
|year|wimbledon|fr_open|us_open|au_open|
+----+---------+-------+-------+-------+
|2017|        2|      1|      1|      2|
|2018|        3|      1|      3|      2|
|2019|        3|      1|      1|      3|
+----+---------+-------+-------+-------+



In [5]:
# Flatten the per-tournament winner columns into a single "id" column with
# one row per (year, tournament) result.
#
# Every column of championship_df other than "year" is treated as a
# tournament, so adding a tournament column needs no change here.
tournament_columns = [name for name in championship_df.columns if name != "year"]

# Alias each column to "id" *before* the union: unionAll matches columns by
# position and takes names from the first frame, so renaming up front avoids
# depending on which tournament happens to be listed first.
df = None
for name in tournament_columns:
    winners = championship_df.select(championship_df[name].alias("id"))
    df = winners if df is None else df.unionAll(winners)

df.show()

+---+
| id|
+---+
|  2|
|  3|
|  3|
|  1|
|  1|
|  1|
|  1|
|  3|
|  1|
|  2|
|  2|
|  3|
+---+



In [6]:
# Count tournament wins per player.
#
# The join is an inner join (the default), so only players whose player_id
# appears at least once in df.id are kept; players with no wins are absent
# from the result rather than listed with wins = 0.
res_df = (
    players_df
    .join(df, players_df.player_id == df.id)
    .groupBy("player_id", "player_name")
    # groupBy(...).agg(...) already yields player_id, player_name, wins in
    # that order, so no trailing select is needed.
    .agg(count("player_id").alias("wins"))
    # Row order after an aggregation is not guaranteed; sort so the
    # displayed table is stable across runs.
    .orderBy("player_id")
)
res_df.show()

+---------+-----------+----+
|player_id|player_name|wins|
+---------+-----------+----+
|        1|      Nadal|   5|
|        2|    Federer|   3|
|        3|      Novak|   4|
+---------+-----------+----+

