In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *

In [2]:
# Obtain a Spark session for this notebook: an already-active session is
# reused, otherwise a new one is created under the application name "test".
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

In [3]:
# Sample call records, one tuple per call: (from_id, to_id, duration).
# The same pair of ids can appear in either direction (10 -> 20 and 20 -> 10),
# and the (30, 40, 200) record is listed twice.
call_columns = ["from_id", "to_id", "duration"]
data = [
    (10, 20, 50),
    (20, 10, 12),
    (10, 30, 20),
    (30, 40, 100),
    (30, 40, 200),
    (30, 40, 200),
    (40, 30, 500),
]

# Column types are inferred from the Python values; only the names are given.
df = spark.createDataFrame(data, schema=call_columns)
df.show()

+-------+-----+--------+
|from_id|to_id|duration|
+-------+-----+--------+
|     10|   20|      50|
|     20|   10|      12|
|     10|   30|      20|
|     30|   40|     100|
|     30|   40|     200|
|     30|   40|     200|
|     40|   30|     500|
+-------+-----+--------+



In [4]:
# Normalize every call to a direction-independent pair: person_1 receives
# from_id when from_id < to_id and to_id otherwise; person_2 receives the
# other id. Calls 10 -> 20 and 20 -> 10 therefore land on the same
# (person_1, person_2) key.
from_is_smaller = col("from_id") < col("to_id")

df1 = df.select(
    when(from_is_smaller, col("from_id")).otherwise(col("to_id")).alias("person_1"),
    when(from_is_smaller, col("to_id")).otherwise(col("from_id")).alias("person_2"),
    col("duration"),
)
df1.show()

+--------+--------+--------+
|person_1|person_2|duration|
+--------+--------+--------+
|      10|      20|      50|
|      10|      20|      12|
|      10|      30|      20|
|      30|      40|     100|
|      30|      40|     200|
|      30|      40|     200|
|      30|      40|     500|
+--------+--------+--------+



In [5]:
df1.groupBy(col("person_1"), col("person_2")).agg(count("*"), sum(col("duration"))).show()

+--------+--------+--------+-------------+
|person_1|person_2|count(1)|sum(duration)|
+--------+--------+--------+-------------+
|      10|      20|       2|           62|
|      10|      30|       1|           20|
|      30|      40|       4|         1000|
+--------+--------+--------+-------------+



In [6]:
# Expose the raw calls DataFrame to Spark SQL as the temporary view "calls"
# (replacing any existing temp view with that name).
df.createOrReplaceTempView(name="calls")

In [7]:
# SQL equivalent of the DataFrame pipeline: the CTE adds person_1 / person_2
# (from_id and to_id ordered by the from_id < to_id test) to every row of the
# "calls" view, and the outer query counts calls and sums duration per pair.
query = """
WITH cte AS (
    SELECT
        *,
        CASE WHEN from_id < to_id THEN from_id ELSE to_id END person_1,
        CASE WHEN from_id < to_id THEN to_id ELSE from_id END person_2
    FROM calls
)
SELECT
    person_1,
    person_2,
    COUNT(*) AS call_count,
    SUM(duration) AS total_duration
FROM cte
GROUP BY person_1, person_2
"""

pair_totals = spark.sql(query)
pair_totals.show()

+--------+--------+----------+--------------+
|person_1|person_2|call_count|total_duration|
+--------+--------+----------+--------------+
|      10|      20|         2|            62|
|      10|      30|         1|            20|
|      30|      40|         4|          1000|
+--------+--------+----------+--------------+

