### Задание 1.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [None]:
# Инициализация SparkSession
spark = SparkSession.builder.appName("session_length").getOrCreate()

In [None]:
# Исходные данные
data = [
    ((1, 1562007679), (1, 1562007710), (1, 1562007720), (1, 1562007750)),
    ((2, 1564682430), (2, 1564682450), (2, 1564682480))
]

In [None]:
# Преобразование данных в список кортежей
rows = []
for row in data:
    rows.extend([(i[0], i[1]) for i in row])

In [None]:
# Создание DataFrame
columns = ["id", "timestamp"]
df = spark.createDataFrame(rows, columns)

In [None]:
# Преобразование timestamp в формат даты
df = df.withColumn("date", F.to_date(F.from_unixtime("timestamp")))

In [None]:
# Создание окна по id и дате
window_spec = Window.partitionBy("id", "date").orderBy("timestamp")

In [None]:
# Рассчитываем разницу между первым и последним действием в рамках сессии
session_length = F.last("timestamp").over(window_spec) - F.first("timestamp").over(window_spec)

In [None]:
# Получаем усредненную длину сессии для каждого id
result_df = df.withColumn("session_length", session_length) \
    .groupBy("id") \
    .agg(F.avg("session_length").alias("avg_session_length"))

result_df.show()

+---+------------------+
| id|avg_session_length|
+---+------------------+
|  1|             35.75|
|  2|23.333333333333332|
+---+------------------+



In [None]:
# Остановка SparkSession
spark.stop

### Задание 2.

In [159]:
from pyspark.sql.functions import sum as spark_sum, avg as spark_avg, lag, coalesce

# Инициализация SparkSession
spark = SparkSession.builder\
    .master("local[1]")\
    .appName("task_47")\
    .config("spark.executor.memory", "10g")\
    .config("spark.executor.cores", 5)\
    .config("spark.dynamicAllocation.enabled", "true")\
    .config("spark.dynamicAllocation.maxExecutors", 5)\
    .config("spark.shuffle.service.enabled", "true")\
    .getOrCreate()

In [160]:
# Данные для технических недель
week_str_p = (('1', '01.08—06.08'), ('2', '07.08—13.08'), ('3', '14.08—20.08'), ('4', '21.08—27.08'), ('5', '28.08—31.08'))
week_str_s = spark.createDataFrame(data=week_str_p, schema=['week', 'week_str'])

# Данные о среднедневном спросе
demand_p = (('1', '01', 100), ('1', '02', 110), ('2', '01', 120), ('2', '02', 90), ('3', '01', 70), ('3', '02', 80))
demand_s = spark.createDataFrame(data=demand_p, schema=['product', 'location', 'demand'])

# Данные о складских запасах
stock_p = (('1', '01', 1000), ('1', '02', 400), ('2', '01', 300), ('2', '02', 250))
stock_s = spark.createDataFrame(data=stock_p, schema=['product', 'location', 'stock'])

In [161]:
week_str_s.createOrReplaceTempView('week_str')
demand_s.createOrReplaceTempView('demand')
stock_s.createOrReplaceTempView('stock')

In [162]:
sql = """
    with tmp_1 as (
        select d.*, coalesce(s.stock, 0) as stock
        from demand d
        left join stock s using(product, location)
    ),

    tmp_2 as (
        select tmp_1.*, ws.week_str
        from tmp_1 cross join week_str ws
    ),

    tmp_3 as (
        select tmp_2.*,
            sum(demand) over (partition by product, location, week_str order by location) as total_demand
        from tmp_2
    ),

    tmp_4 as (
        select tmp_3.*,
            lag(stock - total_demand, 1, 0) over (partition by product, location order by week_str) as closing_stock
        from tmp_3
    ),

    tmp_5 as (
        select tmp_4.*,
            case
                when closing_stock > demand then demand
                when closing_stock > 0 and closing_stock <= demand then closing_stock
                else 0
            end as sales
        from tmp_4
    ),

    tmp_6 as (
        select tmp_5.*,
            min(closing_stock) over (partition by product, location, week_str) as stock_at_end
        from tmp_5
    )

    select
        tmp_6.week_str,
        tmp_6.product,
        tmp_6.location,
        sum(tmp_6.sales) as sales,
        avg(tmp_6.stock_at_end) as stock_at_end
    from tmp_6
    group by tmp_6.week_str, tmp_6.product, tmp_6.location
    order by 1, 2, 3
"""

# Выполнение запроса и вывод результатов
result_df = spark.sql(sql)
result_df.show(truncate=False)

+-----------+-------+--------+-----+------------+
|week_str   |product|location|sales|stock_at_end|
+-----------+-------+--------+-----+------------+
|01.08—06.08|1      |01      |0    |0.0         |
|01.08—06.08|1      |02      |0    |0.0         |
|01.08—06.08|2      |01      |0    |0.0         |
|01.08—06.08|2      |02      |0    |0.0         |
|01.08—06.08|3      |01      |0    |0.0         |
|01.08—06.08|3      |02      |0    |0.0         |
|07.08—13.08|1      |01      |100  |900.0       |
|07.08—13.08|1      |02      |110  |290.0       |
|07.08—13.08|2      |01      |120  |180.0       |
|07.08—13.08|2      |02      |90   |160.0       |
|07.08—13.08|3      |01      |0    |-70.0       |
|07.08—13.08|3      |02      |0    |-80.0       |
|14.08—20.08|1      |01      |100  |900.0       |
|14.08—20.08|1      |02      |110  |290.0       |
|14.08—20.08|2      |01      |120  |180.0       |
|14.08—20.08|2      |02      |90   |160.0       |
|14.08—20.08|3      |01      |0    |-70.0       |


In [None]:
# Остановка SparkSession
spark.stop