In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, sum as sum_

# Obtain the Spark session for this job (application name "Requests").
spark = (
    SparkSession.builder
    .appName("Requests")
    .getOrCreate()
)

# 0. Load the web-server log CSV into a DataFrame; the first row is taken as
# the header and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("web_server_logs.csv")
)

# 1. Show the 10 most active IP addresses.
# GroupedData.count() is used rather than agg(count("*")) because `count` is
# not among the names imported from pyspark.sql.functions at the top of this
# script, so the function call would fail with NameError when the script is
# run on its own. The built-in aggregate names its column "count", so it is
# renamed to keep the "request_count" column.
print("Top 10 active IP addresses:")
top10_df = (
    df.groupBy(col("ip"))
    .count()
    .withColumnRenamed("count", "request_count")
    .orderBy(col("request_count"), ascending=False)
    .limit(10)
)
top10_df.show()

# 2. Count the number of requests for each HTTP method.
# GroupedData.count() is used rather than agg(count("*")) because `count` is
# not among the names imported from pyspark.sql.functions at the top of this
# script, so the function call would fail with NameError when the script is
# run on its own. The resulting "count" column is renamed to keep the
# "method_count" column.
print("Request count by HTTP method:")
methods_df = (
    df.groupBy(col("method"))
    .count()
    .withColumnRenamed("count", "method_count")
)
methods_df.show()

# 3. Count the requests whose response code is 404.
is_not_found = col("response_code") == 404
numbers_of_404_response = df.where(is_not_found).count()
print(f'Number of 404 response codes: {numbers_of_404_response}')

# 4. Group by calendar date, sum the response sizes per day, and sort by date.
print("Total response size by day:")
request_date = to_date(col("timestamp")).alias("date")
daily_total = sum_(col("response_size")).alias("total_response_size")
response_size_df = (
    df.groupBy(request_date)
    .agg(daily_total)
    .sort("date")
)
response_size_df.show()


Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
|170.114.216.117|            2|
|145.213.176.157|            2|
|  93.212.31.127|            2|
|   14.105.0.140|            1|
|    80.55.130.9|            1|
|  3.189.118.222|            1|
|   74.184.95.68|            1|
|  162.188.76.17|            1|
|  136.15.103.52|            1|
|  14.220.33.207|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       25150|
|DELETE|       24669|
|   PUT|       25048|
|   GET|       25133|
+------+------------+

Number of 404 response codes: 24965
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2024-01-01|            2039630|
|2024-01-02|            2216806|
|2024-01-03|            2201143|
|2024-01-04|            2236519|
|2024-01-05|         