<a href="https://colab.research.google.com/github/tempiatine/stepik_hw/blob/main/pyspark_hw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# !pip install pyspark

In [26]:
from pyspark.sql import SparkSession
# NOTE: `sum` imported here is pyspark's aggregate function and shadows the
# Python builtin of the same name for the rest of the notebook.
from pyspark.sql.functions import col, count, desc, date_trunc, sum, to_date

# Create the SparkSession (or reuse one that is already running).
spark = (
    SparkSession.builder
    .appName("Read CSV logs")
    .getOrCreate()
)

# Read the CSV file: the first row is treated as the header and column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/sample_data/web_server_logs.csv")
)

# Top 10 IP addresses with the largest number of requests.
ip_requests = (
    df.groupBy("ip")
    .agg(count("*").alias("request_count"))
    .orderBy(desc("request_count"))
    .limit(10)
)

# Number of requests per HTTP method.
http_requests = df.groupBy("method").agg(count("*").alias("method_count"))

# Number of requests with response code 404.
# DataFrame.count() returns 0 when no row matches; indexing into the
# collected result of a groupBy would raise IndexError on an empty result.
notFound_count = df.filter(col("response_code") == '404').count()

# Total response size for each calendar date, ordered by date.
# `sum` is pyspark.sql.functions.sum (imported at the top), not the builtin.
date_size = (
    df.groupBy(to_date(col("timestamp")).alias("date"))
    .agg(sum("response_size").alias("total_response_size"))
    .orderBy("date")
)

# Preview of the raw log rows (show() prints the first 20 by default).
df.show()

print('Top 10 active IP addresses:')
ip_requests.show()

print('Request count by HTTP method:')
http_requests.show()

print(f'Number of 404 response codes: {notFound_count}')

print('Total response size by date:')
date_size.show()

+---------------+--------------------+------+--------------------+-------------+-------------+
|             ip|           timestamp|method|                 url|response_code|response_size|
+---------------+--------------------+------+--------------------+-------------+-------------+
| 143.48.209.129|2024-01-04 22:20:...|DELETE|        posts/search|          500|         6316|
|    35.18.70.54|2024-01-29 15:09:...|   GET|        explore/blog|          301|         1472|
|153.253.235.106|2024-03-07 11:42:...|   GET|categories/search...|          301|          348|
|   31.92.178.11|2024-01-31 23:03:...|   PUT|           posts/tag|          301|          533|
|  91.198.130.78|2024-05-03 02:33:...|   PUT|                 app|          500|          834|
|  42.113.122.35|2024-08-04 17:12:...|   GET|categories/app/ca...|          200|         5907|
|    198.196.9.4|2024-06-28 19:36:...|DELETE|      tag/categories|          500|         9113|
|  14.207.175.45|2024-07-21 18:31:...|DELETE|     