In [46]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, to_date, sum

spark = SparkSession.builder\
.appName('log_analysis')\
.getOrCreate()

df = spark.read.option('header', 'true').csv('/content/web_server_logs.csv')

first_task_p_1 = df.groupBy('ip')\
.agg(count('*').alias('requests_count'))
first_task = first_task_p_1.orderBy(col('requests_count').desc()).limit(10)
print('Top 10 active IP adresses:')
first_task.show()

second_task = df.groupBy('method')\
.agg(count('*').alias('method_count'))
print('Request count by HTTP method:')
second_task.show()

third_task_p_1 = df.filter(col('response_code') == 404)
third_task = third_task_p_1.count()
print(f'Number of 404 response codes: {third_task}')

fourth_task = df.withColumn('date', to_date('response_size'))
total_response_size = fourth_task.groupBy('date')\
.agg(sum('response_size').alias('total_response_size'))\
.orderBy('date')\
.limit(15)
print('Total response size by day:')
total_response_size.show()

spark.stop()

Top 10 active IP adresses:
+---------------+--------------+
|             ip|requests_count|
+---------------+--------------+
|172.213.173.234|             2|
|182.242.200.232|             1|
|135.132.106.211|             1|
|  20.175.54.229|             1|
|   94.61.241.41|             1|
|  95.202.182.60|             1|
|   28.52.106.76|             1|
|   74.89.158.98|             1|
| 216.195.222.29|             1|
|153.171.255.153|             1|
+---------------+--------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24918|
|DELETE|       25098|
|   PUT|       24910|
|   GET|       25074|
+------+------------+

Number of 404 response codes: 25206
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|      NULL|          4996306.0|
|1000-01-01|            12000.0|
|1001-01-01|            11011.0|
|1002-01-01|            15030.0|
|1003-01