In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=d466aeed90fb071d9ea0f125cd8e13b4e252250ed5f2769865edad585dcbf758
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, date_format, rand, expr
from pyspark.sql.functions import to_timestamp, dayofyear, date_format, sum, min, max, count, desc
from pyspark.sql.functions import to_date, col

# Initializing Spark session
spark = (
    SparkSession.builder
    .appName("Generate Semi-Structured Logs")
    .getOrCreate()
)

# Generating semi-structured log data
num_logs = 1000000

# Every rand() call is an independent draw. A CASE expression that calls rand()
# once per WHEN branch therefore does NOT produce the probabilities its
# thresholds suggest (thresholds 0.8 / 0.9 gave roughly 80% / 18% / 2% rather
# than 80% / 10% / 10%). Each multi-way categorical column below takes a single
# uniform draw stored in a helper column, buckets against that one value, and
# the helper columns are dropped before the data is written.
logs_df = (
    spark.range(0, num_logs)
    # NOTE: current_timestamp() yields one value per query, so every generated
    # row carries the same timestamp.
    .withColumn('timestamp', current_timestamp())
    # Four independent octets -> a random dotted-quad address.
    .withColumn('remote_host', expr("concat_ws('.', floor(rand() * 256), floor(rand() * 256), floor(rand() * 256), floor(rand() * 256))"))
    # One draw for the HTTP method: 25% each for GET / POST / PUT / DELETE.
    .withColumn('_method_draw', rand())
    .withColumn('request_method', expr("CASE WHEN _method_draw < 0.25 THEN 'GET' WHEN _method_draw < 0.5 THEN 'POST' WHEN _method_draw < 0.75 THEN 'PUT' ELSE 'DELETE' END"))
    # A 10-character window starting at a random offset (1..25) of the alphabet string.
    .withColumn('request_endpoint', expr("concat('/api/', substring('abcdefghijklmnopqrstuvwxyz0123456789', floor(rand() * 25) + 1, 10))"))
    # Two-way choice: a single rand() call is already correct here.
    .withColumn('protocol', expr("CASE WHEN rand() < 0.5 THEN 'HTTP/1.1' ELSE 'HTTP/2.0' END"))
    # One draw for the status code: 80% 200, 10% 404, 10% 500.
    .withColumn('_status_draw', rand())
    .withColumn('status_code', expr("CASE WHEN _status_draw < 0.8 THEN 200 WHEN _status_draw < 0.9 THEN 404 ELSE 500 END"))
    .withColumn('content_size', expr("floor(rand() * 10000)"))
    # Helper draws are implementation detail; keep the written schema unchanged.
    .drop('_method_draw', '_status_draw')
)

# Saving the DataFrame as a Parquet file
logs_df.write.mode('overwrite').parquet('semi_structured_logs.parquet')


In [None]:
# Read the previously written Parquet dataset back into a DataFrame
logs_df = spark.read.format("parquet").load("semi_structured_logs.parquet")

# Preview the first ten rows without truncating column values
logs_df.show(n=10, truncate=False)

+---+--------------------------+---------------+--------------+----------------+--------+-----------+------------+
|id |timestamp                 |remote_host    |request_method|request_endpoint|protocol|status_code|content_size|
+---+--------------------------+---------------+--------------+----------------+--------+-----------+------------+
|0  |2024-05-23 03:36:43.914361|95.248.202.206 |POST          |/api/xyz0123456 |HTTP/1.1|200        |103         |
|1  |2024-05-23 03:36:43.914361|64.183.242.35  |PUT           |/api/nopqrstuvw |HTTP/2.0|404        |984         |
|2  |2024-05-23 03:36:43.914361|143.243.244.70 |POST          |/api/jklmnopqrs |HTTP/1.1|404        |3125        |
|3  |2024-05-23 03:36:43.914361|151.163.97.102 |PUT           |/api/wxyz012345 |HTTP/1.1|200        |7570        |
|4  |2024-05-23 03:36:43.914361|96.94.40.254   |GET           |/api/tuvwxyz012 |HTTP/2.0|200        |1972        |
|5  |2024-05-23 03:36:43.914361|188.69.206.97  |GET           |/api/mnopqrstuv |

In [None]:
# Normalise the timestamp column, then derive the day-of-year number and a
# 'yyyy-MM-dd' formatted date string from it.
logs_df = (
    logs_df
    .withColumn('timestamp', to_timestamp(col('timestamp')))
    .withColumn('day', dayofyear(col('timestamp')))
    .withColumn('date', date_format(col('timestamp'), 'yyyy-MM-dd'))
)

# Preview only the time-related columns
logs_df.select('timestamp', 'day', 'date').show(n=10, truncate=False)

+--------------------------+---+----------+
|timestamp                 |day|date      |
+--------------------------+---+----------+
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
|2024-05-23 03:36:43.914361|144|2024-05-23|
+--------------------------+---+----------+
only showing top 10 rows



In [None]:
# Endpoint popularity: number of requests per endpoint, most requested first
top_endpoints = (
    logs_df
    .groupBy("request_endpoint")
    .count()
    .orderBy(desc("count"))
)
top_endpoints.show(10)

+----------------+-----+
|request_endpoint|count|
+----------------+-----+
| /api/opqrstuvwx|40418|
| /api/rstuvwxyz0|40406|
| /api/bcdefghijk|40278|
| /api/yz01234567|40204|
| /api/hijklmnopq|40176|
| /api/efghijklmn|40157|
| /api/jklmnopqrs|40120|
| /api/xyz0123456|40062|
| /api/tuvwxyz012|40054|
| /api/qrstuvwxyz|40050|
+----------------+-----+
only showing top 10 rows



In [None]:
# Endpoints ranked by the total number of content bytes they served
top_endpoints_by_content = (
    logs_df
    .groupBy("request_endpoint")
    .sum("content_size")
    .withColumnRenamed("sum(content_size)", "total_content_size")
    .orderBy(desc("total_content_size"))
)
top_endpoints_by_content.show(10)

+----------------+------------------+
|request_endpoint|total_content_size|
+----------------+------------------+
| /api/jklmnopqrs|         202151349|
| /api/rstuvwxyz0|         201705725|
| /api/bcdefghijk|         201626507|
| /api/opqrstuvwx|         201434078|
| /api/efghijklmn|         200986721|
| /api/tuvwxyz012|         200737050|
| /api/yz01234567|         200578404|
| /api/hijklmnopq|         200242344|
| /api/lmnopqrstu|         200057268|
| /api/xyz0123456|         200011341|
+----------------+------------------+
only showing top 10 rows



In [None]:
# Total content size served per calendar day, in chronological order
daily_content_size = (
    logs_df
    .withColumn("date", to_date(col("timestamp")))
    .groupBy("date")
    .agg(sum("content_size").alias("daily_content_size"))
    .orderBy("date")
)
daily_content_size.show()

+----------+------------------+
|      date|daily_content_size|
+----------+------------------+
|2024-05-23|        4996661557|
+----------+------------------+



In [None]:
# Smallest, largest and number of non-null content_size values over all logs
content_size_stats = logs_df.agg(
    min(col('content_size')).alias('min_size'),
    max(col('content_size')).alias('max_size'),
    count(col('content_size')).alias('count_size'),
)
content_size_stats.show()

+--------+--------+----------+
|min_size|max_size|count_size|
+--------+--------+----------+
|       0|    9999|   1000000|
+--------+--------+----------+



In [None]:
# Distribution of HTTP status codes, most common first
response_code_analysis = (
    logs_df
    .groupBy('status_code')
    .count()
    .orderBy(desc('count'))
)
response_code_analysis.show()

+-----------+------+
|status_code| count|
+-----------+------+
|        200|800121|
|        404|179885|
|        500| 19994|
+-----------+------+



In [None]:
# Frequent Visitors
# A grouped count can never be below 1, so filtering on `count >= 1` kept every
# host and the "frequent" label was meaningless. Require at least `min_visits`
# requests so that only repeat visitors are reported.
min_visits = 2
frequent_visitors = (
    logs_df
    .groupBy("remote_host")
    .count()
    .filter(col("count") >= min_visits)
    .orderBy("count", ascending=False)
)
frequent_visitors.show(10)

+---------------+-----+
|    remote_host|count|
+---------------+-----+
|    54.92.82.63|    2|
|   70.66.149.51|    2|
|  76.248.35.128|    2|
|    8.27.145.49|    2|
|  50.13.107.208|    2|
|141.222.124.137|    2|
|   15.142.137.0|    2|
|   0.17.146.237|    2|
| 160.13.250.135|    2|
|    115.23.89.6|    2|
+---------------+-----+
only showing top 10 rows



In [None]:
# Hosts with more than 10 requests, busiest first
frequent_ip_addresses = (
    logs_df
    .groupBy('remote_host')
    .agg(count('*').alias('access_count'))
    .where(col('access_count') > 10)
    .orderBy(col('access_count').desc())
)
frequent_ip_addresses.show()

+-----------+------------+
|remote_host|access_count|
+-----------+------------+
+-----------+------------+



In [None]:
# Bad-request analysis: the ten most recent 404 responses with endpoint and time
not_found_requests = logs_df.filter(col("status_code") == 404)
latest_404_requests = (
    not_found_requests
    .orderBy(col("timestamp").desc())
    .select("timestamp", "request_endpoint")
    .limit(10)
)
latest_404_requests.show(truncate=False)

+--------------------------+----------------+
|timestamp                 |request_endpoint|
+--------------------------+----------------+
|2024-05-23 03:36:43.914361|/api/nopqrstuvw |
|2024-05-23 03:36:43.914361|/api/ghijklmnop |
|2024-05-23 03:36:43.914361|/api/jklmnopqrs |
|2024-05-23 03:36:43.914361|/api/yz01234567 |
|2024-05-23 03:36:43.914361|/api/hijklmnopq |
|2024-05-23 03:36:43.914361|/api/defghijklm |
|2024-05-23 03:36:43.914361|/api/yz01234567 |
|2024-05-23 03:36:43.914361|/api/xyz0123456 |
|2024-05-23 03:36:43.914361|/api/nopqrstuvw |
|2024-05-23 03:36:43.914361|/api/jklmnopqrs |
+--------------------------+----------------+



In [None]:
# Calculating percentage of successful requests
total_requests = logs_df.count()
successful_requests = logs_df.filter(col("status_code") == 200).count()
error_requests = total_requests - successful_requests

# Defining thresholds for anomaly detection
success_threshold = 95  # Threshold for successful requests percentage
error_threshold = 5  # Threshold for error requests percentage

# Transformation for anomaly detection
def detect_anomalies(success_percentage, error_percentage):
    """Classify request health from success and error percentages.

    Compares the inputs against the module-level ``success_threshold`` and
    ``error_threshold`` values defined in this cell.

    Args:
        success_percentage: Share of successful requests, in percent.
        error_percentage: Share of failed requests, in percent.

    Returns:
        A human-readable message: a low-success anomaly, a high-error
        anomaly, or a "no anomalies" notice. The success check takes
        precedence over the error check.
    """
    if success_percentage < success_threshold:
        return "Anomaly: Low success rate detected"
    elif error_percentage > error_threshold:
        return "Anomaly: High error rate detected"
    else:
        return "No anomalies detected"

if total_requests == 0:
    # An empty log set has no meaningful rates; dividing by the total would
    # raise ZeroDivisionError, so report the situation explicitly instead.
    success_percentage = 0.0
    error_percentage = 0.0
    anomaly_detection_result = "No requests to analyze"
else:
    # Calculating success and error percentages
    success_percentage = (successful_requests / total_requests) * 100
    error_percentage = 100 - success_percentage

    # Applying transformation to detect anomalies
    anomaly_detection_result = detect_anomalies(success_percentage, error_percentage)

# Printing analysis results and anomaly detection
print(f"Successful Requests: {success_percentage:.2f}%")
print(f"Error Requests: {error_percentage:.2f}%")
print(anomaly_detection_result)

Successful Requests: 80.01%
Error Requests: 19.99%
Anomaly: Low success rate detected
