In [29]:
!pip install pyspark



In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, date_format, rand, expr
from pyspark.sql.functions import to_timestamp, dayofyear, date_format, sum, min, max, count, desc
from pyspark.sql.functions import to_date, col

# Initializing Spark session
spark = SparkSession.builder \
    .appName("Generate Semi-Structured Logs") \
    .getOrCreate()

# Generating semi-structured log data
num_logs = 1000000

logs_df = spark.range(0, num_logs) \
    .withColumn('timestamp', current_timestamp()) \
    .withColumn('remote_host', expr("concat_ws('.', floor(rand() * 256), floor(rand() * 256), floor(rand() * 256), floor(rand() * 256))")) \
    .withColumn('request_method', expr("CASE WHEN rand() < 0.25 THEN 'GET' WHEN rand() < 0.5 THEN 'POST' WHEN rand() < 0.75 THEN 'PUT' ELSE 'DELETE' END")) \
    .withColumn('request_endpoint', expr("concat('/api/', substring('abcdefghijklmnopqrstuvwxyz0123456789', floor(rand() * 25) + 1, 10))")) \
    .withColumn('protocol', expr("CASE WHEN rand() < 0.5 THEN 'HTTP/1.1' ELSE 'HTTP/2.0' END")) \
    .withColumn('status_code', expr("CASE WHEN rand() < 0.8 THEN 200 WHEN rand() < 0.9 THEN 404 ELSE 500 END")) \
    .withColumn('content_size', expr("floor(rand() * 10000)"))

# Saving the DataFrame as a Parquet file
logs_df.write.mode('overwrite').parquet('semi_structured_logs.parquet')


In [31]:
# Loading semi-structured logs data
logs_df = spark.read.parquet("semi_structured_logs.parquet")

# Displaying the DataFrame
logs_df.show(10, truncate=False)

+---+--------------------------+---------------+--------------+----------------+--------+-----------+------------+
|id |timestamp                 |remote_host    |request_method|request_endpoint|protocol|status_code|content_size|
+---+--------------------------+---------------+--------------+----------------+--------+-----------+------------+
|0  |2024-05-22 17:39:42.278127|178.185.210.79 |GET           |/api/jklmnopqrs |HTTP/2.0|200        |3645        |
|1  |2024-05-22 17:39:42.278127|19.77.85.16    |GET           |/api/rstuvwxyz0 |HTTP/2.0|500        |4510        |
|2  |2024-05-22 17:39:42.278127|119.87.76.235  |PUT           |/api/klmnopqrst |HTTP/1.1|200        |9292        |
|3  |2024-05-22 17:39:42.278127|127.189.105.109|PUT           |/api/uvwxyz0123 |HTTP/2.0|200        |4612        |
|4  |2024-05-22 17:39:42.278127|42.8.51.108    |POST          |/api/stuvwxyz01 |HTTP/1.1|200        |4082        |
|5  |2024-05-22 17:39:42.278127|32.73.75.122   |PUT           |/api/fghijklmno |

In [32]:
# Converting and extracting timestamp information
logs_df = logs_df.withColumn('timestamp', to_timestamp('timestamp'))
logs_df = logs_df.withColumn('day', dayofyear('timestamp'))
logs_df = logs_df.withColumn('date', date_format('timestamp', 'yyyy-MM-dd'))
logs_df.select('timestamp', 'day', 'date').show(10, truncate=False)

+--------------------------+---+----------+
|timestamp                 |day|date      |
+--------------------------+---+----------+
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
|2024-05-22 17:39:42.278127|143|2024-05-22|
+--------------------------+---+----------+
only showing top 10 rows



In [33]:
# Calculating statistics related to content size
# Top endpoints content
top_endpoints = logs_df.groupBy("request_endpoint").count().orderBy("count", ascending=False)
top_endpoints.show(10)

+----------------+-----+
|request_endpoint|count|
+----------------+-----+
| /api/lmnopqrstu|40334|
| /api/pqrstuvwxy|40296|
| /api/mnopqrstuv|40227|
| /api/vwxyz01234|40191|
| /api/bcdefghijk|40188|
| /api/xyz0123456|40153|
| /api/efghijklmn|40137|
| /api/defghijklm|40128|
| /api/qrstuvwxyz|40099|
| /api/opqrstuvwx|40094|
+----------------+-----+
only showing top 10 rows



In [34]:
# Top endpoints transferring maximum content
top_endpoints_by_content = logs_df.groupBy("request_endpoint").sum("content_size") \
    .orderBy(desc("sum(content_size)")).withColumnRenamed("sum(content_size)", "total_content_size")
top_endpoints_by_content.show(10)

+----------------+------------------+
|request_endpoint|total_content_size|
+----------------+------------------+
| /api/efghijklmn|         201948829|
| /api/pqrstuvwxy|         201722927|
| /api/bcdefghijk|         201714371|
| /api/lmnopqrstu|         201379009|
| /api/xyz0123456|         200792321|
| /api/defghijklm|         200688391|
| /api/abcdefghij|         200579581|
| /api/vwxyz01234|         200535832|
| /api/rstuvwxyz0|         200525906|
| /api/hijklmnopq|         200360306|
+----------------+------------------+
only showing top 10 rows



In [35]:
# Daily visited content size
daily_content_size = logs_df.withColumn("date", to_date(col("timestamp"))).groupBy("date").agg(sum("content_size").alias("daily_content_size")).orderBy("date")
daily_content_size.show()

+----------+------------------+
|      date|daily_content_size|
+----------+------------------+
|2024-05-22|        4996097166|
+----------+------------------+



In [36]:
# Min, Max and Count of content size
content_size_stats = logs_df.agg(min('content_size').alias('min_size'), max('content_size').alias('max_size'), count('content_size').alias('count_size'))
content_size_stats.show()

+--------+--------+----------+
|min_size|max_size|count_size|
+--------+--------+----------+
|       0|    9999|   1000000|
+--------+--------+----------+



In [37]:
# Response Code Analysis
response_code_analysis = logs_df.groupBy('status_code').count().orderBy('count', ascending=False)
response_code_analysis.show()

+-----------+------+
|status_code| count|
+-----------+------+
|        200|800661|
|        404|179381|
|        500| 19958|
+-----------+------+



In [38]:
# Frequent Visitors
frequent_visitors = logs_df.groupBy("remote_host").count().filter(col("count") >= 1).orderBy("count", ascending=False)
frequent_visitors.show(10)

+---------------+-----+
|    remote_host|count|
+---------------+-----+
| 246.247.69.141|    2|
|170.156.228.209|    2|
|   99.29.37.121|    2|
| 99.176.129.220|    2|
| 165.116.58.114|    2|
|   147.86.4.122|    2|
| 158.68.230.167|    2|
| 145.251.215.38|    2|
|  229.163.61.43|    2|
|188.171.225.195|    2|
+---------------+-----+
only showing top 10 rows



In [39]:
# Identifying IP addresses accessing the server more than 10 times
frequent_ip_addresses = logs_df.groupBy('remote_host').agg(count('*').alias('access_count')).filter('access_count > 10').orderBy(desc('access_count'))
frequent_ip_addresses.show()

+-----------+------------+
|remote_host|access_count|
+-----------+------------+
+-----------+------------+



In [40]:
# Analyzing bad requests including the top 10 latest 404 requests with their endpoints and time
latest_404_requests = logs_df.filter(col("status_code") == 404).orderBy(col("timestamp"),\
                       ascending=False).select("timestamp", "request_endpoint").limit(10)
latest_404_requests.show(truncate=False)

+--------------------------+----------------+
|timestamp                 |request_endpoint|
+--------------------------+----------------+
|2024-05-22 17:39:42.278127|/api/rstuvwxyz0 |
|2024-05-22 17:39:42.278127|/api/opqrstuvwx |
|2024-05-22 17:39:42.278127|/api/mnopqrstuv |
|2024-05-22 17:39:42.278127|/api/opqrstuvwx |
|2024-05-22 17:39:42.278127|/api/vwxyz01234 |
|2024-05-22 17:39:42.278127|/api/efghijklmn |
|2024-05-22 17:39:42.278127|/api/klmnopqrst |
|2024-05-22 17:39:42.278127|/api/ijklmnopqr |
|2024-05-22 17:39:42.278127|/api/mnopqrstuv |
|2024-05-22 17:39:42.278127|/api/mnopqrstuv |
+--------------------------+----------------+



In [41]:
# Calculating percentage of successful requests
total_requests = logs_df.count()
successful_requests = logs_df.filter(col("status_code") == 200).count()
error_requests = total_requests - successful_requests

# Calculating success and error percentages
success_percentage = (successful_requests / total_requests) * 100
error_percentage = 100 - success_percentage

# Defining thresholds for anomaly detection
success_threshold = 95  # Threshold for successful requests percentage
error_threshold = 5  # Threshold for error requests percentage

# Transformation for anomaly detection
def detect_anomalies(success_percentage, error_percentage):
    if success_percentage < success_threshold:
        return "Anomaly: Low success rate detected"
    elif error_percentage > error_threshold:
        return "Anomaly: High error rate detected"
    else:
        return "No anomalies detected"

# Applying transformation to detect anomalies
anomaly_detection_result = detect_anomalies(success_percentage, error_percentage)

# Printing analysis results and anomaly detection
print(f"Successful Requests: {success_percentage:.2f}%")
print(f"Error Requests: {error_percentage:.2f}%")
print(anomaly_detection_result)

Successful Requests: 80.07%
Error Requests: 19.93%
Anomaly: Low success rate detected
