In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=5c4dcec18912813dfef0ef15a49f15e6fd270c24f166ba06fb1598f87e03ef9c
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, date_format, rand, expr
from pyspark.sql.functions import to_timestamp, dayofyear, date_format, sum, min, max, count, desc
from pyspark.sql.functions import to_date, col

# Initializing Spark session
spark = SparkSession.builder \
    .appName("Generate Semi-Structured Logs") \
    .getOrCreate()

# Generating semi-structured log data
num_logs = 10000

logs_df = spark.range(0, num_logs) \
    .withColumn('timestamp', current_timestamp()) \
    .withColumn('remote_host', expr("concat_ws('.', floor(rand() * 256), floor(rand() * 256), floor(rand() * 256), floor(rand() * 256))")) \
    .withColumn('request_method', expr("CASE WHEN rand() < 0.25 THEN 'GET' WHEN rand() < 0.5 THEN 'POST' WHEN rand() < 0.75 THEN 'PUT' ELSE 'DELETE' END")) \
    .withColumn('request_endpoint', expr("concat('/api/', substring('abcdefghijklmnopqrstuvwxyz0123456789', floor(rand() * 25) + 1, 10))")) \
    .withColumn('protocol', expr("CASE WHEN rand() < 0.5 THEN 'HTTP/1.1' ELSE 'HTTP/2.0' END")) \
    .withColumn('status_code', expr("CASE WHEN rand() < 0.8 THEN 200 WHEN rand() < 0.9 THEN 404 ELSE 500 END")) \
    .withColumn('content_size', expr("floor(rand() * 10000)"))

# Saving the DataFrame as a Parquet file
logs_df.write.mode('overwrite').parquet('semi_structured_logs.parquet')


In [3]:
# Loading semi-structured logs data
logs_df = spark.read.parquet("semi_structured_logs.parquet")

# Displaying the DataFrame
logs_df.show(10, truncate=False)

+---+--------------------------+---------------+--------------+----------------+--------+-----------+------------+
|id |timestamp                 |remote_host    |request_method|request_endpoint|protocol|status_code|content_size|
+---+--------------------------+---------------+--------------+----------------+--------+-----------+------------+
|0  |2024-05-23 22:50:17.663725|240.165.154.114|PUT           |/api/rstuvwxyz0 |HTTP/2.0|200        |1269        |
|1  |2024-05-23 22:50:17.663725|99.99.153.79   |POST          |/api/uvwxyz0123 |HTTP/2.0|200        |3166        |
|2  |2024-05-23 22:50:17.663725|250.71.119.48  |POST          |/api/ghijklmnop |HTTP/2.0|404        |8960        |
|3  |2024-05-23 22:50:17.663725|99.205.146.19  |GET           |/api/stuvwxyz01 |HTTP/2.0|200        |6501        |
|4  |2024-05-23 22:50:17.663725|80.137.87.129  |PUT           |/api/efghijklmn |HTTP/1.1|200        |3999        |
|5  |2024-05-23 22:50:17.663725|55.149.204.221 |POST          |/api/rstuvwxyz0 |

In [4]:
# Converting and extracting timestamp information
logs_df = logs_df.withColumn('timestamp', to_timestamp('timestamp'))
logs_df = logs_df.withColumn('day', dayofyear('timestamp'))
logs_df = logs_df.withColumn('date', date_format('timestamp', 'yyyy-MM-dd'))
logs_df.select('timestamp', 'day', 'date').show(10, truncate=False)

+--------------------------+---+----------+
|timestamp                 |day|date      |
+--------------------------+---+----------+
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
|2024-05-23 22:50:17.663725|144|2024-05-23|
+--------------------------+---+----------+
only showing top 10 rows



In [5]:
# Calculating statistics related to content size
# Top endpoints content
top_endpoints = logs_df.groupBy("request_endpoint").count().orderBy("count", ascending=False)
top_endpoints.show(10)

+----------------+-----+
|request_endpoint|count|
+----------------+-----+
| /api/qrstuvwxyz|  440|
| /api/mnopqrstuv|  435|
| /api/cdefghijkl|  424|
| /api/uvwxyz0123|  417|
| /api/efghijklmn|  417|
| /api/defghijklm|  417|
| /api/pqrstuvwxy|  416|
| /api/yz01234567|  409|
| /api/vwxyz01234|  407|
| /api/tuvwxyz012|  406|
+----------------+-----+
only showing top 10 rows



In [6]:
# Top endpoints transferring maximum content
top_endpoints_by_content = logs_df.groupBy("request_endpoint").sum("content_size") \
    .orderBy(desc("sum(content_size)")).withColumnRenamed("sum(content_size)", "total_content_size")
top_endpoints_by_content.show(10)

+----------------+------------------+
|request_endpoint|total_content_size|
+----------------+------------------+
| /api/mnopqrstuv|           2207205|
| /api/qrstuvwxyz|           2178786|
| /api/efghijklmn|           2150369|
| /api/defghijklm|           2092927|
| /api/cdefghijkl|           2083379|
| /api/xyz0123456|           2076025|
| /api/yz01234567|           2072820|
| /api/tuvwxyz012|           2071670|
| /api/vwxyz01234|           2059728|
| /api/pqrstuvwxy|           2055175|
+----------------+------------------+
only showing top 10 rows



In [7]:
# Daily visited content size
daily_content_size = logs_df.withColumn("date", to_date(col("timestamp"))).groupBy("date").agg(sum("content_size").alias("daily_content_size")).orderBy("date")
daily_content_size.show()

+----------+------------------+
|      date|daily_content_size|
+----------+------------------+
|2024-05-23|          50022447|
+----------+------------------+



In [8]:
# Min, Max and Count of content size
content_size_stats = logs_df.agg(min('content_size').alias('min_size'), max('content_size').alias('max_size'), count('content_size').alias('count_size'))
content_size_stats.show()

+--------+--------+----------+
|min_size|max_size|count_size|
+--------+--------+----------+
|       1|    9999|     10000|
+--------+--------+----------+



In [9]:
# Response Code Analysis
response_code_analysis = logs_df.groupBy('status_code').count().orderBy('count', ascending=False)
response_code_analysis.show()

+-----------+-----+
|status_code|count|
+-----------+-----+
|        200| 7955|
|        404| 1868|
|        500|  177|
+-----------+-----+



In [10]:
# Frequent Visitors
frequent_visitors = logs_df.groupBy("remote_host").count().filter(col("count") >= 1).orderBy("count", ascending=False)
frequent_visitors.show(10)

+---------------+-----+
|    remote_host|count|
+---------------+-----+
| 216.195.84.125|    1|
|   97.71.253.35|    1|
|   40.15.63.164|    1|
|148.220.200.168|    1|
| 201.102.231.52|    1|
|  80.69.207.137|    1|
|    64.31.53.87|    1|
|   5.179.143.26|    1|
|   254.80.20.55|    1|
|  118.159.21.22|    1|
+---------------+-----+
only showing top 10 rows



In [11]:
# Identifying IP addresses accessing the server more than 10 times
frequent_ip_addresses = logs_df.groupBy('remote_host').agg(count('*').alias('access_count')).filter('access_count > 10').orderBy(desc('access_count'))
frequent_ip_addresses.show()

+-----------+------------+
|remote_host|access_count|
+-----------+------------+
+-----------+------------+



In [12]:
# Analyzing bad requests including the top 10 latest 404 requests with their endpoints and time
latest_404_requests = logs_df.filter(col("status_code") == 404).orderBy(col("timestamp"),\
                       ascending=False).select("timestamp", "request_endpoint").limit(10)
latest_404_requests.show(truncate=False)

+--------------------------+----------------+
|timestamp                 |request_endpoint|
+--------------------------+----------------+
|2024-05-23 22:50:17.663725|/api/tuvwxyz012 |
|2024-05-23 22:50:17.663725|/api/ghijklmnop |
|2024-05-23 22:50:17.663725|/api/vwxyz01234 |
|2024-05-23 22:50:17.663725|/api/nopqrstuvw |
|2024-05-23 22:50:17.663725|/api/mnopqrstuv |
|2024-05-23 22:50:17.663725|/api/fghijklmno |
|2024-05-23 22:50:17.663725|/api/opqrstuvwx |
|2024-05-23 22:50:17.663725|/api/mnopqrstuv |
|2024-05-23 22:50:17.663725|/api/fghijklmno |
|2024-05-23 22:50:17.663725|/api/cdefghijkl |
+--------------------------+----------------+



In [13]:
# Calculating percentage of successful requests
total_requests = logs_df.count()
successful_requests = logs_df.filter(col("status_code") == 200).count()
error_requests = total_requests - successful_requests

# Calculating success and error percentages
success_percentage = (successful_requests / total_requests) * 100
error_percentage = 100 - success_percentage

# Defining thresholds for anomaly detection
success_threshold = 95  # Threshold for successful requests percentage
error_threshold = 5  # Threshold for error requests percentage

# Transformation for anomaly detection
def detect_anomalies(success_percentage, error_percentage):
    if success_percentage < success_threshold:
        return "Anomaly: Low success rate detected"
    elif error_percentage > error_threshold:
        return "Anomaly: High error rate detected"
    else:
        return "No anomalies detected"

# Applying transformation to detect anomalies
anomaly_detection_result = detect_anomalies(success_percentage, error_percentage)

# Printing analysis results and anomaly detection
print(f"Successful Requests: {success_percentage:.2f}%")
print(f"Error Requests: {error_percentage:.2f}%")
print(anomaly_detection_result)

Successful Requests: 79.55%
Error Requests: 20.45%
Anomaly: Low success rate detected
