### Problem 
1.You are given a raw log file which contains messages in JSON format.

Example: `{“timestamp”: “2025-10-13 20:05:05”, “level”: “ERROR”, “message”: “DB failed to connect”, “service”: “UserService”}`

The log file could be very large also in GBs.  
Please create a report which contains with the following:  
•	Count of errors per service  :done
•	Most common error messages  
•	Count of each log levels  :done
•	Most 5 occurred logs with their count  

---



https://stackoverflow.com/questions/10382253/reading-rather-large-json-files

http://pypi.python.org/pypi/ijson/

https://pythonspeed.com/articles/json-memory-streaming/

In [3]:
# using ijson

import ijson
import heapq
import time

start_time = time.perf_counter()

log_level_freq = {}     # {Key.ERROR: 56, 'WARNING': 12}
error_service_freq = {} # {'UserService': 12, 'AuthService': 12}
top_logs = []           # a heap stores top k logs
top_errors = []         # a heap stores top k error
log_counter = {}        # {'log_message_x': 2}
error_counter = {}      # {'error_message_1': 4}

TOP_K = 5
FILE_PATH = 'data/logs_1000mb.json'
class Key:
    LEVEL = 'level'
    ERROR = 'ERROR'
    SERVICE = 'service'
    MESSAGE = 'message'

with open(FILE_PATH, 'r') as f:
    for log in ijson.items(f, 'item'):
        # print(record)
        if log[Key.LEVEL]:
            log_level_freq[log[Key.LEVEL]] = log_level_freq.get(log[Key.LEVEL], 0) + 1
        if log[Key.LEVEL] == Key.ERROR and log[Key.SERVICE]:
            error_service_freq[log[Key.SERVICE]] = error_service_freq.get(log[Key.SERVICE], 0) + 1

        log_key = (log.get(Key.LEVEL,''), log.get(Key.SERVICE,''), log.get(Key.MESSAGE,''))
        log_count = log_counter.get(log_key, 0) + 1

        log_counter[log_key] = log_count
        heapq.heappush(top_logs, (log_count, log_key))
        if len(top_logs) > TOP_K:
            heapq.heappop(top_logs)

        if log.get(Key.LEVEL) == Key.ERROR:
            msg = log.get(Key.MESSAGE,'')
            err_count = error_counter.get(msg, 0) + 1
            error_counter[msg] = err_count
            heapq.heappush(top_errors, (err_count, msg))
            if len(top_errors) > TOP_K:
                heapq.heappop(top_errors)


print('log per level:', log_level_freq)
print('\nerror per service:', error_service_freq)

most_occured_logs = sorted(top_logs, reverse=True)
most_occured_errors = sorted(top_errors, reverse=True)

print("\nmost occured errors: ", most_occured_errors)
print("\nmost occured logs: ", most_occured_logs)


end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"\nExecution time: {execution_time:.4f} seconds")


error per service: {'InventoryService': 335647, 'AuthService': 336236, 'OrderService': 335761, 'AnalyticsService': 336052, 'PaymentService': 335728}

most occured errors:  [(280735, 'Cache miss occurred'), (280734, 'Cache miss occurred'), (280733, 'Cache miss occurred'), (280732, 'Cache miss occurred'), (280731, 'Cache miss occurred')]

most occured logs:  [(56497, ('CRITICAL', 'InventoryService', 'User login successful')), (56496, ('CRITICAL', 'InventoryService', 'User login successful')), (56495, ('CRITICAL', 'InventoryService', 'User login successful')), (56494, ('CRITICAL', 'InventoryService', 'User login successful')), (56493, ('CRITICAL', 'InventoryService', 'User login successful'))]

Execution time: 26.9762 seconds


In [None]:
import heapq

class KthLargest:
    def __init__(self, k: int, data):
        """"""
        self.minHeap = []
        self.k = k
        for e in data:
            heapq.heappush(self.minHeap, e)
            if len(self.minHeap) > k:
                heapq.heappop(self.minHeap)

    def add(self, val: int) -> int:
        heapq.heappush(self.minHeap, val)
        if len(self.minHeap) > self.k:
            heapq.heappop(self.minHeap)
        return self.minHeap[0]

In [4]:
# using json



import json
import heapq
import time

start_time = time.perf_counter()

log_level_freq = {}     # {Key.ERROR: 56, 'WARNING': 12}
error_service_freq = {} # {'UserService': 12, 'AuthService': 12}
top_logs = []           # a heap stores top k logs
top_errors = []         # a heap stores top k error
log_counter = {}        # {'log_message_x': 2}
error_counter = {}      # {'error_message_1': 4}
FILE_PATH = 'data/logs_1000mb.json'
TOP_K = 5
class Key:
    LEVEL = 'level'
    ERROR = 'ERROR'
    SERVICE = 'service'
    MESSAGE = 'message'

f = open(FILE_PATH)
import json
logs = json.load(f)
for index, log in enumerate(logs):
        if log[Key.LEVEL]:
            log_level_freq[log[Key.LEVEL]] = log_level_freq.get(log[Key.LEVEL], 0) + 1
        if log[Key.LEVEL] == Key.ERROR and log[Key.SERVICE]:
            error_service_freq[log[Key.SERVICE]] = error_service_freq.get(log[Key.SERVICE], 0) + 1

        log_key = (log.get(Key.LEVEL,''), log.get(Key.SERVICE,''), log.get(Key.MESSAGE,''))
        log_count = log_counter.get(log_key, 0) + 1

        log_counter[log_key] = log_count
        heapq.heappush(top_logs, (log_count, log_key))
        if len(top_logs) > TOP_K:
            heapq.heappop(top_logs)

        if log.get(Key.LEVEL) == Key.ERROR:
            msg = log.get(Key.MESSAGE,'')
            err_count = error_counter.get(msg, 0) + 1
            error_counter[msg] = err_count
            heapq.heappush(top_errors, (err_count, msg))
            if len(top_errors) > TOP_K:
                heapq.heappop(top_errors)


print('log per level:', log_level_freq)
print('\nerror per service:', error_service_freq)

most_occured_logs = sorted(top_logs, reverse=True)
most_occured_errors = sorted(top_errors, reverse=True)

print("\nmost occured errors: ", most_occured_errors)
print("\nmost occured logs: ", most_occured_logs)


end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"\nExecution time: {execution_time:.4f} seconds")


error per service: {'InventoryService': 335647, 'AuthService': 336236, 'OrderService': 335761, 'AnalyticsService': 336052, 'PaymentService': 335728}

most occured errors:  [(280735, 'Cache miss occurred'), (280734, 'Cache miss occurred'), (280733, 'Cache miss occurred'), (280732, 'Cache miss occurred'), (280731, 'Cache miss occurred')]

most occured logs:  [(56497, ('CRITICAL', 'InventoryService', 'User login successful')), (56496, ('CRITICAL', 'InventoryService', 'User login successful')), (56495, ('CRITICAL', 'InventoryService', 'User login successful')), (56494, ('CRITICAL', 'InventoryService', 'User login successful')), (56493, ('CRITICAL', 'InventoryService', 'User login successful'))]

Execution time: 26.9048 seconds
