Data Generation & Ingestion.
Simulate 100,000 API requests across multiple endpoints, regions, and consumers to represent real-world API usage for anomaly detection and BI insights.

In [2]:
import pandas as pd
import random
import uuid
from datetime import datetime, timedelta
import numpy as np
import os

Fields:
- timestamp, request_id, api_endpoint
- response_time_ms, status_code
- cpu_usage, memory_usage
- region, user_agent, consumer_id

In [3]:
# Configuration
n_records = 100000
start_date = datetime.utcnow() - timedelta(days=180)
endpoints = ["/login", "/verify", "/transaction", "/balance", "/logout", "/admin/reset", "/user/profile", "/data/fetch"]
status_codes = [200, 200, 200, 401, 403, 404, 500]
regions = ["US", "EU", "ASIA", "AFRICA", "LATAM"]
user_agents = ["PostmanRuntime", "Mozilla/5.0", "Python-urllib/3.9", "curl/7.68.0"]
consumer_ids = [f"C{str(i).zfill(4)}" for i in range(1, 201)]

def generate_api_logs(n):
    logs = []
    for _ in range(n):
        timestamp = start_date + timedelta(seconds=random.randint(0, int((datetime.utcnow() - start_date).total_seconds())))
        api_endpoint = random.choices(endpoints, weights=[20, 15, 25, 10, 10, 2, 10, 8])[0]
        response_time = round(np.abs(np.random.normal(loc=200, scale=50)), 2)
        status = random.choices(status_codes, weights=[80, 5, 5, 2, 2, 3, 3])[0]
        cpu = round(random.uniform(10, 95), 2)
        memory = round(random.uniform(20, 90), 2)
        request_id = str(uuid.uuid4())
        region = random.choice(regions)
        user_agent = random.choice(user_agents)
        consumer_id = random.choice(consumer_ids)

        logs.append({
            "timestamp": timestamp.isoformat(),
            "request_id": request_id,
            "api_endpoint": api_endpoint,
            "response_time_ms": response_time,
            "status_code": status,
            "cpu_usage": cpu,
            "memory_usage": memory,
            "region": region,
            "user_agent": user_agent,
            "consumer_id": consumer_id
        })
    
    return pd.DataFrame(logs)

  start_date = datetime.utcnow() - timedelta(days=180)


In [4]:
# Generate and Save
df = generate_api_logs(n_records)

os.makedirs("../data", exist_ok=True)
df.to_csv("../data/api_logs_simulated.csv", index=False)
print("API Logs saved to /data/api_logs_simulated.csv")

  timestamp = start_date + timedelta(seconds=random.randint(0, int((datetime.utcnow() - start_date).total_seconds())))


API Logs saved to /data/api_logs_simulated.csv


In [4]:
df.head()

Unnamed: 0,timestamp,request_id,api_endpoint,response_time_ms,status_code,cpu_usage,memory_usage,region,user_agent,consumer_id
0,2025-02-14T16:19:46.096855,e4b17461-f590-4e27-8f5f-17832d5720f2,/verify,162.56,200,92.0,43.17,ASIA,PostmanRuntime,C0121
1,2025-02-17T12:05:36.096855,f8783a16-fe7a-4a45-b27f-cc204ee3d524,/logout,131.69,200,51.19,53.33,LATAM,Mozilla/5.0,C0134
2,2024-10-05T17:31:59.096855,64f82ac9-ba4f-4551-a13a-e6883272e26c,/data/fetch,193.6,200,10.61,48.31,EU,Python-urllib/3.9,C0153
3,2025-02-06T02:36:56.096855,516a920a-578a-495e-b6e0-373842cbca0a,/verify,174.67,200,69.18,64.61,AFRICA,Mozilla/5.0,C0099
4,2025-03-15T11:26:52.096855,66c14b23-c662-48a6-a121-5c1a8a780bdd,/transaction,185.27,200,36.62,78.7,EU,Python-urllib/3.9,C0096


In [5]:
import os
os.path.exists("../data/api_logs_simulated.csv")

True