Data Processing

In [None]:
# Load the data
# Read the simulated API logs from CSV, then show a preview of the first rows,
# the column dtypes / non-null counts, and summary statistics of the numeric columns.
# Import necessary libraries
import pandas as pd

df = pd.read_csv("../data/api_logs_simulated.csv")

# A notebook cell only renders its final expression automatically, so the preview
# has to be displayed explicitly; as a bare statement it was silently discarded.
display(df.head())
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   timestamp         100000 non-null  object 
 1   request_id        100000 non-null  object 
 2   api_endpoint      100000 non-null  object 
 3   response_time_ms  100000 non-null  float64
 4   status_code       100000 non-null  int64  
 5   cpu_usage         100000 non-null  float64
 6   memory_usage      100000 non-null  float64
 7   region            100000 non-null  object 
 8   user_agent        100000 non-null  object 
 9   consumer_id       100000 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 7.6+ MB


Unnamed: 0,response_time_ms,status_code,cpu_usage,memory_usage
count,100000.0,100000.0,100000.0,100000.0
mean,200.243631,223.17513,52.581181,55.099813
std,50.261172,70.981668,24.544254,20.195929
min,0.64,200.0,10.0,20.0
25%,166.27,200.0,31.38,37.64
50%,200.31,200.0,52.715,55.15
75%,234.04,200.0,73.85,72.61
max,441.3,500.0,95.0,90.0


In [None]:
# Report missing values per column, then drop incomplete rows.
# isnull() and isna() are aliases, so one call is enough. It is displayed explicitly
# because only a cell's final expression is rendered automatically.
display(df.isna().sum())

# response_time_ms is left out of the drop because its gaps are filled with the column
# median in the next step; dropping those rows here would leave nothing to impute.
required_columns = df.columns.difference(['response_time_ms']).tolist()

# .copy() gives an independent frame, so later column assignments on df do not raise
# SettingWithCopyWarning on pandas versions that track filtered copies.
df = df.dropna(subset=required_columns).copy()

In [None]:
# Impute missing values in the 'response_time_ms' column with the column median,
# then show how many missing values remain (expected: 0).
# isnull() is an alias of isna(), so a single check is sufficient.
latency_median = df['response_time_ms'].median()
df['response_time_ms'] = df['response_time_ms'].fillna(latency_median)
df['response_time_ms'].isna().sum()

0

In [None]:
# Remove duplicates (based on request_id); the first row seen for each id is kept
df = df.drop_duplicates(subset='request_id')

# Fail fast if any duplicate survived, rather than computing a count that is never shown
assert df['request_id'].is_unique, "duplicate request_id values remain after de-duplication"

# Number of distinct requests left in the frame
df['request_id'].nunique()

100000

In [None]:
# Parse the timestamp column into timezone-aware datetimes normalised to UTC
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime, utc=True)

In [None]:
# Detect outliers (response time) with the IQR rule: a value is an outlier when it
# lies more than 1.5 * IQR below the first quartile or above the third quartile.
Q1, Q3 = df['response_time_ms'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
iqr_outlier_mask = (df['response_time_ms'] < lower_fence) | (df['response_time_ms'] > upper_fence)

# 'is_latency_outlier' is reassigned further down from a z-score threshold, which
# discarded this IQR result. It is still written here under the original name, and
# also stored under a dedicated name so both definitions end up in the frame.
df['is_latency_outlier'] = iqr_outlier_mask
df['is_latency_outlier_iqr'] = iqr_outlier_mask


Feature engineering

In [8]:
# Derive hour-of-day and weekday number from the timestamp, a 0/1 indicator for
# status codes of 400 and above, and a quartile-based latency label.
ts = df['timestamp']
df['hour'] = ts.dt.hour
df['day_of_week'] = ts.dt.dayofweek
df['error_flag'] = (df['status_code'] >= 400).astype('int64')
latency_labels = ['low', 'medium', 'high', 'critical']
df['latency_category'] = pd.qcut(df['response_time_ms'], q=4, labels=latency_labels)


In [None]:
# Create new features
# Calendar and time-of-day fields extracted from the timestamp column.
ts = df['timestamp']

df = df.assign(
    date=ts.dt.date,
    hour=ts.dt.hour,
    day_of_week=ts.dt.dayofweek,              # Monday=0 ... Sunday=6
    week_of_year=ts.dt.isocalendar().week,    # ISO week number
    month=ts.dt.month,
    # Saturday (5) or Sunday (6)
    is_weekend=lambda frame: frame['day_of_week'].isin([5, 6]),
    # Business hours. between() is inclusive at both ends, so hours 9 through 18
    # (i.e. up to 18:59) are flagged.
    is_peak_hour=lambda frame: frame['hour'].between(9, 18),
)

In [None]:
# Latency statistics and resource-load buckets


# Standardise response time: distance from the mean in units of standard deviation
df['latency_zscore'] = (df['response_time_ms'] - df['response_time_ms'].mean()) / df['response_time_ms'].std()

# Flag extreme outliers: more than 3 standard deviations away from the mean
df['is_latency_outlier'] = df['latency_zscore'].abs() > 3

# Bin latency into quartile-based categories
df['latency_category'] = pd.qcut(df['response_time_ms'], q=4, labels=['low', 'medium', 'high', 'critical'])

# Resource load indicators.
# pd.cut intervals are open on the left by default, so without include_lowest=True a
# reading of exactly 0 falls outside every bin and becomes NaN instead of 'normal'.
load_bins = [0, 50, 75, 100]
load_labels = ['normal', 'elevated', 'high']
df['cpu_load_category'] = pd.cut(df['cpu_usage'], bins=load_bins, labels=load_labels, include_lowest=True)
df['memory_load_category'] = pd.cut(df['memory_usage'], bins=load_bins, labels=load_labels, include_lowest=True)


In [11]:
# Flag error responses: any status code of 400 or above
status = df['status_code']
df['is_error'] = status >= 400

# Error type: start from 'success', then overwrite the 4xx-and-above rows with
# 'client_error' and finally the 5xx-and-above rows with 'server_error'.
error_type = pd.Series('success', index=df.index)
error_type = error_type.mask(status >= 400, 'client_error')
error_type = error_type.mask(status >= 500, 'server_error')
df['error_type'] = error_type

In [None]:
# Request count per consumer per day

consumer_day_counts = (
    df.groupby(['consumer_id', 'date'])
    .size()
    .reset_index(name='requests_per_day')
)

# Drop any earlier copy of the column before merging: merging onto a frame that already
# has 'requests_per_day' would produce suffixed 'requests_per_day_x' / 'requests_per_day_y'
# columns, which is what happened whenever this cell was re-run.
df = df.drop(columns='requests_per_day', errors='ignore').merge(
    consumer_day_counts, on=['consumer_id', 'date'], how='left'
)

# Endpoint popularity: number of non-null request_id values recorded for each endpoint
# across the whole frame (a plain count, not a rolling average)

endpoint_popularity = df.groupby('api_endpoint')['request_id'].transform('count')
df['endpoint_popularity'] = endpoint_popularity


In [None]:
# Build a combined "<endpoint>_<status code>" label for each row.
# Rows with the same endpoint and status code share the same label.

status_text = df['status_code'].astype(str)
df['api_signature'] = df['api_endpoint'].str.cat(status_text, sep="_")

In [None]:
# Per-consumer, per-day summary: mean response time, number of error responses,
# mean CPU usage and peak memory usage.

# 0/1 indicator for status codes of 400 and above, so that summing it counts errors
df['error_flag'] = (df['status_code'] >= 400).astype('int64')

daily_stats = (
    df.groupby(['consumer_id', 'date'])
    .agg(
        response_time_ms=('response_time_ms', 'mean'),
        error_flag=('error_flag', 'sum'),
        cpu_usage=('cpu_usage', 'mean'),
        memory_usage=('memory_usage', 'max'),
    )
    .reset_index()
)

In [None]:
# Save the engineered dataset to CSV (without the index column).
# NOTE(review): daily_stats is not merged into df in the cells shown, so the file
# holds the row-level features only. Confirm whether a merge was intended.

output_path = "../data/api_logs_engineered.csv"
df.to_csv(output_path, index=False)
# Report the path that was actually written; the previous message dropped the leading ".."
print(f"Enhanced dataset saved to {output_path}")

Enhanced dataset saved to /data/api_logs_engineered.csv
