#### detection of the surging of events in an irregular time series

__Inter-event Time Analysis:__

Calculates the time between consecutive events.
Flags events where the inter-event time is significantly shorter than average.


__Sliding Window Event Count:__

Counts events within a sliding time window.
Flags time points where the event count is significantly higher than average.


__Kernel Density Estimation:__

Uses KDE to estimate the density of events over time.
Flags events in regions of unusually high density.

__Inter-event Time Analysis:__

    WITH ordered_events AS (
      SELECT 
        timestamp,
        LAG(timestamp) OVER (ORDER BY timestamp) AS prev_timestamp
      FROM events
    ),
    inter_event_times AS (
      SELECT 
        timestamp,
        EXTRACT(EPOCH FROM (timestamp - prev_timestamp)) AS iet_seconds
      FROM ordered_events
      WHERE prev_timestamp IS NOT NULL
    )
    SELECT 
      timestamp,
      iet_seconds,
      AVG(iet_seconds) OVER () AS mean_iet,
      STDDEV(iet_seconds) OVER () AS stddev_iet
    FROM inter_event_times;

In [4]:
import pandas as pd
import numpy as np

def inter_event_time_analysis(timestamps, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    df['iet'] = df['timestamp'].diff().dt.total_seconds()
    
    mean_iet = df['iet'].mean()
    std_iet = df['iet'].std()
    
    df['is_anomaly'] = df['iet'] < (mean_iet - threshold * std_iet)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = inter_event_time_analysis(timestamps)
print("Inter-event Time Anomalies:", anomalies)

NameError: name 'timestamps' is not defined

__Sliding Window Event Count:__

    WITH event_counts AS (
      SELECT 
        e1.timestamp,
        COUNT(*) AS event_count
      FROM events e1
      JOIN events e2 ON e2.timestamp BETWEEN e1.timestamp - INTERVAL '7 days' AND e1.timestamp
      GROUP BY e1.timestamp
    )
    SELECT 
      timestamp,
      event_count,
      AVG(event_count) OVER () AS mean_count,
      STDDEV(event_count) OVER () AS stddev_count
    FROM event_counts;

In [None]:
import pandas as pd
import numpy as np

def inter_event_time_analysis(timestamps, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    df['iet'] = df['timestamp'].diff().dt.total_seconds()
    
    mean_iet = df['iet'].mean()
    std_iet = df['iet'].std()
    
    df['is_anomaly'] = df['iet'] < (mean_iet - threshold * std_iet)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = inter_event_time_analysis(timestamps)
print("Inter-event Time Anomalies:", anomalies)

__Sliding Window Event Count:__

    WITH event_counts AS (
      SELECT 
        e1.timestamp,
        COUNT(*) AS event_count
      FROM events e1
      JOIN events e2 ON e2.timestamp BETWEEN e1.timestamp - INTERVAL '7 days' AND e1.timestamp
      GROUP BY e1.timestamp
    )
    SELECT 
      timestamp,
      event_count,
      AVG(event_count) OVER () AS mean_count,
      STDDEV(event_count) OVER () AS stddev_count
    FROM event_counts;

In [None]:
import pandas as pd
import numpy as np

def sliding_window_count(timestamps, window_size=7, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    window = pd.Timedelta(days=window_size)
    
    df['count'] = df.apply(lambda row: sum((row['timestamp'] - window <= ts <= row['timestamp']) for ts in df['timestamp']), axis=1)
    
    mean_count = df['count'].mean()
    std_count = df['count'].std()
    
    df['is_anomaly'] = df['count'] > (mean_count + threshold * std_count)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = sliding_window_count(timestamps)
print("Sliding Window Count Anomalies:", anomalies)

In [None]:
# __Kernel Density Estimation:__
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity

def kde_analysis(timestamps, bandwidth=7, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    X = df['timestamp'].astype(int).values.reshape(-1, 1)
    
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth * 86400)  # convert days to seconds
    kde.fit(X)
    
    log_dens = kde.score_samples(X)
    
    mean_log_dens = np.mean(log_dens)
    std_log_dens = np.std(log_dens)
    
    df['is_anomaly'] = log_dens < (mean_log_dens - threshold * std_log_dens)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = kde_analysis(timestamps)
print("KDE Anomalies:", anomalies)


In [None]:
# visualizaiton

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.neighbors import KernelDensity

def inter_event_time_analysis(dates, threshold=2):
    df = pd.DataFrame({'date': pd.to_datetime(dates)}).sort_values('date')
    df['iet'] = df['date'].diff().dt.days
    
    mean_iet = df['iet'].mean()
    std_iet = df['iet'].std()
    
    df['is_anomaly'] = df['iet'] < (mean_iet - threshold * std_iet)
    
    return df[df['is_anomaly']]['date'].tolist()

def sliding_window_count(dates, window_size=7, threshold=2):
    df = pd.DataFrame({'date': pd.to_datetime(dates)}).sort_values('date')
    
    df['count'] = df.apply(lambda row: sum((row['date'] - pd.Timedelta(days=window_size) <= d <= row['date']) for d in df['date']), axis=1)
    
    mean_count = df['count'].mean()
    std_count = df['count'].std()
    
    df['is_anomaly'] = df['count'] > (mean_count + threshold * std_count)
    
    return df[df['is_anomaly']]['date'].tolist()

def kde_analysis(dates, bandwidth=7, threshold=2):
    df = pd.DataFrame({'date': pd.to_datetime(dates)}).sort_values('date')
    X = df['date'].astype(int).values.reshape(-1, 1) // 10**9 // 86400  # convert to days since epoch
    
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(X)
    
    log_dens = kde.score_samples(X)
    
    mean_log_dens = np.mean(log_dens)
    std_log_dens = np.std(log_dens)
    
    df['is_anomaly'] = log_dens < (mean_log_dens - threshold * std_log_dens)
    
    return df[df['is_anomaly']]['date'].tolist()

def visualize_anomalies(dates, iet_anomalies, swc_anomalies, kde_anomalies):
    dates = pd.to_datetime(dates)
    
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 15), sharex=True)
    
    # Common x-axis settings
    plt.xlabel('Date')
    plt.xlim(dates.min(), dates.max())
    
    # Inter-event Time Analysis
    ax1.scatter(dates, [1]*len(dates), alpha=0.5, label='Events')
    ax1.scatter(iet_anomalies, [1]*len(iet_anomalies), color='red', s=100, marker='*', label='IET Anomalies')
    ax1.set_title('Inter-event Time Analysis')
    ax1.legend()
    ax1.set_yticks([])
    
    # Sliding Window Count
    ax2.scatter(dates, [1]*len(dates), alpha=0.5, label='Events')
    ax2.scatter(swc_anomalies, [1]*len(swc_anomalies), color='green', s=100, marker='*', label='SWC Anomalies')
    ax2.set_title('Sliding Window Count')
    ax2.legend()
    ax2.set_yticks([])
    
    # Kernel Density Estimation
    ax3.scatter(dates, [1]*len(dates), alpha=0.5, label='Events')
    ax3.scatter(kde_anomalies, [1]*len(kde_anomalies), color='orange', s=100, marker='*', label='KDE Anomalies')
    ax3.set_title('Kernel Density Estimation')
    ax3.legend()
    ax3.set_yticks([])
    
    # Format x-axis
    for ax in [ax1, ax2, ax3]:
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    
    plt.tight_layout()
    plt.show()

# Example usage
# Replace this with your actual dates
dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
dates = dates[np.random.rand(len(dates)) > 0.7]  # Randomly select about 30% of the days

iet_anomalies = inter_event_time_analysis(dates)
swc_anomalies = sliding_window_count(dates)
kde_anomalies = kde_analysis(dates)

visualize_anomalies(dates, iet_anomalies, swc_anomalies, kde_anomalies)