#### detection of the surging of events in an irregular time series

__Inter-event Time Analysis:__

Calculates the time between consecutive events.
Flags events where the inter-event time is significantly shorter than average.


__Sliding Window Event Count:__

Counts events within a sliding time window.
Flags time points where the event count is significantly higher than average.


__Kernel Density Estimation:__

Uses KDE to estimate the density of events over time.
Flags events in regions of unusually high density.

__Inter-event Time Analysis:__

    WITH ordered_events AS (
      SELECT 
        timestamp,
        LAG(timestamp) OVER (ORDER BY timestamp) AS prev_timestamp
      FROM events
    ),
    inter_event_times AS (
      SELECT 
        timestamp,
        EXTRACT(EPOCH FROM (timestamp - prev_timestamp)) AS iet_seconds
      FROM ordered_events
      WHERE prev_timestamp IS NOT NULL
    )
    SELECT 
      timestamp,
      iet_seconds,
      AVG(iet_seconds) OVER () AS mean_iet,
      STDDEV(iet_seconds) OVER () AS stddev_iet
    FROM inter_event_times;

In [4]:
import pandas as pd
import numpy as np

def inter_event_time_analysis(timestamps, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    df['iet'] = df['timestamp'].diff().dt.total_seconds()
    
    mean_iet = df['iet'].mean()
    std_iet = df['iet'].std()
    
    df['is_anomaly'] = df['iet'] < (mean_iet - threshold * std_iet)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = inter_event_time_analysis(timestamps)
print("Inter-event Time Anomalies:", anomalies)

NameError: name 'timestamps' is not defined

__Sliding Window Event Count:__

    WITH event_counts AS (
      SELECT 
        e1.timestamp,
        COUNT(*) AS event_count
      FROM events e1
      JOIN events e2 ON e2.timestamp BETWEEN e1.timestamp - INTERVAL '7 days' AND e1.timestamp
      GROUP BY e1.timestamp
    )
    SELECT 
      timestamp,
      event_count,
      AVG(event_count) OVER () AS mean_count,
      STDDEV(event_count) OVER () AS stddev_count
    FROM event_counts;

In [None]:
import pandas as pd
import numpy as np

def inter_event_time_analysis(timestamps, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    df['iet'] = df['timestamp'].diff().dt.total_seconds()
    
    mean_iet = df['iet'].mean()
    std_iet = df['iet'].std()
    
    df['is_anomaly'] = df['iet'] < (mean_iet - threshold * std_iet)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = inter_event_time_analysis(timestamps)
print("Inter-event Time Anomalies:", anomalies)

__Sliding Window Event Count:__

    WITH event_counts AS (
      SELECT 
        e1.timestamp,
        COUNT(*) AS event_count
      FROM events e1
      JOIN events e2 ON e2.timestamp BETWEEN e1.timestamp - INTERVAL '7 days' AND e1.timestamp
      GROUP BY e1.timestamp
    )
    SELECT 
      timestamp,
      event_count,
      AVG(event_count) OVER () AS mean_count,
      STDDEV(event_count) OVER () AS stddev_count
    FROM event_counts;

In [None]:
import pandas as pd
import numpy as np

def sliding_window_count(timestamps, window_size=7, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    window = pd.Timedelta(days=window_size)
    
    df['count'] = df.apply(lambda row: sum((row['timestamp'] - window <= ts <= row['timestamp']) for ts in df['timestamp']), axis=1)
    
    mean_count = df['count'].mean()
    std_count = df['count'].std()
    
    df['is_anomaly'] = df['count'] > (mean_count + threshold * std_count)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = sliding_window_count(timestamps)
print("Sliding Window Count Anomalies:", anomalies)

In [None]:
# __Kernel Density Estimation:__
import pandas as pd
import numpy as np
from sklearn.neighbors import KernelDensity

def kde_analysis(timestamps, bandwidth=7, threshold=2):
    df = pd.DataFrame({'timestamp': pd.to_datetime(timestamps)}).sort_values('timestamp')
    X = df['timestamp'].astype(int).values.reshape(-1, 1)
    
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth * 86400)  # convert days to seconds
    kde.fit(X)
    
    log_dens = kde.score_samples(X)
    
    mean_log_dens = np.mean(log_dens)
    std_log_dens = np.std(log_dens)
    
    df['is_anomaly'] = log_dens < (mean_log_dens - threshold * std_log_dens)
    
    return df[df['is_anomaly']]['timestamp'].tolist()

# Usage
anomalies = kde_analysis(timestamps)
print("KDE Anomalies:", anomalies)


In [None]:
# visualizaiton


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.neighbors import KernelDensity

def inter_event_time_analysis(dates, threshold=2):
    df = pd.DataFrame({'date': pd.to_datetime(dates)}).sort_values('date')
    df['iet'] = df['date'].diff().dt.days
    
    mean_iet = df['iet'].mean()
    std_iet = df['iet'].std()
    
    df['is_dormant'] = df['iet'] > (mean_iet + threshold * std_iet)
    df['is_surge'] = (df['iet'] < (mean_iet - threshold * std_iet)) & (df['iet'] > 0)
    
    return {
        'dormant': df[df['is_dormant']]['date'].tolist(),
        'surge': df[df['is_surge']]['date'].tolist()
    }

def sliding_window_count(dates, window_size=7, threshold=2):
    df = pd.DataFrame({'date': pd.to_datetime(dates)}).sort_values('date')
    
    df['count'] = df.apply(lambda row: sum((row['date'] - pd.Timedelta(days=window_size) <= d <= row['date']) for d in df['date']), axis=1)
    
    mean_count = df['count'].mean()
    std_count = df['count'].std()
    
    df['is_dormant'] = df['count'] < (mean_count - threshold * std_count)
    df['is_surge'] = df['count'] > (mean_count + threshold * std_count)
    
    return {
        'dormant': df[df['is_dormant']]['date'].tolist(),
        'surge': df[df['is_surge']]['date'].tolist()
    }

def kde_analysis(dates, bandwidth=7, threshold=2):
    df = pd.DataFrame({'date': pd.to_datetime(dates)}).sort_values('date')
    
    earliest_date = df['date'].min()
    X = (df['date'] - earliest_date).dt.total_seconds().div(86400).values.reshape(-1, 1)
    
    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    kde.fit(X)
    
    log_dens = kde.score_samples(X)
    
    mean_log_dens = np.mean(log_dens)
    std_log_dens = np.std(log_dens)
    
    df['is_dormant'] = log_dens < (mean_log_dens - threshold * std_log_dens)
    df['is_surge'] = log_dens > (mean_log_dens + threshold * std_log_dens)
    
    return {
        'dormant': df[df['is_dormant']]['date'].tolist(),
        'surge': df[df['is_surge']]['date'].tolist()
    }

def visualize_anomalies(dates, iet_anomalies, swc_anomalies, kde_anomalies):
    dates = pd.to_datetime(dates)
    
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 15), sharex=True)
    
    plt.xlabel('Date')
    plt.xlim(dates.min(), dates.max())
    
    # Inter-event Time Analysis
    ax1.scatter(dates, [1]*len(dates), alpha=0.5, label='Events')
    ax1.scatter(iet_anomalies['dormant'], [1]*len(iet_anomalies['dormant']), color='blue', s=100, marker='v', label='IET Dormant')
    ax1.scatter(iet_anomalies['surge'], [1]*len(iet_anomalies['surge']), color='red', s=100, marker='^', label='IET Surge')
    ax1.set_title('Inter-event Time Analysis')
    ax1.legend()
    ax1.set_yticks([])
    
    # Sliding Window Count
    ax2.scatter(dates, [1]*len(dates), alpha=0.5, label='Events')
    ax2.scatter(swc_anomalies['dormant'], [1]*len(swc_anomalies['dormant']), color='blue', s=100, marker='v', label='SWC Dormant')
    ax2.scatter(swc_anomalies['surge'], [1]*len(swc_anomalies['surge']), color='red', s=100, marker='^', label='SWC Surge')
    ax2.set_title('Sliding Window Count')
    ax2.legend()
    ax2.set_yticks([])
    
    # Kernel Density Estimation
    ax3.scatter(dates, [1]*len(dates), alpha=0.5, label='Events')
    ax3.scatter(kde_anomalies['dormant'], [1]*len(kde_anomalies['dormant']), color='blue', s=100, marker='v', label='KDE Dormant')
    ax3.scatter(kde_anomalies['surge'], [1]*len(kde_anomalies['surge']), color='red', s=100, marker='^', label='KDE Surge')
    ax3.set_title('Kernel Density Estimation')
    ax3.legend()
    ax3.set_yticks([])
    
    for ax in [ax1, ax2, ax3]:
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    
    plt.tight_layout()
    plt.show()

# Example usage
dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
dates = dates[np.random.rand(len(dates)) > 0.7]  # Randomly select about 30% of the days

# Add some artificial surges and dormant periods
surge_dates = pd.date_range(start='2023-03-01', end='2023-03-10', freq='D')
dormant_dates = pd.date_range(start='2023-06-01', end='2023-06-20', freq='3D')
dates = dates.union(surge_dates).union(dormant_dates)

iet_anomalies = inter_event_time_analysis(dates)
swc_anomalies = sliding_window_count(dates)
kde_anomalies = kde_analysis(dates)

visualize_anomalies(dates, iet_anomalies, swc_anomalies, kde_anomalies)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

def compute_cumulative_sums_and_detect_surges(dates, window_size=7, surge_threshold=1.5):
    # Convert dates to datetime if they're not already
    dates = pd.to_datetime(dates)
    
    # Create a DataFrame with the dates and sort it
    df = pd.DataFrame({'date': dates}).sort_values('date')
    
    # Count events per day
    df['event_count'] = 1
    df = df.groupby('date').count().reset_index()
    
    # Ensure we have all dates in the range
    date_range = pd.date_range(start=df['date'].min(), end=df['date'].max())
    df = df.set_index('date').reindex(date_range, fill_value=0).reset_index()
    df = df.rename(columns={'index': 'date'})
    
    # Compute cumulative sum before each date
    df['cum_sum_before'] = df['event_count'].cumsum().shift(1, fill_value=0)
    
    # Compute cumulative sum including and after each date within the window
    df['cum_sum_after'] = df['event_count'].rolling(window=window_size, min_periods=1).sum()
    
    # Detect surges
    df['surge_ratio'] = df['cum_sum_after'] / (df['cum_sum_before'] + 1)  # Add 1 to avoid division by zero
    df['is_surge'] = df['surge_ratio'] > surge_threshold
    
    return df

def visualize_surges(df):
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10), sharex=True)
    
    # Plot cumulative sums
    ax1.plot(df['date'], df['cum_sum_before'], label='Cumulative Sum Before')
    ax1.plot(df['date'], df['cum_sum_after'], label='Cumulative Sum After (7-day window)')
    ax1.set_title('Cumulative Sums and Surge Detection')
    ax1.legend()
    
    # Highlight surge periods
    surge_periods = df[df['is_surge']]
    ax1.scatter(surge_periods['date'], surge_periods['cum_sum_after'], 
                color='red', s=50, zorder=5, label='Surge Detected')
    
    # Plot surge ratio
    ax2.plot(df['date'], df['surge_ratio'], label='Surge Ratio')
    ax2.axhline(y=surge_threshold, color='r', linestyle='--', label=f'Surge Threshold ({surge_threshold})')
    ax2.set_title('Surge Ratio')
    ax2.set_yscale('log')  # Use log scale for better visibility
    ax2.legend()
    
    # Format x-axis
    for ax in [ax1, ax2]:
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    
    plt.xlabel('Date')
    plt.tight_layout()
    plt.show()

# Example usage
np.random.seed(42)
date_range = pd.date_range(start='2023-01-01', end='2023-12-31')
dates = np.random.choice(date_range, size=500)  # 500 random events

# Add some artificial surge periods
surge_dates = pd.date_range(start='2023-03-01', end='2023-03-10')
surge_dates = np.concatenate([surge_dates, pd.date_range(start='2023-07-15', end='2023-07-20')])
dates = np.concatenate([dates, surge_dates])

# Compute cumulative sums and detect surges
surge_threshold = 1.5
result_df = compute_cumulative_sums_and_detect_surges(dates, surge_threshold=surge_threshold)

# Display the first few rows of the result
print(result_df.head(10))

# Display surge periods
print("\nSurge periods detected:")
print(result_df[result_df['is_surge']][['date', 'surge_ratio']])

# Visualize the results
visualize_surges(result_df)