In [2]:
import os
import random, math
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch_geometric.nn import GCNConv
from torch_geometric_temporal.nn.recurrent import TGCN, EvolveGCNH, A3TGCN
from torch_geometric.utils import dropout_edge
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader, TensorDataset, Sampler
from collections import defaultdict

# Runtime environment tweaks applied before any model code runs.
# NOTE(review): per the PyTorch CUDA notes, CUDA_LAUNCH_BLOCKING=1 makes kernel
# launches synchronous -- a debugging aid rather than a reproducibility setting,
# and it is expected to slow GPU execution. Confirm it is still wanted here.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
# Installs a blanket 'ignore' filter: warnings raised after this point are hidden.
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score

# 1. Configuration & Seeding
# A single shared seed drives every random number generator seeded below.
SEED = 42

# Seed the Python, NumPy and PyTorch generators in turn.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# The CUDA generators are only seeded explicitly when a GPU is reported available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Creating Temporal Features

In [None]:
# Dataset to load; selects both the sub-directory and the file prefix under ./data.
# NOTE(review): `type` shadows the Python builtin of the same name for the rest of
# the session. The name is kept because cells outside this view may read it; prefer
# renaming it (e.g. `dataset_name`) once every usage is known.
type = 'book'

interaction_df = pd.read_csv(f'./data/{type}/{type}_processed_interactions.csv')
interaction_df['timestamp'] = pd.to_datetime(interaction_df['timestamp'])

# Calendar components derived from each interaction's timestamp.
interaction_df['year'] = interaction_df['timestamp'].dt.year
interaction_df['month'] = interaction_df['timestamp'].dt.month
interaction_df['week'] = interaction_df['timestamp'].dt.isocalendar().week
# Fixed: this column used `.dt.month`, which made `day` a duplicate of `month`.
interaction_df['day'] = interaction_df['timestamp'].dt.day

### 1.1 Creating Temporal Statistical Features

In [None]:
# A. Handle Cyclical Features (Month, Week, Season)
# We transform "Month" into two dimensions: sin_month and cos_month.
# This places months on a unit circle.
def encode_cyclic(data, max_val):
    """Project a periodic quantity onto the unit circle.

    Parameters:
    - data: Value(s) to encode (scalar, NumPy array or pandas Series).
    - max_val: Length of one full cycle, e.g. 12 for months.

    Returns:
    - Tuple (sin_component, cos_component) of the angle 2*pi*data/max_val.
    """
    angle = (2 * np.pi * data) / max_val
    sin_component = np.sin(angle)
    cos_component = np.cos(angle)
    return sin_component, cos_component

# Simple season mapping (1:Winter, 2:Spring, 3:Summer, 4:Fall)
# Dec/Jan/Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
# Vectorized arithmetic replaces the row-wise `apply(lambda ...)`; same values.
interaction_df['season'] = (interaction_df['month'] % 12 + 3) // 3

# Cyclical encodings: each periodic quantity becomes a (sin, cos) pair.
interaction_df['month_sin'], interaction_df['month_cos'] = encode_cyclic(interaction_df['month'], 12)
# ISO week numbers run 1..53; with a period of 52, week 53 would land exactly on
# week 1 and the two would be indistinguishable.
interaction_df['week_sin'], interaction_df['week_cos'] = encode_cyclic(interaction_df['week'], 53)
interaction_df['season_sin'], interaction_df['season_cos'] = encode_cyclic(interaction_df['season'], 4)

feature_cols = [
    'month_sin', 'month_cos',
    'week_sin', 'week_cos',
    'season_sin', 'season_cos'
]

# Statistical temporal feature matrix: one row per interaction.
f_stat = interaction_df[feature_cols]

# Apply K-Means; the cluster id of each interaction is stored as `fstat_label`.
# Uses the notebook-wide SEED instead of a second hard-coded 42.
kmeans = KMeans(n_clusters=8, random_state=SEED, n_init=10)
interaction_df['fstat_label'] = kmeans.fit_predict(f_stat)

# ---------------------------------------------------------
# Visualizing the clusters (Month vs Season only, for simplicity)
# ---------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    interaction_df['month'],
    interaction_df['season'],
    c=interaction_df['fstat_label'],
    cmap='viridis',
    s=50,
    alpha=0.6,
)
ax.set_xlabel('Month')
# Fixed: the y-axis shows the season column; it was mislabelled 'Year'.
ax.set_ylabel('Season')
ax.set_title('Clustering Results on Temporal Features')
fig.colorbar(scatter, ax=ax, label='Cluster ID')
ax.grid(True, alpha=0.3)
plt.show()

### 1.2 Creating Temporal Structural Features

In [None]:
# Number of interactions sharing each row's `day` value, broadcast back to every row.
# Fixed: `groupby` is a method and must be called; subscripting it
# (`groupby['day']`) raises TypeError.
# NOTE(review): if a per-calendar-date count is what the structural features need
# as z(i), group by the normalized timestamp instead of `day` -- confirm intent.
interaction_df['day_count'] = interaction_df.groupby('day')['day'].transform('size')

In [None]:
def _windowed_difference(series, gap):
    """Return (sum of the latest `gap` rows - sum of the `gap` rows before them) / gap."""
    # Rolling sum over the `gap` most recent rows; NaN until `gap` valid rows exist.
    window_sum = series.rolling(window=gap, min_periods=gap).sum()
    # Shifting by `gap` rows yields the sum of the immediately preceding window.
    return (window_sum - window_sum.shift(gap)) / gap


def calculate_structural_features(df, interaction_col='interactions', gaps=(90, 30, 7, 1)):
    """
    Calculates 1st and 2nd order structural features (z' and z'') for specified gaps.

    For every gap g, z'_g(t) is the difference between the sum of the g most recent
    rows of z and the sum of the g rows before them, divided by g. z''_g(t) applies
    the same operation to the (unpadded) z'_g series. Windows are counted in rows,
    not in calendar time, so the input is expected to hold one row per time step.

    Parameters:
    - df: DataFrame containing time-series data. It is not modified.
    - interaction_col: Name of the column containing interaction counts (z(i)).
    - gaps: Iterable of positive integer window sizes
      (default: (90, 30, 7, 1) for season, month, week, day).

    Returns:
    - DataFrame indexed by the sorted index of `df`, containing only the feature
      columns `z_prime_<gap>` and `z_double_prime_<gap>` for each gap, in the
      order the gaps were given. Leading NaNs are back-filled with the first
      valid value; a column stays all-NaN when the series is too short for its gap.

    Raises:
    - KeyError: if `interaction_col` is not a column of `df`.
    - ValueError: if any gap is not a positive integer.
    """
    # Materialize once so generators/arrays can be validated and then iterated.
    gaps = list(gaps)
    for gap in gaps:
        is_integer = isinstance(gap, (int, np.integer)) and not isinstance(gap, bool)
        if not is_integer or gap < 1:
            # A zero gap would otherwise produce 0/0 = NaN columns without any error.
            raise ValueError(f"gaps must be positive integers, got {gap!r}")

    # Only the interaction series is needed; sorting returns a new object, so the
    # caller's frame is left untouched without copying every column.
    z = df[interaction_col].sort_index()

    features = pd.DataFrame(index=z.index)
    feature_list = []

    for gap in gaps:
        z_prime_col_name = f'z_prime_{gap}'
        z_double_prime_col_name = f'z_double_prime_{gap}'

        # First-order feature z'_{gap}(t), then the second-order feature
        # z''_{gap}(t) obtained by applying the same operator to z'.
        z_prime = _windowed_difference(z, gap)
        z_double_prime = _windowed_difference(z_prime, gap)

        # Padding: rolling windows leave NaNs at the start of each series; back-fill
        # propagates the first valid observation backwards over them.
        features[z_prime_col_name] = z_prime.bfill()
        features[z_double_prime_col_name] = z_double_prime.bfill()

        # Keep track of feature names for the final column selection.
        feature_list.extend([z_prime_col_name, z_double_prime_col_name])

    # Return only the extracted features, in gap order.
    return features[feature_list]

# --- Example Usage ---

# 1. Synthetic input: one value per day across calendar year 2023
dates = pd.date_range(start='2023-01-01', end='2023-12-31', freq='D')
n_days = len(dates)
# Upward linear trend (10 -> 100) with Gaussian noise (std 5) layered on top
trend = np.linspace(10, 100, n_days)
noise = np.random.normal(0, 5, n_days)
interactions = trend + noise
df = pd.DataFrame({'date': dates, 'interactions': interactions}).set_index('date')

# 2. Run the structural feature extraction on the synthetic series
f_stru = calculate_structural_features(df, gaps=[90, 30, 7, 1])

# 3. Show the last rows of the input and of the extracted features
print("Input Data (Tail):", df.tail(5), sep="\n")
print("\nExtracted Structural Features (Tail):", f_stru.tail(5), sep="\n")

# Shape check: 8 columns are expected, i.e. 2 features for each of the 4 gaps
print(f"\nFeature Matrix Shape: {f_stru.shape}")

SyntaxError: unterminated string literal (detected at line 1) (181905961.py, line 1)