In [1]:
import os
import random, math
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl
from torch_geometric.nn import GCNConv
from torch_geometric_temporal.nn.recurrent import TGCN, EvolveGCNH, A3TGCN
from torch_geometric.utils import dropout_edge
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import Trainer
from torch.utils.data import DataLoader, TensorDataset, Sampler
from collections import defaultdict

# Set environment variables for reproducibility and safety
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, accuracy_score

# 1. Configuration & Seeding
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
name = 'movie'
n_clusters = 4

## 1. Creating Temporal Features

In [3]:
interaction_df = pd.read_csv(f'./data/{name}/{name}_processed_interactions.csv')
interaction_df['timestamp'] = pd.to_datetime(interaction_df['timestamp'])
interaction_df = interaction_df.sort_values(by='timestamp').reset_index(drop=True)

interaction_df['year'] = interaction_df['timestamp'].dt.year
interaction_df['month'] = interaction_df['timestamp'].dt.month
interaction_df['week'] = interaction_df['timestamp'].dt.isocalendar().week
# Simple season mapping (1:Winter, 2:Spring, 3:Summer, 4:Fall)
interaction_df['season'] = interaction_df['month'].apply(lambda x: (x%12 + 3)//3)

### 1.1 Creating Temporal Statistical Features

In [4]:
# A. Handle Cyclical Features (Month, Week, Season)
# We transform "Month" into two dimensions: sin_month and cos_month.
# This places months on a unit circle.
def encode_cyclic(data, max_val):
    data_norm = 2 * np.pi * data / max_val
    return np.sin(data_norm), np.cos(data_norm)

In [5]:
interaction_df['month_sin'], interaction_df['month_cos'] = encode_cyclic(interaction_df['month'], 12)
interaction_df['week_sin'], interaction_df['week_cos'] = encode_cyclic(interaction_df['week'], 52)
interaction_df['season_sin'], interaction_df['season_cos'] = encode_cyclic(interaction_df['season'], 4)

feature_cols = [
    'season_sin', 'season_cos',
    'month_sin', 'month_cos', 
    'week_sin', 'week_cos',
]

f_stat = interaction_df[['timestamp'] + feature_cols] # dataframe
f_stat = f_stat.drop_duplicates()

print(f"f_stat.shape: {f_stat.shape}")    # Feature Matrix Shape: (5919, 7)
print(f'f_stat.columns: {f_stat.columns}')


f_stat.shape: (5231, 7)
f_stat.columns: Index(['timestamp', 'season_sin', 'season_cos', 'month_sin', 'month_cos',
       'week_sin', 'week_cos'],
      dtype='object')


### 1.2 Creating Temporal Structure Features

In [6]:
def calculate_structural_features(df, interaction_count_col='interaction_count', 
                                gaps=[90, 30, 7, 1]):
    """
    Calculates 1st and 2nd order structural features (z' and z'') for specified gaps.
    print(f_stat.tail(1))
    Parameters:
    - df: DataFrame containing time-series data.
    - interaction_count_col: Name of the column containing interaction counts (z(i)).
    - gaps: List of window sizes (default: [90, 30, 7, 1] for season, month, week, day).
    
    Returns:
    - DataFrame with new columns for each gap (z_prime and z_double_prime).
    """
    
    # We work on a copy to avoid Modifying the original dataframe
    result_df = df.copy()
    result_df = result_df.sort_values(by='timestamp').reset_index(drop=True)
    
    # Extract the base interaction series z(i)
    z = result_df[interaction_count_col]
    
    feature_list = []
    
    for gap in gaps:
        # --- 1. First-Order Structural Feature: z'_{gap}(t) ---
        # Formula: (Sum(current_window) - Sum(previous_window)) / gap
        
        # Calculate rolling sum for the current window [t-gap, t]
        # This corresponds to Sum_{i=t-gap}^{t} z(i)
        current_sum = z.rolling(window=gap, min_periods=gap).sum()
        
        # The previous window sum is just the current sum shifted by 'gap'
        # This corresponds to Sum_{i=t-2gap}^{t-gap} z(i)
        prev_sum = current_sum.shift(gap)
        
        # Calculate z'
        z_prime = (current_sum - prev_sum) / gap
        
        # Rename for storage
        z_prime_col_name = f'z_prime_{gap}'
        result_df[z_prime_col_name] = z_prime
        
        # --- 2. Second-Order Structural Feature: z''_{gap}(t) ---
        # Formula: (Sum(current_window_of_z') - Sum(previous_window_of_z')) / gap
        
        # Now we apply the same rolling logic to the z_prime series we just created
        current_sum_prime = z_prime.rolling(window=gap, min_periods=gap).sum()
        prev_sum_prime = current_sum_prime.shift(gap)

        # Calculate z''
        z_double_prime = (current_sum_prime - prev_sum_prime) / gap
        
        # Rename for storage
        z_double_prime_col_name = f'z_double_prime_{gap}'
        result_df[z_double_prime_col_name] = z_double_prime
        
        # --- 3. Padding (Handling Initial NaNs) ---
        # The paper states: "padding... with the nearest timestamp's temporal structural feature"
        # Because we used rolling windows, the beginning of the series will have NaNs.
        # We use backfill (bfill) to propagate the first valid observation backwards.
        result_df[z_prime_col_name] = result_df[z_prime_col_name].bfill()
        result_df[z_double_prime_col_name] = result_df[z_double_prime_col_name].bfill()
        
        # Keep track of feature names for the final concatenation
        feature_list.extend([z_prime_col_name, z_double_prime_col_name])

    # Return only the extracted features (concatenated as per Eq 2)
    return result_df[['timestamp'] + feature_list]

In [7]:
daily_interaction_count_df = interaction_df.groupby('timestamp').size().reset_index(name='interaction_count')
daily_interaction_count_df = daily_interaction_count_df.sort_values(by='timestamp').reset_index(drop=True)

# Extract Features
f_stru = calculate_structural_features(daily_interaction_count_df, 
                                        gaps=[90, 30, 7, 1])    #dataframe

# Check shape (Should have 9 columns: 1 for timestamp and 2 for each of the 4 gaps)
print(f"f_stru: {f_stru.shape}")    #Feature Matrix Shape: (5919, 9)
print(f"f_stru: {f_stru.columns}")    #Feature Matrix Shape: (5919, 9)


f_stru: (5231, 9)
f_stru: Index(['timestamp', 'z_prime_90', 'z_double_prime_90', 'z_prime_30',
       'z_double_prime_30', 'z_prime_7', 'z_double_prime_7', 'z_prime_1',
       'z_double_prime_1'],
      dtype='object')


### 1.3 Concat Temporal Statistical Features and Temporal Structure Features

In [8]:
f_all = pd.merge(f_stat, f_stru, on='timestamp', how='inner')

print(f'f_stat.shape: {f_stat.shape}')
print(f'f_stru.shape: {f_stru.shape}')
print(f'f_all.shape: {f_all.shape}')


f_stat.shape: (5231, 7)
f_stru.shape: (5231, 9)
f_all.shape: (5231, 15)


### 1.4 Clustering time

In [9]:
# # Apply K-Means
kmeans = KMeans(n_clusters= n_clusters, random_state=42, n_init=10)

f_all_feature_only = f_all.drop('timestamp', axis=1)
f_all['cluster_label'] = kmeans.fit_predict(f_all_feature_only)

print(f'f_all.shape: {f_all.shape}')
print(f'f_all.columns: {f_all.columns}')



f_all.shape: (5231, 16)
f_all.columns: Index(['timestamp', 'season_sin', 'season_cos', 'month_sin', 'month_cos',
       'week_sin', 'week_cos', 'z_prime_90', 'z_double_prime_90', 'z_prime_30',
       'z_double_prime_30', 'z_prime_7', 'z_double_prime_7', 'z_prime_1',
       'z_double_prime_1', 'cluster_label'],
      dtype='object')


In [10]:
temporal_interaction_df = pd.merge(interaction_df, f_all, on='timestamp', how='inner')
print(f'TCKG_df.shape: {temporal_interaction_df.shape}')
print(f'TCKG_df.columns: {temporal_interaction_df.columns}')


TCKG_df.shape: (61090, 32)
TCKG_df.columns: Index(['user_id', 'entity_id', 'timestamp', 'user_id:token', 'entity_id:token',
       'item_id:token', 'rating:float', 'year', 'month', 'week', 'season',
       'month_sin_x', 'month_cos_x', 'week_sin_x', 'week_cos_x',
       'season_sin_x', 'season_cos_x', 'season_sin_y', 'season_cos_y',
       'month_sin_y', 'month_cos_y', 'week_sin_y', 'week_cos_y', 'z_prime_90',
       'z_double_prime_90', 'z_prime_30', 'z_double_prime_30', 'z_prime_7',
       'z_double_prime_7', 'z_prime_1', 'z_double_prime_1', 'cluster_label'],
      dtype='object')


## 2. Create TCKG

In [12]:
graph_df = pd.read_csv(f'./data/{name}/{name}_processed_graph.csv')

temporal_interaction_df = temporal_interaction_df.rename(columns={'user_id': 'head_id', 
                                            'entity_id': 'tail_id',
                                            'user_id:token': 'head_id:token',
                                            'entity_id:token': 'tail_id:token'})

max_relation_id_in_graph = graph_df['relation_id'].max()

temporal_interaction_df['relation_id'] = temporal_interaction_df['cluster_label'] + max_relation_id_in_graph   # new relation_id
temporal_interaction_df['relation_id:token'] = 'interacted_' + temporal_interaction_df['cluster_label'].astype(str)  # new relation_id:token

temporal_interaction_df = temporal_interaction_df[['head_id', 'relation_id', 'tail_id',
                                'head_id:token', 'relation_id:token', 'tail_id:token']]
                            
TCKG_df = pd.concat([graph_df, temporal_interaction_df], ignore_index=True)
TCKG_df.to_csv(f'./data/{name}/{name}_TCKG.csv', index= False)
