In [129]:
import math
import os
import random
import warnings
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import MinMaxScaler


In [130]:
# Dataset name; interpolated into every ./data/{NAME}/... path read or written below.
NAME = 'book'
# Number of temporal clusters; used as the GaussianMixture component count
# and passed to TemporalFeatureCal.
N_CLUSTERS = 4

### 1. CREATE TEMPORAL STAT & STRUCTURE FEATURES 

In [131]:
class TemporalFeatureCal:
    """Build per-timestamp temporal features from an interaction log.

    Two feature families are produced and joined on ``timestamp``:

    * statistical features: sin/cos encodings of season, month and ISO week;
    * structural features: first- and second-order differences of rolling
      interaction-count sums, for several window sizes ("gaps").

    Parameters
    ----------
    interaction_df : pandas.DataFrame
        Interaction log with a ``timestamp`` column accepted by
        ``pd.to_datetime``.
        NOTE: the ``timestamp`` column of the caller's frame is converted to
        datetime *in place*. All other derived columns (month, week, season,
        encodings) are added to an internal, timestamp-sorted copy only.
    n_clusters : int
        Stored on the instance; no method of this class reads it.
    """

    def __init__(self, interaction_df, n_clusters):
        self.n_clusters = n_clusters

        # In-place on the caller's frame on purpose: code outside this class
        # merges on this column and needs it to be datetime.
        interaction_df['timestamp'] = pd.to_datetime(interaction_df['timestamp'])
        # sort_values returns a new frame, so everything below is private.
        self.interaction_df = interaction_df.sort_values(by='timestamp').reset_index(drop=True)

        timestamps = self.interaction_df['timestamp']
        self.interaction_df['month'] = timestamps.dt.month
        self.interaction_df['week'] = timestamps.dt.isocalendar().week

        # Season mapping (1: Dec-Feb, 2: Mar-May, 3: Jun-Aug, 4: Sep-Nov),
        # computed with vectorised arithmetic instead of a per-row lambda.
        self.interaction_df['season'] = (self.interaction_df['month'] % 12 + 3) // 3

    ################ Creating Temporal Statistical Features ################
    def _encode_cyclic(self, data, max_val):
        """Map a periodic quantity onto the unit circle.

        Parameters
        ----------
        data : array-like
            Values of the periodic quantity (e.g. month numbers).
        max_val : number
            Period length; ``data == max_val`` maps to the same angle as 0.

        Returns
        -------
        tuple
            ``(sin, cos)`` of ``2 * pi * data / max_val``.
        """
        # ISO week numbers arrive as pandas' nullable UInt32; cast first so the
        # encodings are plain float64 like every other feature column.
        if hasattr(data, 'astype'):
            data = data.astype('float64')
        angle = 2 * np.pi * data / max_val
        return np.sin(angle), np.cos(angle)

    def create_temporal_stat_features(self):
        """Return one row per distinct timestamp with its cyclic encodings.

        Adds ``*_sin`` / ``*_cos`` columns for month, week and season to the
        internal frame, then returns ``timestamp`` plus those six columns with
        duplicate rows removed.
        """
        frame = self.interaction_df
        frame['month_sin'], frame['month_cos'] = self._encode_cyclic(frame['month'], 12)
        # NOTE(review): ISO years can have 53 weeks; with a period of 52,
        # week 53 lands on the same angle as week 1. Confirm this is intended.
        frame['week_sin'], frame['week_cos'] = self._encode_cyclic(frame['week'], 52)
        frame['season_sin'], frame['season_cos'] = self._encode_cyclic(frame['season'], 4)

        feature_cols = [
            'season_sin', 'season_cos',
            'month_sin', 'month_cos',
            'week_sin', 'week_cos',
        ]

        return frame[['timestamp'] + feature_cols].drop_duplicates()
    ################ Creating Temporal Statistical Features ################

    ################ Creating Temporal Structure Features ################
    @staticmethod
    def _windowed_difference(series, gap):
        """(sum of the last ``gap`` rows - sum of the ``gap`` rows before them) / gap."""
        current_sum = series.rolling(window=gap, min_periods=gap).sum()
        # Shifting the rolling sum by `gap` rows yields the preceding window.
        return (current_sum - current_sum.shift(gap)) / gap

    def _calculate_structural_features(self, df, interaction_count_col='interaction_count',
                                       gaps=(90, 30, 7, 1)):
        """
        Calculates 1st and 2nd order structural features (z' and z'') for specified gaps.

        For every gap, z' is the difference between the count sum over the
        latest ``gap`` rows and the sum over the ``gap`` rows before them,
        divided by ``gap``. z'' applies the same operation to the un-padded
        z' series.

        Leading NaNs (rows without enough history) are back-filled from the
        first valid value. If a column has no valid value at all because the
        series is too short for that gap, it is filled with 0.0 and a warning
        is issued, so the result never contains NaN for that reason.

        Parameters:
        - df: DataFrame with a 'timestamp' column and the count column.
        - interaction_count_col: Name of the column containing interaction counts (z(i)).
        - gaps: Iterable of window sizes in rows (default: 90, 30, 7, 1).

        Returns:
        - DataFrame with 'timestamp' plus, for each gap,
          'z_prime_{gap}' and 'z_double_prime_{gap}'.
        """
        # Work on a sorted copy so the caller's frame is left untouched.
        result_df = df.copy().sort_values(by='timestamp').reset_index(drop=True)
        z = result_df[interaction_count_col]

        feature_list = []
        for gap in gaps:
            z_prime = self._windowed_difference(z, gap)
            # Second order is derived from the raw (not yet padded) z'.
            z_double_prime = self._windowed_difference(z_prime, gap)

            named_series = (
                (f'z_prime_{gap}', z_prime),
                (f'z_double_prime_{gap}', z_double_prime),
            )
            for col_name, values in named_series:
                # Pad the leading NaNs with the nearest later valid value.
                padded = values.bfill()
                if padded.isna().any():
                    # Nothing to pad from: the series is too short for this gap.
                    warnings.warn(
                        f"{col_name}: only {len(z)} rows, too few for gap={gap}; "
                        "filling with 0.0",
                        stacklevel=2,
                    )
                    padded = padded.fillna(0.0)
                result_df[col_name] = padded
                feature_list.append(col_name)

        # Return only the extracted features, in gap order.
        return result_df[['timestamp'] + feature_list]

    def create_temporal_structure_features(self):
        """Count interactions per distinct timestamp and derive z' / z'' from them."""
        # NOTE(review): rows are grouped by exact timestamp value and the
        # windows count rows, so a gap of 90 means 90 days only if timestamps
        # are dates and no day is missing. Confirm against the data.
        interaction_counts = (
            self.interaction_df.groupby('timestamp').size()
            .reset_index(name='interaction_count')
            .sort_values(by='timestamp')
            .reset_index(drop=True)
        )
        return self._calculate_structural_features(interaction_counts, gaps=(90, 30, 7, 1))
    ################ Creating Temporal Structure Features ################

    ################ Concat Temporal Stat & Structure Features ################
    def create_temporal_features(self):
        """Inner-join statistical and structural features on ``timestamp``."""
        f_stat = self.create_temporal_stat_features()
        f_stru = self.create_temporal_structure_features()
        return pd.merge(f_stat, f_stru, on='timestamp', how='inner')
    ################ Concat Temporal Stat & Structure Features ################


In [132]:
# Training interactions for the dataset selected by NAME in the config cell.
TRAIN_INTERACTION_CSV = f'./data/{NAME}/{NAME}_train_interactions.csv'
train_interaction_df = pd.read_csv(TRAIN_INTERACTION_CSV)

# Parse timestamps explicitly. The TCKG cell merges this frame with the
# datetime-typed feature table on 'timestamp'; doing the conversion here makes
# that lineage visible instead of relying on TemporalFeatureCal converting the
# column as a side effect of its constructor.
train_interaction_df['timestamp'] = pd.to_datetime(train_interaction_df['timestamp'])

train_temporal = TemporalFeatureCal(train_interaction_df, n_clusters=N_CLUSTERS)

# One row per distinct timestamp: cyclic calendar encodings plus z' / z''.
train_f_all = train_temporal.create_temporal_features()
print(train_f_all.columns)

Index(['timestamp', 'season_sin', 'season_cos', 'month_sin', 'month_cos',
       'week_sin', 'week_cos', 'z_prime_90', 'z_double_prime_90', 'z_prime_30',
       'z_double_prime_30', 'z_prime_7', 'z_double_prime_7', 'z_prime_1',
       'z_double_prime_1'],
      dtype='object')


### 2. APPLY GMM CLUSTERING

In [133]:
############## Apply GMM
# Pick the model inputs by excluding the timestamp AND this cell's own outputs.
# Dropping only 'timestamp' would feed the previous run's cluster_label /
# prob_cluster_* columns back in as features when the cell is re-executed.
feature_columns = [
    col for col in train_f_all.columns
    if col not in ('timestamp', 'cluster_label') and not col.startswith('prob_cluster_')
]
train_f_all_feature_only = train_f_all[feature_columns]

gmm = GaussianMixture(n_components=N_CLUSTERS, random_state=42, n_init=10)

# Hard assignment: one component label per timestamp row.
train_f_all['cluster_label'] = gmm.fit_predict(train_f_all_feature_only)

# Soft assignment: one membership probability per (row, component).
W_matrix = gmm.predict_proba(train_f_all_feature_only)
for i in range(N_CLUSTERS):
    train_f_all[f'prob_cluster_{i}'] = W_matrix[:, i]

print(f'train_f_all.shape: {train_f_all.shape}')
print(f'train_f_all.columns: {train_f_all.columns}')



train_f_all.shape: (5413, 20)
train_f_all.columns: Index(['timestamp', 'season_sin', 'season_cos', 'month_sin', 'month_cos',
       'week_sin', 'week_cos', 'z_prime_90', 'z_double_prime_90', 'z_prime_30',
       'z_double_prime_30', 'z_prime_7', 'z_double_prime_7', 'z_prime_1',
       'z_double_prime_1', 'cluster_label', 'prob_cluster_0', 'prob_cluster_1',
       'prob_cluster_2', 'prob_cluster_3'],
      dtype='object')


### 3. CREATE TCKG

In [134]:
# Static knowledge-graph triples that the temporal interaction edges are appended to.
graph_df = pd.read_csv(f'./data/{NAME}/{NAME}_processed_static_graph.csv')

# Attach the cluster label of each interaction's timestamp.
# - The left key is converted to datetime here (a no-op if it already is), so
#   the join does not depend on an earlier cell having mutated the frame.
# - validate='many_to_one' raises if train_f_all ever holds duplicate
#   timestamps, which would otherwise silently duplicate interaction edges.
temporal_train_interaction_df = pd.merge(
    train_interaction_df.assign(timestamp=pd.to_datetime(train_interaction_df['timestamp'])),
    train_f_all[['timestamp', 'cluster_label']],
    on='timestamp', how='inner', validate='many_to_one',
)

# Express interactions in the graph's (head, relation, tail) vocabulary.
temporal_train_interaction_df = temporal_train_interaction_df.rename(columns={
    'user_id': 'head_id',
    'entity_id': 'tail_id',
    'user_id:token': 'head_id:token',
    'entity_id:token': 'tail_id:token',
})

max_relation_id_in_graph = graph_df['relation_id'].max()

# One new relation per cluster. Cluster labels start at 0, so the +1 makes the
# first new id one past the largest id already used by the static graph.
temporal_train_interaction_df['relation_id'] = (
    temporal_train_interaction_df['cluster_label'] + max_relation_id_in_graph + 1
)
temporal_train_interaction_df['relation_id:token'] = (
    'interacted_' + temporal_train_interaction_df['cluster_label'].astype(str)
)

# Keep exactly the static graph's columns, in its order, so the frames stack cleanly.
TCKG_COLUMN_NAMES = graph_df.columns
print(f'TCKG_COLUMN_NAMES: {TCKG_COLUMN_NAMES}')
temporal_train_interaction_df = temporal_train_interaction_df[TCKG_COLUMN_NAMES]

TCKG_df = pd.concat([graph_df, temporal_train_interaction_df], ignore_index=True)
TCKG_df = TCKG_df.sort_values(by=['relation_id', 'head_id', 'tail_id'])
TCKG_df.to_csv(f'./data/{NAME}/{NAME}_TCKG.csv', index=False)


TCKG_COLUMN_NAMES: Index(['head_id', 'relation_id', 'tail_id', 'head_id:token',
       'relation_id:token', 'tail_id:token'],
      dtype='object')


In [135]:
# NOTE(review): disabled export of a per-interaction train set that carries the
# cluster columns. It would fail if re-enabled as written: the TCKG cell has
# already restricted temporal_train_interaction_df to TCKG_COLUMN_NAMES, and
# only cluster_label (never prob_cluster_*) was merged into that frame.
# Either rebuild the frame with those columns first, or delete this cell.

# COLUMN_NAMES = ['head_id', 'relation_id', 'tail_id',
#                                 'head_id:token', 'relation_id:token', 'tail_id:token']
# COLUMN_NAMES.append('cluster_label')
# for i in range(N_CLUSTERS):
#     COLUMN_NAMES.append(f'prob_cluster_{i}')

# print(COLUMN_NAMES)

# temporal_train_interaction_df = temporal_train_interaction_df[COLUMN_NAMES]
# temporal_train_interaction_df.to_csv(f'./data/{NAME}/{NAME}_train_set.csv', index= False)