In [90]:
import random, math, os 
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, accuracy_score


In [91]:
name = 'book'
n_clusters = 4

In [None]:
import os
import pandas as pd
import joblib
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler

class TemporalClusteringPipeline:
    def __init__(self, interaction_csv_path, n_clusters, feature_cols):
        self.n_clusters = n_clusters
        self.feature_cols = feature_cols
        self.interaction_csv_path = interaction_csv_path

        self.interaction_df = pd.read_csv(self.interaction_csv_path )
        self.interaction_df['timestamp'] = pd.to_datetime(self.interaction_df['timestamp'])
        self.interaction_df = self.interaction_df.sort_values(by='timestamp').reset_index(drop=True)

        # train_interaction_df['year'] = train_interaction_df['timestamp'].dt.year
        self.interaction_df['month'] = self.interaction_df['timestamp'].dt.month
        self.interaction_df['week'] = self.interaction_df['timestamp'].dt.isocalendar().week

        # Simple season mapping (1:Winter, 2:Spring, 3:Summer, 4:Fall)
        self.interaction_df['season'] = self.interaction_df['month'].apply(lambda x: (x%12 + 3)//3)
    

    ################ Creating Temporal Statistical Features ################
    def _encode_cyclic(data, max_val):
        data_norm = 2 * np.pi * data / max_val
        return np.sin(data_norm), np.cos(data_norm)
    
    def create_temporal_stat_features(self):
        self.interaction_df['month_sin'], self.interaction_df['month_cos'] = self._encode_cyclic(self.interaction_df['month'], 12)
        self.interaction_df['week_sin'], self.interaction_df['week_cos'] = self._encode_cyclic(self.interaction_df['week'], 52)
        self.interaction_df['season_sin'], self.interaction_df['season_cos'] = self._encode_cyclic(self.interaction_df['season'], 4)

        feature_cols = [
            'season_sin', 'season_cos',
            'month_sin', 'month_cos', 
            'week_sin', 'week_cos',
        ]

        f_stat = self.interaction_df[['timestamp'] + feature_cols] # dataframe
        f_stat = f_stat.drop_duplicates()

        return f_stat
    ################ Creating Temporal Statistical Features ################
    

    ################ Creating Temporal Structure Features ################
    def _calculate_structural_features(df, interaction_count_col='interaction_count', 
                                gaps=[90, 30, 7, 1]):
        """
        Calculates 1st and 2nd order structural features (z' and z'') for specified gaps.
        print(f_stat.tail(1))
        Parameters:
        - df: DataFrame containing time-series data.
        - interaction_count_col: Name of the column containing interaction counts (z(i)).
        - gaps: List of window sizes (default: [90, 30, 7, 1] for season, month, week, day).
        
        Returns:
        - DataFrame with new columns for each gap (z_prime and z_double_prime).
        """
        
        # We work on a copy to avoid Modifying the original dataframe
        result_df = df.copy()
        result_df = result_df.sort_values(by='timestamp').reset_index(drop=True)
        
        # Extract the base interaction series z(i)
        z = result_df[interaction_count_col]
        
        feature_list = []
        
        for gap in gaps:
            # --- 1. First-Order Structural Feature: z'_{gap}(t) ---
            # Formula: (Sum(current_window) - Sum(previous_window)) / gap
            
            # Calculate rolling sum for the current window [t-gap, t]
            # This corresponds to Sum_{i=t-gap}^{t} z(i)
            current_sum = z.rolling(window=gap, min_periods=gap).sum()
            
            # The previous window sum is just the current sum shifted by 'gap'
            # This corresponds to Sum_{i=t-2gap}^{t-gap} z(i)
            prev_sum = current_sum.shift(gap)
            
            # Calculate z'
            z_prime = (current_sum - prev_sum) / gap
            
            # Rename for storage
            z_prime_col_name = f'z_prime_{gap}'
            result_df[z_prime_col_name] = z_prime
            
            # --- 2. Second-Order Structural Feature: z''_{gap}(t) ---
            # Formula: (Sum(current_window_of_z') - Sum(previous_window_of_z')) / gap
            
            # Now we apply the same rolling logic to the z_prime series we just created
            current_sum_prime = z_prime.rolling(window=gap, min_periods=gap).sum()
            prev_sum_prime = current_sum_prime.shift(gap)

            # Calculate z''
            z_double_prime = (current_sum_prime - prev_sum_prime) / gap
            
            # Rename for storage
            z_double_prime_col_name = f'z_double_prime_{gap}'
            result_df[z_double_prime_col_name] = z_double_prime
            
            # --- 3. Padding (Handling Initial NaNs) ---
            # The paper states: "padding... with the nearest timestamp's temporal structural feature"
            # Because we used rolling windows, the beginning of the series will have NaNs.
            # We use backfill (bfill) to propagate the first valid observation backwards.
            result_df[z_prime_col_name] = result_df[z_prime_col_name].bfill()
            result_df[z_double_prime_col_name] = result_df[z_double_prime_col_name].bfill()
            
            # Keep track of feature names for the final concatenation
            feature_list.extend([z_prime_col_name, z_double_prime_col_name])

        # Return only the extracted features (concatenated as per Eq 2)
        return result_df[['timestamp'] + feature_list]
    
    def create_temporal_structure_features(self):
        daily_interaction_count_df = self.interaction_df.groupby('timestamp').size().reset_index(name='interaction_count')
        daily_interaction_count_df = daily_interaction_count_df.sort_values(by='timestamp').reset_index(drop=True)

        # Extract Features
        f_stru = self._calculate_structural_features(daily_interaction_count_df, 
                                                gaps=[90, 30, 7, 1])    #dataframe
        return f_stru
    ################ Creating Temporal Structure Features ################

    ################ Concat Temporal Stat & Structure Features ################
    def create_temporal_features(self):
        f_stat = self.create_temporal_stat_features()
        f_stru = self.create_temporal_structure_features()

        f_all = pd.merge(f_stat, f_stru, on='timestamp', how='inner')

        return f_all
    ################ Concat Temporal Stat & Structure Features ################

    def process_train(self, train_csv_path, output_csv_path):
        """
        Xử lý tập TRAIN: Fit Scaler, Fit GMM, trích xuất xác suất và lưu Model.
        """
        print(f"⏳ Đang xử lý tập TRAIN: {train_csv_path}...")
        df_train = pd.read_csv(train_csv_path)
        
        # 1. Trích xuất đúng các cột thời gian
        X_train = df_train[self.feature_cols]
        
        # 2. Fit và Transform Scaler
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        
        # 3. Khởi tạo và Fit GMM
        gmm = GaussianMixture(n_components=self.n_clusters, random_state=42, n_init=10)
        gmm.fit(X_train_scaled)
        
        # 4. Lấy Label và Ma trận Xác suất
        df_train['cluster_label'] = gmm.predict(X_train_scaled)
        W_matrix = gmm.predict_proba(X_train_scaled)
        
        # 5. Đẩy xác suất vào từng cột
        for i in range(self.n_clusters):
            df_train[f'prob_cluster_{i}'] = W_matrix[:, i]
            
        # 6. LƯU LẠI SCALER VÀ GMM ĐỂ DÙNG CHO VALIDATION
        joblib.dump(scaler, self.scaler_path)
        joblib.dump(gmm, self.gmm_path)
        
        # 7. Xuất ra file CSV mới
        df_train.to_csv(output_csv_path, index=False)
        print(f"✅ Đã hoàn tất Train! Model được lưu tại: {self.model_save_dir}")
        print(f"✅ Đã xuất dữ liệu Train mới ra: {output_csv_path}\n")
        
        return df_train

    def process_validation(self, val_csv_path, output_csv_path):
        """
        Xử lý tập VALIDATION/TEST: Chỉ Load Model, Transform và trích xuất xác suất.
        """
        print(f"⏳ Đang xử lý tập VALIDATION: {val_csv_path}...")
        
        # 1. Kiểm tra xem model đã được train chưa
        if not os.path.exists(self.scaler_path) or not os.path.exists(self.gmm_path):
            raise FileNotFoundError("Chưa tìm thấy Scaler hoặc GMM model! Hãy chạy process_train() trước.")
            
        df_val = pd.read_csv(val_csv_path)
        X_val = df_val[self.feature_cols]
        
        # 2. LOAD LẠI SCALER VÀ GMM
        scaler = joblib.load(self.scaler_path)
        gmm = joblib.load(self.gmm_path)
        
        # 3. CHỈ TRANSFORM (Tuyệt đối không dùng fit)
        X_val_scaled = scaler.transform(X_val)
        
        # 4. Dự đoán Label và Ma trận Xác suất
        df_val['cluster_label'] = gmm.predict(X_val_scaled)
        W_matrix = gmm.predict_proba(X_val_scaled)
        
        # 5. Đẩy xác suất vào từng cột
        for i in range(self.n_clusters):
            df_val[f'prob_cluster_{i}'] = W_matrix[:, i]
            
        # 6. Xuất ra file CSV mới
        df_val.to_csv(output_csv_path, index=False)
        print(f"✅ Đã hoàn tất Validation! Dữ liệu xuất ra: {output_csv_path}\n")
        
        return df_val

## 1. Creating Temporal Features

In [92]:
train_interaction_df = pd.read_csv(f'./data/{name}/{name}_train_interactions.csv')
train_interaction_df['timestamp'] = pd.to_datetime(train_interaction_df['timestamp'])
train_interaction_df = train_interaction_df.sort_values(by='timestamp').reset_index(drop=True)

# train_interaction_df['year'] = train_interaction_df['timestamp'].dt.year
train_interaction_df['month'] = train_interaction_df['timestamp'].dt.month
train_interaction_df['week'] = train_interaction_df['timestamp'].dt.isocalendar().week

# Simple season mapping (1:Winter, 2:Spring, 3:Summer, 4:Fall)
train_interaction_df['season'] = train_interaction_df['month'].apply(lambda x: (x%12 + 3)//3)

### 1.1 Creating Temporal Statistical Features

In [93]:
# A. Handle Cyclical Features (Month, Week, Season)
# We transform "Month" into two dimensions: sin_month and cos_month.
# This places months on a unit circle.
def encode_cyclic(data, max_val):
    data_norm = 2 * np.pi * data / max_val
    return np.sin(data_norm), np.cos(data_norm)

In [94]:
train_interaction_df['month_sin'], train_interaction_df['month_cos'] = encode_cyclic(train_interaction_df['month'], 12)
train_interaction_df['week_sin'], train_interaction_df['week_cos'] = encode_cyclic(train_interaction_df['week'], 52)
train_interaction_df['season_sin'], train_interaction_df['season_cos'] = encode_cyclic(train_interaction_df['season'], 4)

feature_cols = [
    'season_sin', 'season_cos',
    'month_sin', 'month_cos', 
    'week_sin', 'week_cos',
]

f_stat = train_interaction_df[['timestamp'] + feature_cols] # dataframe
f_stat = f_stat.drop_duplicates()

print(f"f_stat.shape: {f_stat.shape}")    # Feature Matrix Shape: (5919, 7)
print(f'f_stat.columns: {f_stat.columns}')


f_stat.shape: (5413, 7)
f_stat.columns: Index(['timestamp', 'season_sin', 'season_cos', 'month_sin', 'month_cos',
       'week_sin', 'week_cos'],
      dtype='object')


### 1.2 Creating Temporal Structure Features

In [95]:
def calculate_structural_features(df, interaction_count_col='interaction_count', 
                                gaps=[90, 30, 7, 1]):
    """
    Calculates 1st and 2nd order structural features (z' and z'') for specified gaps.
    print(f_stat.tail(1))
    Parameters:
    - df: DataFrame containing time-series data.
    - interaction_count_col: Name of the column containing interaction counts (z(i)).
    - gaps: List of window sizes (default: [90, 30, 7, 1] for season, month, week, day).
    
    Returns:
    - DataFrame with new columns for each gap (z_prime and z_double_prime).
    """
    
    # We work on a copy to avoid Modifying the original dataframe
    result_df = df.copy()
    result_df = result_df.sort_values(by='timestamp').reset_index(drop=True)
    
    # Extract the base interaction series z(i)
    z = result_df[interaction_count_col]
    
    feature_list = []
    
    for gap in gaps:
        # --- 1. First-Order Structural Feature: z'_{gap}(t) ---
        # Formula: (Sum(current_window) - Sum(previous_window)) / gap
        
        # Calculate rolling sum for the current window [t-gap, t]
        # This corresponds to Sum_{i=t-gap}^{t} z(i)
        current_sum = z.rolling(window=gap, min_periods=gap).sum()
        
        # The previous window sum is just the current sum shifted by 'gap'
        # This corresponds to Sum_{i=t-2gap}^{t-gap} z(i)
        prev_sum = current_sum.shift(gap)
        
        # Calculate z'
        z_prime = (current_sum - prev_sum) / gap
        
        # Rename for storage
        z_prime_col_name = f'z_prime_{gap}'
        result_df[z_prime_col_name] = z_prime
        
        # --- 2. Second-Order Structural Feature: z''_{gap}(t) ---
        # Formula: (Sum(current_window_of_z') - Sum(previous_window_of_z')) / gap
        
        # Now we apply the same rolling logic to the z_prime series we just created
        current_sum_prime = z_prime.rolling(window=gap, min_periods=gap).sum()
        prev_sum_prime = current_sum_prime.shift(gap)

        # Calculate z''
        z_double_prime = (current_sum_prime - prev_sum_prime) / gap
        
        # Rename for storage
        z_double_prime_col_name = f'z_double_prime_{gap}'
        result_df[z_double_prime_col_name] = z_double_prime
        
        # --- 3. Padding (Handling Initial NaNs) ---
        # The paper states: "padding... with the nearest timestamp's temporal structural feature"
        # Because we used rolling windows, the beginning of the series will have NaNs.
        # We use backfill (bfill) to propagate the first valid observation backwards.
        result_df[z_prime_col_name] = result_df[z_prime_col_name].bfill()
        result_df[z_double_prime_col_name] = result_df[z_double_prime_col_name].bfill()
        
        # Keep track of feature names for the final concatenation
        feature_list.extend([z_prime_col_name, z_double_prime_col_name])

    # Return only the extracted features (concatenated as per Eq 2)
    return result_df[['timestamp'] + feature_list]

In [96]:
daily_interaction_count_df = train_interaction_df.groupby('timestamp').size().reset_index(name='interaction_count')
daily_interaction_count_df = daily_interaction_count_df.sort_values(by='timestamp').reset_index(drop=True)

# Extract Features
f_stru = calculate_structural_features(daily_interaction_count_df, 
                                        gaps=[90, 30, 7, 1])    #dataframe

# Check shape (Should have 9 columns: 1 for timestamp and 2 for each of the 4 gaps)
print(f"f_stru: {f_stru.shape}")    #Feature Matrix Shape: (5919, 9)
print(f"f_stru: {f_stru.columns}")    #Feature Matrix Shape: (5919, 9)


f_stru: (5413, 9)
f_stru: Index(['timestamp', 'z_prime_90', 'z_double_prime_90', 'z_prime_30',
       'z_double_prime_30', 'z_prime_7', 'z_double_prime_7', 'z_prime_1',
       'z_double_prime_1'],
      dtype='object')


### 1.3 Concat Temporal Statistical Features and Temporal Structure Features

In [97]:
f_all = pd.merge(f_stat, f_stru, on='timestamp', how='inner')

print(f'f_stat.shape: {f_stat.shape}')
print(f'f_stru.shape: {f_stru.shape}')
print(f'f_all.shape: {f_all.shape}')


f_stat.shape: (5413, 7)
f_stru.shape: (5413, 9)
f_all.shape: (5413, 15)


### 1.4 Clustering time

In [None]:
# # # Apply K-Means
# kmeans = KMeans(n_clusters= n_clusters, random_state=42, n_init=10)

# f_all_feature_only = f_all.drop('timestamp', axis=1)
# f_all['cluster_label'] = kmeans.fit_predict(f_all_feature_only)

# print(f'f_all.shape: {f_all.shape}')
# print(f'f_all.columns: {f_all.columns}')


############## Apply GMM
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=n_clusters, random_state=42, n_init=10)

f_all_feature_only = f_all.drop('timestamp', axis=1)
f_all['cluster_label'] = gmm.fit_predict(f_all_feature_only)    # Return that cluster_label having highest probaililty

W_matrix = gmm.predict_proba(f_all_feature_only)      # Returen probability of each clusters,
for i in range(n_clusters):
    f_all[f'prob_cluster_{i}'] = W_matrix[:, i]

print(f'f_all.shape: {f_all.shape}')
print(f'f_all.columns: {f_all.columns}')



f_all.shape: (5413, 20)
f_all.columns: Index(['timestamp', 'season_sin', 'season_cos', 'month_sin', 'month_cos',
       'week_sin', 'week_cos', 'z_prime_90', 'z_double_prime_90', 'z_prime_30',
       'z_double_prime_30', 'z_prime_7', 'z_double_prime_7', 'z_prime_1',
       'z_double_prime_1', 'cluster_label', 'prob_cluster_0', 'prob_cluster_1',
       'prob_cluster_2', 'prob_cluster_3'],
      dtype='object')


In [99]:
temporal_train_interaction_df = pd.merge(train_interaction_df, f_all, on='timestamp', how='inner')
print(f'temporal_train_interaction_df.shape: {temporal_train_interaction_df.shape}')
print(f'temporal_train_interaction_df.columns: {temporal_train_interaction_df.columns}')


temporal_train_interaction_df.shape: (37513, 34)
temporal_train_interaction_df.columns: Index(['user_id', 'entity_id', 'timestamp', 'user_id:token', 'entity_id:token',
       'item_id:token', 'month', 'week', 'season', 'month_sin_x',
       'month_cos_x', 'week_sin_x', 'week_cos_x', 'season_sin_x',
       'season_cos_x', 'season_sin_y', 'season_cos_y', 'month_sin_y',
       'month_cos_y', 'week_sin_y', 'week_cos_y', 'z_prime_90',
       'z_double_prime_90', 'z_prime_30', 'z_double_prime_30', 'z_prime_7',
       'z_double_prime_7', 'z_prime_1', 'z_double_prime_1', 'cluster_label',
       'prob_cluster_0', 'prob_cluster_1', 'prob_cluster_2', 'prob_cluster_3'],
      dtype='object')


## 2. Create TCKG

In [None]:
graph_df = pd.read_csv(f'./data/{name}/{name}_processed_static_graph.csv')

temporal_train_interaction_df = temporal_train_interaction_df.rename(columns={'user_id': 'head_id', 
                                            'entity_id': 'tail_id',
                                            'user_id:token': 'head_id:token',
                                            'entity_id:token': 'tail_id:token'})

max_relation_id_in_graph = graph_df['relation_id'].max()

temporal_train_interaction_df['relation_id'] = temporal_train_interaction_df['cluster_label'] \
                                                + max_relation_id_in_graph + 1  # new relation_id. +1 because relation_id starts at 1, not 0
temporal_train_interaction_df['relation_id:token'] = 'interacted_' + temporal_train_interaction_df['cluster_label'].astype(str)  # new relation_id:token


COLUMN_NAMES = ['head_id', 'relation_id', 'tail_id',
                                'head_id:token', 'relation_id:token', 'tail_id:token']

df1 = temporal_train_interaction_df[COLUMN_NAMES]

                            
TCKG_df = pd.concat([graph_df, df1], ignore_index=True)
TCKG_df = TCKG_df.sort_values(by=['relation_id', 'head_id', 'tail_id'])
TCKG_df.to_csv(f'./data/{name}/{name}_TCKG.csv', index= False)


['head_id', 'relation_id', 'tail_id', 'head_id:token', 'relation_id:token', 'tail_id:token', 'cluster_label', 'prob_cluster_0', 'prob_cluster_1', 'prob_cluster_2', 'prob_cluster_3']


###### TRAIN CSV

In [None]:
COLUMN_NAMES = ['head_id', 'relation_id', 'tail_id',
                                'head_id:token', 'relation_id:token', 'tail_id:token']
COLUMN_NAMES.append('cluster_label')
for i in range(n_clusters):
    COLUMN_NAMES.append(f'prob_cluster_{i}')

print(COLUMN_NAMES)

temporal_train_interaction_df = temporal_train_interaction_df[COLUMN_NAMES]
temporal_train_interaction_df.to_csv(f'./data/{name}/{name}_train_set.csv', index= False)