In [1]:
import pandas as pd
# baic transformer Decoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun
import torch.optim as optim
import numpy as np
import xformers.ops as xops
import math 
from typing import Optional, Union
from torch import Tensor
import random

CUBLAS_WORKSPACE_CONFIG=:4096:8

main_df = pd.read_csv('adult.csv')
main_df.head()
DEVICE = 'cuda'
# DEVICE = 'cpu'

In [2]:
def check_DataFrame_distribution(X_trans):
    columns_range = {}
    print('%15s' % '', '%6s' % 'min','%6s' % 'max', '%6s' % 'nunique')
    
    for column in X_trans.columns:
        print('%15s' % column, '%6s' % X_trans[column].min(),'%6s' % X_trans[column].max(), '%6s' % X_trans[column].nunique())
        columns_range[column] = {}

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
def POOL_preprocess(df, N_BINS = 100):
    '''
    Preprocess the DataFrame 
    Args:
        df: DataFrame
        N_BINS: number of bins for each numerical column (will not be the exact number of bins, differ by distribution)
    Return:
        X_trans: DataFrame after preprocessing
        ct: ColumnTransformer object, for inference and inverse transform
        NUM_vs_CAT: tuple, (number of numerical columns, number of categorical columns - 1) "in feature field, do not include label column"
        existing_values: dict, {column name: sorted list of existing values}
    '''
    
    CAT = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
    NUM = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    
    num_CAT = len(CAT)
    num_NUM = len(NUM)  
    
    ct = ColumnTransformer([
        ("age", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["age"]),
        ("fnlwgt", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='quantile', subsample=None), ["fnlwgt"]),
        ("educational-num", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='quantile', subsample=None), ["educational-num"]),
        ("capital-gain", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["capital-gain"]),
        ("capital-loss", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["capital-loss"]),
        ("hours-per-week", KBinsDiscretizer(n_bins = N_BINS, encode='ordinal', strategy='uniform', subsample=None), ["hours-per-week"]),
         ],remainder = 'passthrough', verbose_feature_names_out = False) # make sure columns are unique
    ct.set_output(transform = 'pandas')
    X_trans = ct.fit_transform(df) 
    
    # store the numrical columns' existing values for identifying unseen values
    existing_values = {}
    for column in NUM:
        existing_values[column] = sorted(X_trans[column].unique().astype(int))
    for column in CAT:
        existing_values[column] = sorted(X_trans[column].unique().astype(str))
    
    # apply Ordinal encoding on columns
    from sklearn.preprocessing import OrdinalEncoder
    OE_list = {}
    for column in NUM + CAT:
        OE = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1)
        X_trans[column] = OE.fit_transform(X_trans[[column]])
        OE_list[column] = OE
    
    # make all columns' catagory unique
    # 7/19: each NUM column has its own number of unique values, plus 1 for unseen values
    # each column has it's own number of unique values. '+1' is for unseen values
    offset = 0
    for column in NUM + CAT:
        X_trans[column] = X_trans[column].apply(lambda x: x + offset)
        offset += (X_trans[column].max() - X_trans[column].min() + 1) + 1
    
    X_trans = X_trans.astype(int).reset_index(drop = True)
    return X_trans, (ct, OE_list, NUM, CAT, existing_values), (num_NUM, num_CAT - 1)
    # -1 is for the income column (label)
main_df_SHUFFLE = main_df.sample(frac=1).reset_index(drop=True)
X_trans, inference_package , _  = POOL_preprocess(main_df_SHUFFLE[48842//5:])
X_trans.head()



Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,46,86,188,191,215,304,366,384,391,407,413,423,427,448,473
1,1,132,184,191,215,324,362,387,393,397,416,424,427,468,472
2,18,160,183,191,215,309,366,383,393,401,414,424,427,468,472
3,18,137,184,191,215,324,367,387,389,401,414,424,427,468,472
4,40,137,186,191,215,304,366,379,389,398,414,422,426,468,472


In [4]:

def POOL_preprocess_inference(df: pd.DataFrame,
                              inference_package: tuple,
                                # ct: ColumnTransformer,
                                # OE_list: dict,
                                # NUM: list,
                                # CAT: list,
                                # existing_values: dict,
                              ):
    '''Preprocess the DataFrame when inference
    
    Args:
        `df`: DataFrame to be processed.\n
        `inference_package`: tuple, containing the following objects.
            `ct`: ColumnTransformer object required for inference, which makes sure values are in the same range as training data
            `OE_list`: dict, {column name: OrdinalEncoder object}\n
            `NUM`: list of numerical columns \n
            `CAT`: list of categorical columns\n
            `existing_values`: dict, {column name: sorted list of existing values}
    '''
    (ct, OE_list, NUM, CAT, existing_values) = inference_package
    X_trans_ori = ct.transform(df)
    
    # caculate the loaction of unseen values
    unseen_node_indexs = {}
    offset = 0
    for col in NUM + CAT:
        unseen_node_indexs[col] = (int(len(existing_values[col])) + offset )
        offset += int(len(existing_values[col])) + 1
    
    X_trans = X_trans_ori
    
    # apply Ordinal encoding on columns, and make all columns' catagory unique
    offset = 0
    for column in NUM + CAT:
        OE = OE_list[column]
        X_trans[column] = OE.transform(X_trans[[column]]) # use fitted OE to transform, the unseen values will be encoded as -1
        if -1 in X_trans[column].tolist():
            print('[preprocess]: detected unseen values in column', column)
        X_trans[column] = X_trans[column].apply(lambda x: x + offset if x != -1 else unseen_node_indexs[column])
        offset = unseen_node_indexs[column] + 1  

    
    X_trans = X_trans.astype(int).reset_index(drop = True) 
    return X_trans, unseen_node_indexs 
X_trans_ , unseen_node_indexs= POOL_preprocess_inference(main_df_SHUFFLE[:48842//5], inference_package)
X_trans_.head()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,29,153,189,205,215,304,369,386,391,401,413,424,427,468,473
1,27,111,183,191,215,297,367,383,389,405,417,424,426,468,472
2,24,116,183,191,215,314,366,383,394,404,414,422,427,468,472
3,30,117,183,191,215,307,366,383,389,401,417,424,426,468,472
4,8,173,183,191,215,304,366,383,393,400,416,424,427,468,472


In [5]:
check_DataFrame_distribution(X_trans_)

                   min    max nunique
            age      0     73     72
         fnlwgt     75    174    100
educational-num    176    189     14
   capital-gain    191    213     19
   capital-loss    215    262     36
 hours-per-week    265    360     83
      workclass    362    370      9
      education    372    387     16
 marital-status    389    395      7
     occupation    397    411     15
   relationship    413    418      6
           race    420    424      5
         gender    426    427      2
 native-country    429    470     41
         income    472    473      2


In [6]:
check_DataFrame_distribution(X_trans)
'[74, 175, 190, 214, 264, 360, 370, 387, 395, 411, 418, 424, 427, 470, 473]'

                   min    max nunique
            age      0     73     74
         fnlwgt     75    174    100
educational-num    176    189     14
   capital-gain    191    213     23
   capital-loss    215    263     49
 hours-per-week    265    360     96
      workclass    362    370      9
      education    372    387     16
 marital-status    389    395      7
     occupation    397    411     15
   relationship    413    418      6
           race    420    424      5
         gender    426    427      2
 native-country    429    470     42
         income    472    473      2


'[74, 175, 190, 214, 264, 360, 370, 387, 395, 411, 418, 424, 427, 470, 473]'

In [7]:
'''Notations
  node: number of all nodes = L + S + C + F
  L: number of lable nodes + 1 (for unseen lable)
  S: number of sample nodes + 1 (for inference)
  C: number of catagory nodes + F (for each field(column)
  F: number of field(column) nodes (no unseen field is allowed)
  hidden: number of hidden representation

data size = 
mask size =
use nn.transformerDecoder(data,mask) to get the output
use the above output as input of MLP to predict the lable   
'''

'Notations\n  node: number of all nodes = L + S + C + F\n  L: number of lable nodes + 1 (for unseen lable)\n  S: number of sample nodes + 1 (for inference)\n  C: number of catagory nodes + F (for each field(column)\n  F: number of field(column) nodes (no unseen field is allowed)\n  hidden: number of hidden representation\n\ndata size = \nmask size =\nuse nn.transformerDecoder(data,mask) to get the output\nuse the above output as input of MLP to predict the lable   \n'

In [8]:
class HGNN_dataset():
    def __init__(self,
                 data_df : pd.DataFrame,
                 label_column : str,
                 test_df : pd.DataFrame = None,
                 split_ratio : float = None,
                 embedding_dim : int = 128,
                 ):
        if test_df is None:
            # shuffle and cut data
            data_df = data_df.sample(frac=1,random_state=42).reset_index(drop=True)
            test_size = math.ceil(data_df.shape[0] * (1-split_ratio))
            train_pool = data_df[test_size:]
            test_pool = data_df[:test_size]
            print('total data num:' , data_df.shape[0])
            print('trian data num:' , train_pool.shape[0])
            print('test data num:' , test_pool.shape[0])
        else: 
            # given train and test data, seperated (K-fold)
            train_pool = data_df
            test_pool = test_df
            print('trian data num:' , train_pool.shape[0])
            print('test data num:' , test_pool.shape[0])

        
        # to-dos:
        # train
        #   
        N_BINS = 100
        TRAIN_POOL, self.inference_package, self.NUM_vs_CAT = POOL_preprocess(train_pool, N_BINS = N_BINS)
        TEST_POOL, self.unseen_node_indexs_C = POOL_preprocess_inference(test_pool, self.inference_package)
        LABEL_COLUMN = label_column

        # cut feature and lable
        FEATURE_POOL = TRAIN_POOL.drop(LABEL_COLUMN, axis=1)
        LABEL_POOL = TRAIN_POOL[LABEL_COLUMN]
        TEST_LABEL_POOL = TEST_POOL[LABEL_COLUMN]
        
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        LABEL_POOL = enc.fit_transform(LABEL_POOL.values.reshape(-1,1)).toarray()
        TEST_LABEL_POOL = enc.fit_transform(TEST_LABEL_POOL.values.reshape(-1,1)).toarray()

        # L: number of lable nodes, the last node of Lable nodes is served as unknown lable node
        L = LABEL_POOL.shape[1] + 1

        # S: number of sample nodes, the last node of sample nodes is served as infering node
        S = FEATURE_POOL.shape[0] + 1
        
        # F: number of field (column) nodes
        F = FEATURE_POOL.shape[1]

        # C: number of catagory nodes, each field(column) has its own "unseen" catagory nodes
        self.nodes_of_fields = []
        for column in FEATURE_POOL.columns:
            self.nodes_of_fields.append(FEATURE_POOL[column].nunique()+1)
        C = sum(self.nodes_of_fields) # the total number of nodes equals to the sum of nodes of each field
        C_POOL = range(int(C))

        nodes_num = {'L':L, 'S':S, 'C':C, 'F':F}
        print('node_nums', nodes_num)
        # print('total', L+S+C+F, 'nodes')
        
        # get samples indexs for each label
        self.labe_to_index = {}
        tmp_pool = TRAIN_POOL.copy().reset_index(drop=True)
        for label in tmp_pool['income'].unique():
            self.labe_to_index[label] = (tmp_pool[tmp_pool['income'] == label].index).tolist()
        
        self.TRAIN_POOL = TRAIN_POOL
        self.TEST_POOL = TEST_POOL
        self.TEST_LABEL_POOL = TEST_LABEL_POOL
        self.LABEL_COLUMN = LABEL_COLUMN
        self.FEATURE_POOL = FEATURE_POOL
        self.LABEL_POOL = LABEL_POOL
        self.C_POOL = C_POOL   
        self.nodes_num = nodes_num
        self.N_BINS = N_BINS
        self.embedding_dim = embedding_dim
        
        self.make_input_tensor()
        # self.get_sample(10)        
        self.make_mask_all()
        
        # self.make_mask()
        
        
    def make_mask_subgraph(self,
                  sample_indices: Optional[list] = None,
                  query_indices: Optional[list] = None,
                ):
        '''Makeing masks for subgraph. Mask values are 1 if two nodes are connected, otherwise 0.
        
        Args:
            sample_indices: list of list of sample node indices, in shape of `[batch_size, sample_size]`
            query_indices: list of query node indices for each batch, in shape of `[batch_size]`
        
        for example, with:
            {'L': 3, 'S': 39074, 'C': 470, 'F': 14, 'K': 10}
            
        the masks will be:
            masks['L2S'] = torch.Size([16, 8]), values in torch.Size([10, 3])\\
            masks['S2C'] = torch.Size([472, 16]), values in torch.Size([470, 10])\\
            masls['C2F'] = torch.Size([16, 472]), values in torch.Size([14, 470])\\
        Notice: xformer require the mask's tensor must align on memory, and should be slice of a tensor if shape cannot be divided by 8
        '''
        L, S, C, F = self.nodes_num['L'], self.nodes_num['S'], self.nodes_num['C'], self.nodes_num['F']

        sample_size = len(sample_indices[0])
        # caculate masking
        masks = {}
        
        tmp_L2S = []
        tmp_S2C = []
        for batch_indices in sample_indices:
            # masked_POOL = self.TRAIN_POOL.iloc[batch_indices] # sample dataframe into shape (10,14)
            
            # label to sample
            tmp = self.MASKS_FULL['L2S']
            tmp = torch.index_select(tmp, 0, torch.tensor(batch_indices, device=DEVICE)) #The returned tensor does not use the same storage as the original tensor
            # tmp = torch.zeros([math.ceil(sample_size/8) * 8, math.ceil(L/8) * 8], dtype=torch.float, device=DEVICE) 
            # label_value = masked_POOL[self.LABEL_COLUMN].values
            # tmp[torch.arange(sample_size, device=DEVICE), torch.tensor(label_value - min(label_value), device=DEVICE)] = 1
            shape = (math.ceil(sample_size/8) * 8, math.ceil(L/8) * 8)
            new_tensor = torch.zeros(*shape, device=DEVICE)
            new_tensor[:tmp.shape[0], :tmp.shape[1]] = tmp
            tmp = new_tensor.view(*shape)
            tmp_L2S.append(tmp)
            # masks['L2S'] = tmp.repeat(batch_size,1,1)
            

            # sample to catagory
            tmp = self.MASKS_FULL['S2C']
            tmp = torch.index_select(tmp, 1, torch.tensor(batch_indices, device=DEVICE)) # The returned tensor does not use the same storage as the original tensor
            # tmp = torch.zeros([math.ceil(C/8) * 8, math.ceil(sample_size/8) * 8], dtype=torch.float, device=DEVICE).T
            # tmp_df = masked_POOL.drop(self.LABEL_COLUMN, axis=1)
            # tmp[torch.arange(sample_size, device=DEVICE).unsqueeze(-1), torch.tensor(tmp_df.values, device=DEVICE)] = 1
            # tmp = tmp.T.contiguous()
            shape = (math.ceil(C/8) * 8, math.ceil(sample_size/8) * 8)
            new_tensor = torch.zeros(*shape,device=DEVICE)
            new_tensor[:tmp.shape[0], :tmp.shape[1]] = tmp
            tmp = new_tensor.view(*shape)
            tmp_S2C.append(tmp)
        
        masks['L2S'] = torch.stack(tmp_L2S, dim = 0)
        masks['S2C'] = torch.stack(tmp_S2C, dim = 0)
        # tmp_ = []
        # for index, query in enumerate(query_indices):
        #     tmp = torch.zeros([math.ceil(C/8) * 8, math.ceil(sample_size/8) * 8], dtype=torch.float, device=DEVICE).T
        #     tmp_df = masked_POOL.drop(self.LABEL_COLUMN, axis=1)
        #     tmp[torch.arange(sample_size, device=DEVICE).unsqueeze(-1), torch.tensor(tmp_df.values, device=DEVICE)] = 1
        #     tmp = tmp.T.contiguous()
        #     tmp_.append(tmp)
        
        # masks['S2C'] = Tensor.contiguous(tmp.repeat(batch_size,1,1))
        # masks['S2C'] = torch.stack(tmp_, dim = 0)

        # catagory to field
        masks['C2F'] = self.MASKS_FULL['C2F'].repeat(len(query_indices),1,1)
        self.MASKS = masks
        self.nodes_num['K'] = sample_size
        
    def make_mask_all(self):
        '''Makeing masks for the entire graph. Mask values are 1 if two nodes are connected, otherwise 0.

        for example, with:
            {'L': 3, 'S': 39074, 'C': 470, 'F': 14, 'K': 10}.
            
        the masks will be:
            masks['L2S']: torch.Size([39080, 8]), values in torch.Size([39074, 3]).\\
            masks['S2C']: torch.Size([472, 39080]), values in torch.Size([470, 39074]).\\
            masls['C2F']: torch.Size([16, 472]), values in torch.Size([14, 470]).\\
            
        Notice: xformer require the mask's tensor must align on memory, and should be slice of a tensor if shape cannot be divided by 8
        '''
        L, S, C, F = self.nodes_num['L'], self.nodes_num['S'], self.nodes_num['C'], self.nodes_num['F']
        # caculate masking
        masks = {}
        
        # label to sample 
        tmp = torch.zeros([math.ceil(S/8) * 8, math.ceil(L/8) * 8], dtype=torch.float, device=DEVICE)
        label_ids = self.TRAIN_POOL[self.LABEL_COLUMN].unique()
        for i, value_df in enumerate(self.TRAIN_POOL[self.LABEL_COLUMN]):
            for j, value_label in enumerate(label_ids):
                if value_label == value_df:
                    tmp[i][j] = 1
                    break
        masks['L2S'] = tmp

        # sample to catagory
        tmp = torch.zeros([math.ceil(C/8) * 8, math.ceil(S/8) * 8], dtype=torch.float, device=DEVICE).T
        tmp_df = self.TRAIN_POOL.drop(self.LABEL_COLUMN, axis=1)
        tmp[torch.arange(len(self.TRAIN_POOL), device=DEVICE).unsqueeze(-1), torch.tensor(tmp_df.values, device=DEVICE)] = 1
        tmp = tmp.T.contiguous()
        masks['S2C'] = tmp

        # catagory to field
        # to do : this is wrong , should connect all catagory nodes (even unseen nodes))
        tmp = torch.zeros([math.ceil(F/8) * 8, math.ceil(C/8) * 8], dtype=torch.float, device=DEVICE)
        unique_items = [sorted(self.FEATURE_POOL[column].unique()) for column in (self.FEATURE_POOL.columns)]
        for i in range(F):
            for j in (unique_items[i]):
                tmp[i][j] = 1
        masks['C2F'] = tmp
        self.MASKS = masks
        self.MASKS_FULL = masks
        
    def make_mask_test(self, 
                       indexs_in_test_pool : list
                       ):
        '''Make mask tensor for the testing scenario. \n
        In testing scenario, L, S, C, F remain the same, while all INPUTs are the same (sience they are initialized fixed vlaues\n
        All we need to do is to update masks(L2S, S2C) for the new inference node
        '''
        L, S, C, F = self.nodes_num['L'], self.nodes_num['S'], self.nodes_num['C'], self.nodes_num['F']
        
        masks = {}
        tmp_L2S = []
        tmp_S2C = []
        # for i query nodes
        for index_in_test_pool in indexs_in_test_pool:
            # L2S shape: torch.Size([39080, 8]), values in torch.Size([39074, 3]).
            # number of sample nodes : 39073 + 1 (inference node)
            # S = 39074, -1 to convert to index of last node
            tmp = self.MASKS_FULL['L2S'].clone().detach()
            tmp[S-1, L-1] = 1 # connect inference node to unseen lable nodes
            tmp_L2S.append(tmp)
            # masks['L2S'] = tmp.unsqueeze(0)
        
            # S2C shape: torch.Size([472, 39080]), values in torch.Size([470, 39074]).
            # self.MASKS_FULL['S2C'].T :[39080, 472], values in [39074, 470]
            # self.TEST_POOL.drop(self.LABEL_COLUMN, axis=1).values[index_in_test_pool]
            tmp = self.MASKS_FULL['S2C'].T.clone().detach()
            # connect the last sample node (inference node) with it's catagory nodes
            tmp[S-1, self.TEST_POOL.drop(self.LABEL_COLUMN, axis=1).values[index_in_test_pool]] = 1  
            tmp_S2C.append(tmp.T)
            
            # masks['S2C'] = tmp.T.contiguous().unsqueeze(0)
            
        masks['L2S'] = torch.stack(tmp_L2S, dim = 0)
        masks['S2C'] = torch.stack(tmp_S2C, dim = 0)
        # C2F remains the same
        masks['C2F'] = self.MASKS_FULL['C2F'].repeat(len(indexs_in_test_pool),1,1)
        
        self.MASKS = masks
        # print('masks[\'L2S\']',masks['L2S'].shape)
        # print('masks[\'S2C\']',masks['S2C'].shape)
        # print('masks[\'C2F\']',masks['C2F'].shape)
        
        
    def make_input_tensor(self):
        '''Makeing input tensor for the entire graph.
            
        for example, with:
            {'L': 3, 'S': 39074, 'C': 470, 'F': 14, 'K': 10}.
                
        the input tensor will be:
            L_input: torch.Size([3, 1]).
            S_input: torch.Size([39074, 128]).
            C_input: torch.Size([470, 1]).
            F_input: torch.Size([14, 1]).
        '''
        # make input tensor
        L, S, C, F = self.nodes_num['L'], self.nodes_num['S'], self.nodes_num['C'], self.nodes_num['F']
        # L
        L_input = torch.tensor([range(L)], device=DEVICE).reshape(-1,1)
        # print('L_input', L_input.type(), L_input.shape)
        
        # S (normalized by standard scaler)
        # features = torch.tensor(self.FEATURE_POOL.values, device=DEVICE).float()
        # normalized_features = (features - torch.mean(features, dim = 0)) / torch.std(features, dim = 0)
        # S_input = torch.cat([normalized_features, torch.tensor([[0]*F], device=DEVICE)],dim = 0).float() # add infering node
        
        # S (initialize by random)
        S_input = torch.rand(self.embedding_dim, device=DEVICE).repeat(S,1)
        # print('S_input', S_input.type(), S_input.shape)
        # C 
        C_input = torch.tensor([self.C_POOL], device=DEVICE).reshape(-1,1)
        # print('C_input', C_input.type(), C_input.shape)
        # F 
        F_input = torch.tensor([range(F)], device=DEVICE).reshape(-1,1)
        # print('F_input', F_input.type(), F_input.shape)
        # 
        self.INPUTS = (L_input, S_input, C_input, F_input)
        self.INPUT_DIMS = (L_input.size(1), S_input.size(1), C_input.size(1), F_input.size(1))

    def sample_with_distrubution(self, sample_size):
        '''
        Sample equally from each label with required sample size\\
        forced to make balenced sample
        '''
        # decide each label's number of samples (fourced to be balenced if possible) 
        label_list = []
        label_unique = list(self.labe_to_index.keys())
        count = sample_size // len(label_unique)
        remainder = sample_size % len(label_unique)
        label_list = [item for item in label_unique for _ in range(count)]
        label_list.extend(random.sample(label_unique, remainder))
        # sample from indexes
        indices = [random.choice(self.labe_to_index[label]) for label in label_list]
        return indices     
        
    def get_sample(self, sample_size, query_indices = []):
        '''get sample nodes indices, and update mask and input tensor
        
        Args:
            sample_size: number of sample nodes required for each batch.
            query_indices (optional): list of nodes indices that must be included in nodes indices (one for each batch).
        Return:
            sample_indices: list of sample nodes indices, in shape of `[batch_size, sample_size]`
        
        For example, with `sample_size = 3`, `query_indices = [1,2,3]`\n
        means that there are `batch_size = 3` batches, each batch has `3` nodes.\n
        particularly, the three batches' `sample_indices` could be:\n
        `[1,324,656]`, `[2, 435, 9867]`, `[3, 789, 1343]`
        
        The included nodes shold not and will not be repeated, in case of the lable leakage.
        '''
        # include specific nodes (e.g. query nodes), while remaining sample_size
        sample_indices = []
        if query_indices is not []:
            for query in query_indices:
                indices = self.sample_with_distrubution(sample_size - 1)
                while query in indices:
                    indices = self.sample_with_distrubution(sample_size - 1)
                # add query nodes into sample_indices
                indices.append(query)
                sample_indices.append(sorted(indices))
        else:
            indices = self.sample_with_distrubution(sample_size - len(query_indices))
            sample_indices.append(sorted(indices))
        # update mask
        # modify input tensor
        L_input, S_input, C_input, F_input = self.INPUTS
        S_input_masked = []
        for i in range(len(query_indices)):
            S_input_masked.append(torch.index_select(S_input, 0, torch.tensor(sample_indices[i], device=DEVICE)))
        S_input_masked = torch.stack(S_input_masked, dim = 0) # convert back to tensor
        self.MASKED_INPUTS = (L_input, S_input_masked, C_input, F_input) 
          
        return sample_indices
            
# Train_data = HGNN_dataset( main_df, 'income', split_ratio = 0.8)
# query_indices=[100,111]
# # Train_data.get_sample(10, query_indices=query_indices)
# Train_data.make_mask_subgraph(Train_data.get_sample(10, query_indices=query_indices), query_indices = query_indices)
# Train_data.make_mask_subgraph(Train_data.get_sample(10, query_indices=query_indices), query_indices = query_indices)
# Train_data.make_mask_test([0,1,3])
# # print(Train_data.MASKS['L2S'].shape)
# # print(Train_data.MASKS['S2C'].shape)
# # print(Train_data.MASKS['C2F'].shape)
# # print(Train_data.MASKED_INPUTS[1].shape)



In [9]:
from torch import Tensor
from typing import Optional, Any, Union, Callable
class TabTransformerDecoder(nn.TransformerDecoder):
    def forward(self, tgt: Tensor, memory: Tensor, tgt_ori:Tensor, tgt_mask: Tensor | None = None, memory_mask: Tensor | None = None, tgt_key_padding_mask: Tensor | None = None, memory_key_padding_mask: Tensor | None = None) -> Tensor:
        '''Pass the inputs (and mask) through the decoder layer in turn.

        Args:
            tgt: the sequence to the decoder (required).
            memory: the sequence from the last layer of the encoder (required).
            tgt_mask: the mask for the tgt sequence (optional).
            memory_mask: the mask for the memory sequence (optional).
            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
            memory_key_padding_mask: the mask for the memory keys per batch (optional).

        Shape:
            see the docs in Transformer class.
        '''
        output = tgt

        for mod in self.layers:
            output = mod(output, memory, tgt_ori, tgt_mask=tgt_mask,
                         memory_mask=memory_mask,
                         tgt_key_padding_mask=tgt_key_padding_mask,
                         memory_key_padding_mask=memory_key_padding_mask)

        if self.norm is not None:
            output = self.norm(output)

        return output
    

class TabHyperformer_Layer(nn.TransformerDecoderLayer):
    def __init__(self, d_model, nhead, dim_feedforward=512, dropout=0, activation='relu'):
        super().__init__(d_model, nhead, dim_feedforward, dropout, activation)
        # remove defined modules
        delattr(self, 'self_attn')
        delattr(self, 'norm1')
        delattr(self, 'dropout1')
    
    def forward(self, tgt, memory, tgt_ori, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        x = tgt
        # x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask))
        x = self.norm2(tgt_ori + self._mha_block(x, memory, memory_mask))
        # x =  x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask)
        # x = self.norm3(x + self._ff_block(x))

        return x
    def _mha_block(self, x: Tensor, mem: Tensor,
                   attn_mask: Optional[Tensor],) -> Tensor:
        x = xops.memory_efficient_attention(x, mem, mem, attn_mask)
        # return self.dropout2(x)
        return (x)


In [10]:
# baic transformer decoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun
from tqdm import trange, tqdm

class TransformerDecoderModel(nn.Module):
    def __init__(self, 
                 dataset : HGNN_dataset, 
                 num_layers = 1, 
                 embedding_dim = 128, 
                 propagation_steps = 1,
                 ):
        super(TransformerDecoderModel, self).__init__()

        L_dim, S_dim, C_dim, F_dim = dataset.INPUT_DIMS
        L, S, C, F = dataset.nodes_num['L'], dataset.nodes_num['S'], dataset.nodes_num['C'], dataset.nodes_num['F']
        num_NUM , num_CAT = dataset.NUM_vs_CAT
        

        self.Lable_embedding = nn.Embedding(L, embedding_dim, dtype=torch.float)
    
        # self.Catagory_embedding_num = nn.Linear(C_dim, embedding_dim, dtype=torch.float)
        # for every numrical filed, construct it's own Linear embedding layer
        self.Catagory_embedding_nums = []
        for i in range(num_NUM):
            self.Catagory_embedding_nums.append(
                nn.Linear(C_dim, embedding_dim, dtype=torch.float, device=DEVICE)
            )
        catagories = dataset.nodes_of_fields[-num_CAT:] # number of all possible catagories nodes
        self.Catagory_embedding_cat = nn.Embedding(sum(catagories), embedding_dim, dtype=torch.float)
        
        self.Field_embedding = nn.Embedding(F, embedding_dim, dtype=torch.float)
        
        self.transformer_decoder = TabTransformerDecoder(
            TabHyperformer_Layer(embedding_dim,  nhead = 2 ),
            num_layers
        )
        
        # downstream task
        self.MLP = nn.Sequential(
            nn.Linear(embedding_dim, 2),
        )
        
        # initialize MASK_FULL
        dataset.make_mask_all()
        # dataset.make_input_tensor()
        
        self.tmpmask_L2S = dataset.MASKS['L2S'].clone()

        self.propagation_steps = propagation_steps

    def maskout_lable(self,
                      dataset: HGNN_dataset,
                      query_indices: list, # must be sorted
                      sample_indices: Optional[list] = None, 
                      ):
        L = dataset.nodes_num['L']
        if sample_indices is not None:
            self.tmpmask_L2S = dataset.MASKS['L2S'].clone().detach()
            for index, sample_indice in enumerate(sample_indices): # sample_indice in length K
                # modify the mask to mask out the queries node's edge to it's label node
                query_index = sample_indice.index(query_indices[index]) # query_index: index of query node in sample_indice of the batch
                # L2S mask shape : B, S, L
                self.tmpmask_L2S[index, query_index] = 0
                self.tmpmask_L2S[index, query_index][L-1] = 1 # make it as unseen label
        else:
            self.tmpmask_L2S = dataset.MASKS['L2S'].clone().detach()
            for index, query in enumerate(query_indices):
                self.tmpmask_L2S[index, query] = 0
                self.tmpmask_L2S[index, query][L-1] = 1 # make it as unseen label
    def forward(self, 
                dataset: HGNN_dataset, 
                mode : str = 'train',
                query_indices: list = None,  # must be sorted
                K : Optional[int] = 10,
                ):
        L, S, C, F = dataset.nodes_num['L'], dataset.nodes_num['S'], dataset.nodes_num['C'], dataset.nodes_num['F']
        num_NUM, num_CAT = dataset.NUM_vs_CAT
        batch_size = len(query_indices)
        # decide scenario
        if mode == 'train':
            if batch_size > 1:
                # batch mode
                # to-do: batch mode
                
                pass 
            # generate subgraph with K nodes, including query_indices
            # update mask and input tensor
            sample_indices = dataset.get_sample(K, query_indices = query_indices) # update mask
            dataset.make_mask_subgraph(sample_indices, query_indices)
            masks = dataset.MASKS
            
            # get updated masked input tensor and mask 
            L_input, S_input, C_input, F_input = dataset.MASKED_INPUTS
            L_input = L_input.clone().detach().repeat(batch_size,1,1)
            # S_input is already in shape [batch_size, sample_size, embedding_dim], see get_sample()
            S_input = S_input.clone().detach()
            C_input = C_input.clone().detach().repeat(batch_size,1,1)
            F_input = F_input.clone().detach().repeat(batch_size,1,1)
            
            # mask out the queries node's edge to it's label node, prevent label leakage
            self.maskout_lable(dataset, query_indices, sample_indices)
            
            # the query node's indexs in sample_indices
            query_indexs = [sample_indices[i].index(query) for i, query in enumerate(query_indices)]
            S_ = K # the S used in transformer decoder
            
        elif mode == 'inferring':
            # use all nodes in the graph 
            # get input tensor (no need to update)
            L_input, S_input, C_input, F_input = dataset.INPUTS
            L_input = L_input.repeat(batch_size,1,1)
            S_input = S_input.repeat(batch_size,1,1)
            C_input = C_input.repeat(batch_size,1,1)
            F_input = F_input.repeat(batch_size,1,1)
            # updata mask for inference node
            dataset.make_mask_test(query_indices) # query node equal to inference node, only one query node is allowed
            masks = dataset.MASKS
            
            self.maskout_lable(dataset, query_indices, None)
            
            
            # the query node's indexs in sample_indices
            # query_indexs = [S-1]
            query_indexs = [S-1]*batch_size
            S_ = S # the S used in transformer decoder
        else:
            raise NotImplementedError

        # for S and C, we use two different embedding methods, for CAT and NUM, respectively
        # Squeeze for making batch dimantion
        L_embedded = self.Lable_embedding(L_input.long()).squeeze(2).float()
        
        S_embedded = S_input.float()

        # for every numrical filed, use it's own Linear embedding layer
        C_embedded_nums = []
        field = dataset.nodes_of_fields
        start = 0
        for index, nodes in enumerate(field[:num_NUM]): # pick numrical fields
            end = start + nodes
            C_embedded_nums.append(self.Catagory_embedding_nums[index](C_input[:,start:end].float()))
            start = end
        
        C_embedded_num = torch.cat(C_embedded_nums, dim = 1)
        
        catagorical_filed_nodes = sum(field[-num_CAT:]) # pick catagory fields
        C_embedded_cat = self.Catagory_embedding_cat(C_input[:,-catagorical_filed_nodes:].squeeze(2).long() - sum(field[:num_NUM])).float() # - sum(field[:num_NUM] because the embedding index should start from 0
        # print(C_embedded_num.shape, C_embedded_cat.shape)
        C_embedded = torch.cat([C_embedded_num, C_embedded_cat], dim = 1)
        
        F_embedded = self.Field_embedding(F_input.long()).squeeze(2).float()
        
        # print(query_indices, K)
        # print(L_embedded.shape, S_embedded.shape, C_embedded.shape, F_embedded.shape)
        
        
        # propagate steps: L→S→C→F
        #                  L←S←C←
        # more steps more menory usage
        PROPAGATE_STEPS = self.propagation_steps
        origin_S = S_embedded.clone()
        origin_C = C_embedded.clone()
        origin_F = F_embedded.clone()
        origin_L = L_embedded.clone()
        for i in range(PROPAGATE_STEPS):
            S_embedded = self.transformer_decoder(S_embedded,L_embedded, origin_S,
                                                memory_mask = self.tmpmask_L2S.clone().detach()[:,:S_,:L])# + S_embedded
            C_embedded = self.transformer_decoder(C_embedded,S_embedded, origin_C,
                                                memory_mask = masks['S2C'].clone().detach()[:,:C,:S_])# + C_embedded   
            F_embedded = self.transformer_decoder(F_embedded,C_embedded, origin_F,
                                                memory_mask = masks['C2F'].clone().detach()[:,:F,:C])# + F_embedded
            C_embedded = self.transformer_decoder(C_embedded,F_embedded, origin_C,
                                                memory_mask = Tensor.contiguous(masks['C2F'].clone().detach().transpose(1, 2))[:,:C,:F])# + C_embedded
            S_embedded = self.transformer_decoder(S_embedded,C_embedded, origin_S,
                                                memory_mask = Tensor.contiguous(masks['S2C'].clone().detach().transpose(1, 2))[:,:S_,:C])# + S_embedded
            L_embedded = self.transformer_decoder(L_embedded,S_embedded, origin_L,
                                                memory_mask = Tensor.contiguous(self.tmpmask_L2S.clone().detach().transpose(1, 2))[:,:L,:S_])# + L_embedded
        
        # print('after',S_embedded[0][0])
        output = self.MLP(S_embedded)
        outputs = []
        for index, query in enumerate(query_indexs):
            outputs.append(output[index, query])
        outputs = torch.stack(outputs, dim = 0)
        # output_batch = [output[:,query_indexs][query_indexs[i]] for i in range(batch_size)]
        # print(output_batch)
        return outputs
  

# # 測試模型
# num_layers = 1  # TransformerDecoder 的層數
# embedding_dim = 128  # 嵌入維度
# hidden_dim = 64  

# print('input_dims', Train_data.INPUT_DIMS)
# model = TransformerDecoderModel(Train_data, num_layers, embedding_dim).to(DEVICE)


# outputs = model(Train_data, mode = 'inferring', query_indices = [10,20,3000], K = 50)
# # outputs = model(Train_data, mode = 'inferring', query_indices = [10], K = 50)
# print("模型輸出的大小[q,2]:", outputs.shape)
# print(outputs)
# print(outputs.softmax(dim=1))
# output_labels = torch.argmax(outputs.softmax(dim=1), dim=1)
# output_labels



In [11]:
# training
from torch import autograd
from torcheval.metrics.aggregation.auc import AUC
from torcheval.metrics import BinaryAUROC
from sklearn.metrics import roc_auc_score
tmp_log = []
tmp__log = []
def train(model : nn.Module, 
          datset : HGNN_dataset, 
          epochs : int = 20,
          batch_size : int = 8,
          batch_size_test : int = 2,
          lr : float = 0.0001,
          K : int = 10,
          verbose : int = 1,
            # verbose = 0: no printed log
            # verbose = 1: print loss and AUC per train
            # verbose = 2: print loss and AUC per epoch
          wandb_log : bool = False,
            # inited outside
          log_name : str = 'unnamed',
          ):
    LABEL_POOL = datset.LABEL_POOL
    TEST_LABEL_POOL = datset.TEST_LABEL_POOL
    weight = torch.from_numpy(np.array([0.2, 1])).float().to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    
    epoch_AUC = 0
    epoch_AUC_test = 0
    
    if verbose == 1:
        stepper_epoch = trange(epochs)
    else:
        stepper_epoch = range(epochs) 
    for epoch in stepper_epoch:
        
        '''------------------------training------------------------'''
        
        QUERY_POOL = list(range(len(datset.FEATURE_POOL)))
        random.shuffle(QUERY_POOL)
        # train
        model.train()
        # logs
        loss_log = 0
        AUC_metric = BinaryAUROC().to(DEVICE)
        AUC_metric_test = BinaryAUROC().to(DEVICE)
        
        iter = 0
        if verbose >= 2:
            stepper = trange(len(datset.FEATURE_POOL)//batch_size)
        else:
            stepper = range(len(datset.FEATURE_POOL)//batch_size)
        for index in stepper: # query through all sample nodes (not infering node)
            
            optimizer.zero_grad()
            # pick query nodes
            query_indices = QUERY_POOL[:batch_size]
            QUERY_POOL = QUERY_POOL[batch_size:]

            outputs = model(datset, mode = 'train', query_indices = query_indices, K = K)
            # output shape:[q,2], example: torch.Size( 2, 2]
            # tensor([[-0.6845, -0.6323],
            #          [-0.7770, -0.4703]], device='cuda:0', grad_fn=<IndexBackward0>)
                
            # for trainning, only the query node's output is used
            # caculate loss
            LABEL_POOL_ = LABEL_POOL[query_indices] # shape:[q,2] ,example [[1. 0.], [1. 0.]]
                        
            # caculate loss
            batch_loss = criterion(outputs, torch.tensor(LABEL_POOL_,device=DEVICE))
            loss_log += batch_loss.item()
            # backpropagation
            batch_loss.backward()
            optimizer.step()

            TRUE = np.argmax(LABEL_POOL_,axis=1)
            
            outputs = outputs.softmax(dim=1)
            # print(query_indices)
            # print(outputs)
            # print(TRUE)
            # break
            pred_prob_of_is_1 = [probs[1] for probs in outputs] 
            # the probability of the query node is 1 (from model output)
            
            # tmp_log.append(float(pred_prob_of_is_1))
            # tmp__log.append((TRUE))
            AUC_metric.update(torch.Tensor(pred_prob_of_is_1),torch.Tensor(TRUE))
            torch.cuda.empty_cache()
            # break
            iter += 1
            # if iter >= 100:
            #     break
            if index == len(datset.FEATURE_POOL)//batch_size -1 and verbose >= 2:
                stepper.set_postfix(AUC=float(AUC_metric.compute()))
                stepper.update()
                
        epoch_loss = loss_log / batch_size
        epoch_AUC = float(AUC_metric.compute()) 
        if verbose == 1:
            stepper_epoch.set_postfix({'loss_train':epoch_loss, 'AUC_train':epoch_AUC, 'AUC_test':epoch_AUC_test})
        
        '''------------------------evaluate------------------------'''
        # evaluate
        model.eval()
        iter = 0
        with torch.no_grad():
            if verbose >= 2:
                stepper = trange(len(datset.TEST_POOL)//batch_size_test)
            else:
                stepper = range(len(datset.TEST_POOL)//batch_size_test)
            for index in stepper:
                QUERY_POOL = list(range(len(datset.TEST_POOL)))
                random.shuffle(QUERY_POOL)
                query_indices = QUERY_POOL[:batch_size_test]
                QUERY_POOL = QUERY_POOL[batch_size_test:]
                outputs = model(datset, mode = 'inferring', query_indices = query_indices, K = None)
                LABEL_POOL_ = TEST_LABEL_POOL[query_indices]
                TRUE = np.argmax(LABEL_POOL_,axis=1)
                outputs = outputs.softmax(dim=1)
                pred_prob_of_is_1 = [probs[1] for probs in outputs] 
                AUC_metric_test.update(torch.Tensor(pred_prob_of_is_1),torch.Tensor(TRUE))
                torch.cuda.empty_cache()
                iter += 1
                # if iter >= 100:
                #     break
        epoch_AUC_test = float(AUC_metric_test.compute()) 
        if verbose == 1:
            stepper_epoch.set_postfix({'loss_train':epoch_loss, 'AUC_train':epoch_AUC, 'AUC_test':epoch_AUC_test})

        AUC_metric.reset()
        AUC_metric_test.reset()
        # break
        del loss_log, AUC_metric
        tmp_log.append(float(epoch_loss))
        tmp__log.append(float(epoch_AUC))

        if wandb_log:
            wandb.log({'loss': epoch_loss, 'AUC_train': epoch_AUC, 'AUC_test': epoch_AUC_test, 'epoch': epoch})
        
        # print(f"Epoch{epoch+1}/{epochs} | Loss: {epoch_loss} | AUC: {epoch_AUC} |")
        if verbose >= 2:
            print(f"Epoch{epoch+1}/{epochs} | Loss: {epoch_loss} | AUC_train: {epoch_AUC} | AUC_test: {epoch_AUC_test}")
        
        
        with open('logs/' + log_name + '.txt', 'a') as f:
            # f.write(f"Epoch{epoch+1}/{epochs} | Loss: {epoch_loss} | AUC: {epoch_AUC}| ")
            f.write(f"Epoch{epoch+1}/{epochs} | Loss: {epoch_loss} | AUC_train: {epoch_AUC}| AUC_test: {epoch_AUC_test}\n ")
    if verbose >=1:
        print(f"{log_name} | Loss: {epoch_loss} | AUC_train: {epoch_AUC} | AUC_test: {epoch_AUC_test}")
# model = TransformerDecoderModel(Train_data, num_layers, embedding_dim).to(DEVICE)
# train(model, Train_data,
#       epochs= 20,
#       lr = 0.0005,
#       batch_size = 128,
#       batch_size_test = 32,
#       K = 100,
#       verbose = 1,
#       log_name = 'notuseful',
#       wandb_log = False)



In [12]:
# from matplotlib import pyplot as plt
# plt.plot(tmp_log)
# plt.plot(tmp__log)
# plt.show()

## 5 fold + wandb

In [13]:
from sklearn.model_selection import KFold
import wandb
import os

kf = KFold(n_splits=5, shuffle=True, random_state=42)

os.environ["WANDB_SILENT"] = "true"
torch.random.manual_seed(42)
random.seed(42)
np.random.seed(42)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)

main_df = pd.read_csv('adult.csv')
DEVICE = 'cuda'
# 進行5-fold交叉驗證
for index, (train_index, test_index) in enumerate(kf.split(main_df)):
    # if index <= 2:
    #     continue
    print('[', index+1, 'fold] processing...')
    train_pool, test_pool = main_df.iloc[train_index], main_df.iloc[test_index]
    os.environ["WANDB_SILENT"] = "true"
    config = {
        "project": "K_fold test 0727 ",
        "name" : "residual " + str(index+1),
        "Max_epoch" : 20,
        "group": "residual ",
        "learning_rate": 0.001,
        "batch_size": 128,
        "batch_size_test": 12,
        "K" : 100,
        "num_layers": 1,
        "embedding_dim": 128,
        "propagation_steps": 3,
        "notes": 'v3',
    }

    wandb.init(
    # set the wandb project where this run will be logged
        project = config['project'], 
        name = config['name'],
        notes = config['notes'],
        entity = 'tabhyperformer',
        group = config['group'],
        # track hyperparameters and run metadata
        config = config
    )



    Main_data = HGNN_dataset( train_pool, label_column = 'income', test_df = test_pool, embedding_dim = config['embedding_dim'])
    model = TransformerDecoderModel(Main_data, 
                                    config['num_layers'], 
                                    config['embedding_dim'],
                                    config['propagation_steps']
                                    ).to(DEVICE)
    train(model, Main_data,
        epochs= config['Max_epoch'],
        lr = config['learning_rate'],
        batch_size = config['batch_size'],
        batch_size_test = config['batch_size_test'],
        K = config['K'],
        verbose = 1,
        log_name = config['name'],
        wandb_log = True)
    del Main_data, model
    wandb.finish()

[ 1 fold] processing...


trian data num: 39073
test data num: 9769
[preprocess]: detected unseen values in column capital-loss




node_nums {'L': 3, 'S': 39074, 'C': 471, 'F': 14}


  0%|          | 0/20 [00:00<?, ?it/s]


RuntimeError: Deterministic behavior was enabled with either `torch.use_deterministic_algorithms(True)` or `at::Context::setDeterministicAlgorithms(true)`, but this operation is not deterministic because it uses CuBLAS and you have CUDA >= 10.2. To enable deterministic behavior in this case, you must set an environment variable before running your PyTorch application: CUBLAS_WORKSPACE_CONFIG=:4096:8 or CUBLAS_WORKSPACE_CONFIG=:16:8. For more information, go to https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility