In [1]:
import pandas as pd
# baic transformer Decoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun
import torch.optim as optim
import numpy as np
import xformers.ops as xops
import math

main_df = pd.read_csv('adult.csv')
main_df.head()
DEVICE = 'cuda'

In [2]:
def POOL_preprocess(df):
    '''
    input the original dataframe, output the dataframe after preprocessing,
    change the numerical columns to categorical columns by qcut and cut
    then apply label encoding to all columns
    
    '''
    df = df.copy()
    CAT = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
    NUM = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    # qcut on numerical columns
    for column in NUM:
        if column in ['educational-num','capital-gain','capital-loss','hours-per-week']:
            df[column] = pd.cut(df[column], 100)
        else:
            df[column] = pd.cut(df[column], 100)
    # make income column binary
    df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)

    # lable encoding categorical columns
    from sklearn.preprocessing import LabelEncoder
    lb = LabelEncoder()
    df = df.apply(lambda x: lb.fit_transform(x))

    # make all catagory in every column unique

    # 迴圈處理多個欄位
    offset = 0
    for column in df.columns:
        df[column] = df[column].apply(lambda x: x + offset)
        offset += df[column].nunique()
    
    return df
tmp = POOL_preprocess(main_df)
tmp.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,8,78,97,164,185,199,209,220,225,229,230,253,341,437,440
1,21,78,88,174,187,197,207,217,227,229,230,253,351,437,440
2,11,76,104,170,190,197,213,217,227,229,230,253,341,437,441
3,27,78,93,178,188,197,209,217,225,229,237,253,341,437,441
4,1,74,89,178,188,199,202,220,227,228,230,253,331,437,440


In [3]:
train_size = 4*48842//5
test_size = 48842//5
train_pool = main_df[test_size:]
test_pool = main_df[:test_size]
print('total data num:' , main_df.shape[0])
print('trian data num:' , train_pool.shape[0])
print('test data num:' , test_pool.shape[0])

total data num: 48842
trian data num: 39074
test data num: 9768


In [4]:
# notations
#   node: number of all nodes = L + S + C + F
#   L: number of lable nodes
#   S: number of sample nodes
#   C: number of catagory nodes
#   F: number of field(column) nodes
#   hidden: number of hidden representation

# data size = (node, hidden)
# mask size = (node, node - L) without lable nodes
#             for each node, real mask = cat[mask,(node,L)] = (node, node)
#             cannot see it's label node

# use nn.transformerDecoder(data,mask) to get the output
# use the above output as input of MLP to predict the lable   

In [5]:
class HGNN_DataSet():
    def __init__(self,
                 mode : str,  # {'train', 'test'}
                 target_df : pd.DataFrame,
                 label_column : str):
        TARGET_POOL = POOL_preprocess(target_df)
        LABEL_COLUMN = label_column

        # cut feature and lable
        FEATURE_POOL = TARGET_POOL.drop(LABEL_COLUMN, axis=1)
        LABEL_POOL = TARGET_POOL[LABEL_COLUMN]

        # trasform label into one-hot
        from sklearn.preprocessing import OneHotEncoder
        enc = OneHotEncoder()
        LABEL_POOL = enc.fit_transform(LABEL_POOL.values.reshape(-1,1)).toarray()

        # L: number of lable nodes
        L = LABEL_POOL.shape[1]

        # S: number of sample nodes
        S = FEATURE_POOL.shape[0]

        # C: number of catagory nodes
        C = FEATURE_POOL.apply(lambda x: x.nunique()).sum() # total_unique_labels
        C_POOL = sorted(list(set(FEATURE_POOL.values.flatten())))
        # the last node of catagory nodes is served as knseen node
        C += 1
        C_POOL.append(C_POOL[-1] + 1)

        # F: number of field(column) nodes
        F = FEATURE_POOL.shape[1]

        nodes_num = {'L':L, 'S':S, 'C':C, 'F':F}
        print('node_nums', nodes_num)
        print('total', L+S+C+F, 'nodes')
        
        self.TARGET_POOL = TARGET_POOL
        self.LABEL_COLUMN = LABEL_COLUMN
        self.FEATURE_POOL = FEATURE_POOL
        self.LABEL_POOL = LABEL_POOL
        self.C_POOL = C_POOL   
        self.nodes_num = nodes_num
        
        
        self.make_mask()
        self.make_input_tensor()
        
        
    def make_mask(self):
        L, S, C, F = self.nodes_num['L'], self.nodes_num['S'], self.nodes_num['C'], self.nodes_num['F']
        # caculate masking
        masks = {}

        # label to sample 
        tmp = torch.zeros([math.ceil(S/8) * 8, math.ceil(L/8) * 8], dtype=torch.float)
        label_ids = self.TARGET_POOL[self.LABEL_COLUMN].unique()
        for i, value_df in enumerate(self.TARGET_POOL[self.LABEL_COLUMN]):
            for j, value_label in enumerate(label_ids):
                if value_label == value_df:
                    tmp[i][j] = 1
                    break
        masks['L2S'] = tmp


        # sample to catagory
        tmp = torch.zeros([math.ceil(C/8) * 8, math.ceil(S/8) * 8], dtype=torch.float)
        tmp_df = self.TARGET_POOL.drop(self.LABEL_COLUMN, axis=1)
        for i, value_df in enumerate(tmp_df.values):
            for j, value in enumerate(value_df):
                tmp[value][i] = 1
        masks['S2C'] = tmp


        # catagory to field
        tmp = torch.zeros([math.ceil(F/8) * 8, math.ceil(C/8) * 8], dtype=torch.float)
        unique_items = [(self.TARGET_POOL[column].unique()) for column in (self.TARGET_POOL.columns)]
        for i in range(F):
            for j in (unique_items[i]):
                tmp[i][j] = 1
        masks['C2F'] = tmp
        
        self.MASKS = masks
        
    def make_input_tensor(self):
        # make input tensor
        L, S, C, F = self.nodes_num['L'], self.nodes_num['S'], self.nodes_num['C'], self.nodes_num['F']
        # L
        L_input = torch.eye(L).to(DEVICE)
        print('L_input', L_input.type(), L_input.shape)
        # S
        S_input = torch.tensor(self.FEATURE_POOL.values).to(DEVICE)
        print('S_input', S_input.type(), S_input.shape)
        # C random init
        C_input = torch.tensor(np.diag(self.C_POOL)).to(DEVICE)
        print('C_input', C_input.type(), C_input.shape)
        # F random init
        F_input = torch.eye(F).to(DEVICE)
        print('F_input', F_input.type(), F_input.shape)
        print(L_input.type())
        # 
        self.INPUTS = (L_input, S_input, C_input, F_input)
        self.INPUT_DIMS = (L_input.size(1), S_input.size(1), C_input.size(1), F_input.size(1))
Train_data = HGNN_DataSet('train', train_pool, 'income')


node_nums {'L': 2, 'S': 39074, 'C': 437, 'F': 14}
total 39527 nodes
L_input torch.cuda.FloatTensor torch.Size([2, 2])
S_input torch.cuda.LongTensor torch.Size([39074, 14])
C_input torch.cuda.LongTensor torch.Size([437, 437])
F_input torch.cuda.FloatTensor torch.Size([14, 14])
torch.cuda.FloatTensor


In [6]:
from torch import Tensor
from typing import Optional, Any, Union, Callable

class CustomTransformerDecoderLayer(nn.TransformerDecoderLayer):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
        super().__init__(d_model, nhead, dim_feedforward, dropout, activation)
        # remove defined modules
        delattr(self, 'self_attn')
        delattr(self, 'norm1')
        delattr(self, 'dropout1')
    
    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, tgt_key_padding_mask=None, memory_key_padding_mask=None):
        x = tgt
        if self.norm_first:
            # x = x + self._sa_block(self.norm1(x), tgt_mask, tgt_key_padding_mask)
            x = x + self._mha_block(self.norm2(x), memory, memory_mask, memory_key_padding_mask)
            x = x + self._ff_block(self.norm3(x))
        else:
            # x = self.norm1(x + self._sa_block(x, tgt_mask, tgt_key_padding_mask))
            x = self.norm2(x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask))
            # x =  x + self._mha_block(x, memory, memory_mask, memory_key_padding_mask)
            # x = self.norm3(x + self._ff_block(x))

        return x
    def _mha_block(self, x: Tensor, mem: Tensor,
                   attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
        # random_mask = torch.cuda.FloatTensor(mem.shape[0], 1).uniform_() > 0.8
        # print(sum(sum(attn_mask)))
        x = xops.memory_efficient_attention(x, mem, mem, attn_mask)
        return self.dropout2(x)


In [16]:
# baic transformer decoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun

class TransformerDecoderModel(nn.Module):
    def __init__(self, target_dataset, num_layers, embedding_dim, hidden_dim):
        super(TransformerDecoderModel, self).__init__()

        L_dim, S_dim, C_dim, F_dim = target_dataset.INPUT_DIMS
        
        # 目前b卡在embedding的怎麼用
        # Catagory_embedding => 數值類Qcut後用linear來做embedding, 類別用nn.Embedding

        self.Lable_embedding = nn.Linear(L_dim, embedding_dim, dtype=torch.float)
        self.Sample_embedding = nn.Linear(S_dim, embedding_dim, dtype=torch.float)
        self.Catagory_embedding = nn.Linear(C_dim, embedding_dim, dtype=torch.float)
        self.Field_embedding = nn.Linear(F_dim, embedding_dim, dtype=torch.float)
        
        self.transformer_decoder = nn.TransformerDecoder(
            CustomTransformerDecoderLayer(embedding_dim,  nhead = 1 ),
            num_layers
        )
        
        # downstream task
        self.MLP = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim//3),
            nn.LeakyReLU(),
            #nn.Dropout(0.5),
            nn.Linear(embedding_dim//3, 2),
            #nn.LeakyReLU(),
            #nn.Dropout(0.5),
            nn.Softmax(dim=2),
            # nn.Sigmoid(),
        )


    def forward(self, target_dataset):
        L_input, S_input, C_input, F_input = target_dataset.INPUTS
        L, S, C, F = target_dataset.nodes_num['L'], target_dataset.nodes_num['S'], target_dataset.nodes_num['C'], target_dataset.nodes_num['F']
        masks = target_dataset.MASKS
        
        L_embedded = self.Lable_embedding(L_input.float()).unsqueeze(0)
        S_embedded = self.Sample_embedding(S_input.float()).unsqueeze(0)
        C_embedded = self.Catagory_embedding(C_input.float()).unsqueeze(0)
        F_embedded = self.Field_embedding(F_input.float()).unsqueeze(0)
        
        for mask in masks.keys():
            masks[mask] = masks[mask].to(DEVICE)
        
        # propagate steps: L→S→C→F
        #                  L←S←C←
        # more steps more menory usage
        PROPAGATE_STEPS = 3
        for i in range(PROPAGATE_STEPS):
            S_embedded = self.transformer_decoder(S_embedded,L_embedded, 
                                                memory_mask = masks['L2S'][:S,:L]) 
            C_embedded = self.transformer_decoder(C_embedded,S_embedded,
                                                memory_mask = masks['S2C'][:C,:S])
            F_embedded = self.transformer_decoder(F_embedded,C_embedded,
                                                memory_mask = masks['C2F'][:F,:C])
            C_embedded = self.transformer_decoder(C_embedded,F_embedded,
                                                memory_mask = Tensor.contiguous(masks['C2F'].transpose(0, 1))[:C,:F])
            S_embedded = self.transformer_decoder(S_embedded,C_embedded,
                                                memory_mask = Tensor.contiguous(masks['S2C'].transpose(0, 1))[:S,:C])
            L_embedded = self.transformer_decoder(L_embedded,S_embedded, 
                                                memory_mask = Tensor.contiguous(masks['L2S'].transpose(0, 1))[:L,:S])
        
        output = self.MLP(S_embedded)[0]
        return output

# 測試模型
num_layers = 1  # TransformerDecoder 的層數
embedding_dim = 2*128  # 嵌入維度
hidden_dim = 64  

print('input_dims', Train_data.INPUT_DIMS)
model = TransformerDecoderModel(Train_data, num_layers, embedding_dim, hidden_dim).to(DEVICE)
outputs = model(Train_data)

print("模型輸出的大小:", outputs.shape)

input_dims (2, 14, 437, 14)
模型輸出的大小: torch.Size([39074, 2])


In [17]:
outputs

tensor([[0.5407, 0.4593],
        [0.5539, 0.4461],
        [0.5335, 0.4665],
        ...,
        [0.5644, 0.4356],
        [0.5571, 0.4429],
        [0.5360, 0.4640]], device='cuda:0', grad_fn=<SelectBackward0>)

In [18]:
output_label = torch.argmax(outputs, dim=1)

In [21]:
# training
from torch import autograd
from torcheval.metrics.aggregation.auc import AUC

def train(model, datset):
    LABEL_POOL = datset.LABEL_POOL
    
    
    
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    epochs = 100
    for epoch in range(epochs):
        model.train()
        # with autograd.detect_anomaly():
        optimizer.zero_grad()
        outputs = model(datset)

        # caculate loss
        epoch_loss = criterion(outputs, torch.tensor(LABEL_POOL).to(DEVICE))

        # backpropagation
        epoch_loss.backward()
        optimizer.step()
        
        
        output_label = torch.argmax(outputs, dim=1)
        
        TRUE = torch.argmax(torch.tensor(LABEL_POOL).to(DEVICE), dim=1)

        metric = AUC()
        y_pred_first = torch.tensor([x[0] for x in outputs]).to(DEVICE)
        metric.update(TRUE,y_pred_first)
        epoch_AUC = float(metric.compute())

        epoch_acc = torch.sum(output_label == torch.argmax(torch.tensor(LABEL_POOL).to(DEVICE), dim=1)).item() / len(output_label)

        print(f"Epoch{epoch+1}/{epochs} | Loss: {epoch_loss} | Acc: {epoch_acc} | AUC: {epoch_AUC}")
model = TransformerDecoderModel(Train_data, num_layers, embedding_dim, hidden_dim).to(DEVICE)
train(model, Train_data)

Epoch1/100 | Loss: 0.6359372644687544 | Acc: 0.7595843783590112 | AUC: 0.643252968788147
Epoch2/100 | Loss: 0.582979273190791 | Acc: 0.7595843783590112 | AUC: 0.794161319732666
Epoch3/100 | Loss: 0.5646408041985379 | Acc: 0.7595843783590112 | AUC: 0.8971868753433228
Epoch4/100 | Loss: 0.5580641540911362 | Acc: 0.7595843783590112 | AUC: 0.9479806423187256
Epoch5/100 | Loss: 0.5557018130798023 | Acc: 0.7595843783590112 | AUC: 0.972641110420227
Epoch6/100 | Loss: 0.5547354501626469 | Acc: 0.7595843783590112 | AUC: 0.9849541187286377


KeyboardInterrupt: 

In [None]:
# self.Sample_embedding = nn.Linear(S_dim, embedding_dim, dtype=torch.half)
# self.Catagory_embedding = nn.Linear(C_dim, embedding_dim, dtype=torch.half)
# S_embedded = self.Sample_embedding(S_input.half()).unsqueeze(0)
# C_embedded = self.Catagory_embedding(C_input.half()).unsqueeze(0)
# xops.memory_efficient_attention(C_embedded, S_embedded, S_embedded,attn_mask)[0]