In [1]:
import pandas as pd
# baic transformer Encoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun
import numpy as np

main_df = pd.read_csv('adult.csv')
# main_df.head()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def POOL_preprocess(df):
    '''
    input the original dataframe, output the dataframe after preprocessing,
    change the numerical columns to categorical columns by qcut and cut
    then apply label encoding to all columns
    
    '''
    df = df.copy()
    CAT = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country', 'income']
    NUM = ['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week']
    # qcut on numerical columns
    for column in NUM:
        if column in ['educational-num','capital-gain','capital-loss','hours-per-week']:
            df[column] = pd.cut(df[column], 100)
        else:
            df[column] = pd.cut(df[column], 100)
    # make income column binary
    df['income'] = df['income'].apply(lambda x: 1 if x == '>50K' else 0)

    # lable encoding categorical columns
    from sklearn.preprocessing import LabelEncoder
    lb = LabelEncoder()
    df = df.apply(lambda x: lb.fit_transform(x))

    # make all catagory in every column unique

    # 迴圈處理多個欄位
    offset = 0
    for column in df.columns:
        df[column] = df[column].apply(lambda x: x + offset)
        offset += df[column].nunique()
    
    return df
tmp = POOL_preprocess(main_df)
tmp.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,8,78,97,164,185,199,209,220,225,229,230,253,341,437,440
1,21,78,88,174,187,197,207,217,227,229,230,253,351,437,440
2,11,76,104,170,190,197,213,217,227,229,230,253,341,437,441
3,27,78,93,178,188,197,209,217,225,229,237,253,341,437,441
4,1,74,89,178,188,199,202,220,227,228,230,253,331,437,440


In [3]:
tmp.describe()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
count,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0,48842.0
mean,21.643585,77.870439,94.497113,173.28842,188.078089,197.61875,208.5777,218.443287,226.668052,228.668482,230.626858,254.102596,341.396728,434.749355,440.239282
std,13.71051,1.464234,7.118204,3.874492,2.570973,1.507703,4.230509,1.602151,0.845986,0.470764,2.670115,5.196314,12.295271,7.775343,0.426649
min,0.0,74.0,83.0,163.0,179.0,195.0,202.0,217.0,223.0,228.0,230.0,253.0,302.0,398.0,440.0
25%,11.0,78.0,90.0,172.0,187.0,197.0,205.0,217.0,227.0,228.0,230.0,253.0,341.0,437.0,440.0
50%,20.0,78.0,94.0,174.0,188.0,197.0,209.0,218.0,227.0,229.0,230.0,253.0,341.0,437.0,440.0
75%,31.0,78.0,98.0,175.0,190.0,199.0,212.0,220.0,227.0,229.0,230.0,253.0,346.0,437.0,440.0
max,73.0,82.0,162.0,178.0,194.0,201.0,216.0,222.0,227.0,229.0,252.0,301.0,397.0,439.0,441.0


In [4]:
train_size = 4*48842//5
test_size = 48842//5
train_pool = main_df[test_size:]
test_pool = main_df[:test_size]
print('total data num:' , main_df.shape[0])
print('trian data num:' , train_pool.shape[0])
print('test data num:' , test_pool.shape[0])

total data num: 48842
trian data num: 39074
test data num: 9768


In [5]:
# notations
#   node: number of all nodes = L + S + C + F
#   L: number of lable nodes
#   S: number of sample nodes
#   C: number of catagory nodes
#   F: number of field(column) nodes
#   hidden: number of hidden representation

# data size = (node, hidden)
# mask size = (node, node - L) without lable nodes
#             for each node, real mask = cat[mask,(node,L)] = (node, node)
#             cannot see it's label node

# use nn.transformerEncoder(data,mask) to get the output
# use the above output as input of MLP to predict the lable   

In [6]:
TARGET_POOL = POOL_preprocess(main_df)
LABEL_COLUMN = 'income'
# 
HIDDEN = 64

# cut feature and lable
FEATURE_POOL = TARGET_POOL.drop(LABEL_COLUMN, axis=1)
LABEL_POOL = TARGET_POOL[LABEL_COLUMN]

# trasform label into one-hot
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
LABEL_POOL = enc.fit_transform(LABEL_POOL.values.reshape(-1,1)).toarray()

print('feature pool shape:', FEATURE_POOL.shape)
print('label pool shape:', LABEL_POOL.shape)

# L: number of lable nodes
L = LABEL_POOL.shape[1]

# S: number of sample nodes
S = FEATURE_POOL.shape[0]

# C: number of catagory nodes
C = FEATURE_POOL.apply(lambda x: x.nunique()).sum() # total_unique_labels
C_POOL = list(set(FEATURE_POOL.values.flatten()))

# F: number of field(column) nodes
F = FEATURE_POOL.shape[1]

P = 310

print('L:', L)
print('S:', S)
print('P:', P)
print('C:', C)
print('F:', F)
print('total', L+S+P+C+F)
print(C_POOL)
print(len(C_POOL))


feature pool shape: (48842, 14)
label pool shape: (48842, 2)
L: 2
S: 48842
P: 310
C: 440
F: 14
total 49608
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 20

In [7]:
LABEL_POOL

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [8]:
# caculate masking

edge_x = []
edge_y = []

# label to sample 
offset = (L,0) # (x,y)
label_ids = TARGET_POOL[LABEL_COLUMN].unique()
for i, value_df in enumerate(TARGET_POOL[LABEL_COLUMN]):
    for j, value_label in enumerate(label_ids):
        if value_label == value_df:
            edge_x.append(offset[0] + i)
            edge_y.append(offset[1] + j)
print(max(edge_x))

# sample to catagory
offset = (L+S+P,L) # (x,y)
tmp_df = TARGET_POOL.drop(LABEL_COLUMN, axis=1)
for i, value_df in enumerate(tmp_df.values):
    for j, value in enumerate(value_df):
        edge_x.append(offset[0] + value)
        edge_y.append(offset[1] + i)
print(max(edge_x))

# catagory to field
offset = (L+S+P+C,L+S+P) # (x,y)
unique_items = [(TARGET_POOL[column].unique()) for column in (TARGET_POOL.columns)]
for i in range(F):
    for j in (unique_items[i]):
        edge_x.append(offset[0] + i)
        edge_y.append(offset[1] + j)
print(max(edge_x))

# make edges symmetric
tmp = edge_x.copy()
edge_x += edge_y
edge_y += tmp

# make value on edge as 1
value_on_edge = [1]*len(edge_x)

print(len(edge_x))
print(len(edge_y))

# create mask as sparse tensor
indices = torch.tensor([edge_x, edge_y])
values = torch.tensor(value_on_edge)
size = torch.Size([L+S+P+C+F, L+S+P+C+F])
sparse_tensor_mask = torch.sparse_coo_tensor(indices, values, size)


48843


49593
49607
1466140
1466140


In [9]:
# make input tensor
# L
L_input = torch.eye(L).to('cuda')
print('L_input', L_input.type(), L_input.shape)
# S
S_input = torch.tensor(FEATURE_POOL.values).to('cuda')
print('S_input', S_input.type(), S_input.shape)
# C random init
C_input = torch.tensor(np.diag(C_POOL)).to('cuda')
print('C_input', C_input.type(), C_input.shape)
# F random init
F_input = torch.eye(F).to('cuda')
print('F_input', F_input.type(), F_input.shape)
print(L_input.type())
# 

L_input torch.cuda.FloatTensor torch.Size([2, 2])
S_input torch.cuda.LongTensor torch.Size([48842, 14])
C_input torch.cuda.LongTensor torch.Size([440, 440])
F_input torch.cuda.FloatTensor torch.Size([14, 14])
torch.cuda.FloatTensor


In [10]:
from torch import Tensor
from typing import Optional, Any, Union, Callable
from linear_mem_attention_pytorch.fast_attn import Attention

class CustomTransformerEncoderLayer(nn.TransformerEncoderLayer):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation='relu'):
        super().__init__(d_model, nhead, dim_feedforward, dropout, activation)
        # remove defined modules
        delattr(self, 'self_attn')
        
        self.self_attn = Attention(dim=d_model, heads = nhead, dim_head = 128, bias=False)
    
    def _sa_block(self, x: Tensor,
                  attn_mask: Optional[Tensor], key_padding_mask: Optional[Tensor]) -> Tensor:
        x = x[None, :]
        x = self.self_attn(x, x, attn_mask, query_chunk_size=1024, key_chunk_size=1024)
        return self.dropout1(x)

In [11]:
# baic transformer Encoder model
import torch
import torch.nn as nn
import torch.nn.functional as Fun

class TransformerEncoderModel(nn.Module):
    def __init__(self, nodes_num, num_layers, embedding_dim, hidden_dim):
        super(TransformerEncoderModel, self).__init__()

        L_dim, S_dim, C_dim, F_dim = nodes_num
        
        # 目前b卡在embedding的怎麼用
        # Catagory_embedding => 數值類Qcut後用linear來做embedding, 類別用nn.Embedding
        
        # self.Lable_embedding = nn.Embedding(L_dim, embedding_dim)
        # self.Sample_embedding = nn.Embedding(S_dim, embedding_dim)
        # self.Catagory_embedding = nn.Embedding(C_dim, embedding_dim)
        # self.Field_embedding = nn.Embedding(F_dim, emedding_dim)
        
        self.Lable_embedding = nn.Linear(L_dim, embedding_dim, dtype=torch.float)
        self.Sample_embedding = nn.Linear(S_dim, embedding_dim, dtype=torch.float)
        self.Catagory_embedding = nn.Linear(C_dim, embedding_dim, dtype=torch.float)
        self.Field_embedding = nn.Linear(F_dim, embedding_dim, dtype=torch.float)
        
        self.transformer_encoder = nn.TransformerEncoder(
            CustomTransformerEncoderLayer(embedding_dim,nhead = 4 ),
            num_layers
        )


    def forward(self, L_input, S_input, C_input, F_input, mask):
        L_embedded = self.Lable_embedding(L_input.float())
        S_embedded = self.Sample_embedding(S_input.float())
        C_embedded = self.Catagory_embedding(C_input.float())
        F_embedded = self.Field_embedding(F_input.float())
        
        # concat all embedded
        embedded = torch.cat((L_embedded, S_embedded, C_embedded, F_embedded),0)
        print('embedded shape', embedded.shape)
        print('embedded shape', embedded.type())
        print('mask shape', mask.bool().to_dense().shape)
        encoded = self.transformer_encoder(embedded, mask.bool().to_dense())  # 使用 TransformerEncoder 編碼
        print('encoded shape', encoded.shape)
        # return encoded.permute(1, 0, 2)  # 改變 tensor 的維度順序回來

# 測試模型
num_layers = 2  # TransformerEncoder 的層數
embedding_dim = 128  # 嵌入維度
hidden_dim = 64  # TransformerEncoderLayer 的隱藏層維度

input_dimantions = (L_input.size(1), S_input.size(1), C_input.size(1), F_input.size(1))
print('input_dimantions', input_dimantions)
model = TransformerEncoderModel(input_dimantions, num_layers, embedding_dim, hidden_dim).to('cuda')
output = model(L_input, S_input, C_input, F_input, sparse_tensor_mask.to('cuda'))

print("模型輸出的大小:", output.size())

input_dimantions (2, 14, 440, 14)


embedded shape torch.Size([49298, 128])
embedded shape torch.cuda.FloatTensor
mask shape torch.Size([49608, 49608])


RuntimeError: The expanded size of the tensor (1) must match the existing size (49608) at non-singleton dimension 0.  Target sizes: [1, 1024, 4, 1024].  Tensor sizes: [49608, 1, 1, 1024]