In [10]:
import torch
from torch_geometric.data import Data
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
import torch_geometric
from tqdm import tqdm, trange
from torcheval.metrics import MulticlassAccuracy
from torcheval.metrics import BinaryAUROC
import numpy as np

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# DEVICE = torch.device('cuda')
print(DEVICE)

cuda


In [11]:
# get feature from csv
RAW_data = pd.read_csv('../data/adult.csv')
CAT = ['workclass','education','marital-status','occupation','relationship','race','gender','native-country']
NUM = ['age','fnlwgt','educational-num','capital-gain','capital-loss','hours-per-week']
LABEL = 'income'
# RAW_data = pd.read_csv('../data/compass_old.csv')
# CAT=['sex','age_cat','race','c_charge_degree','decile_score.1','score_text','v_type_of_assessment','v_decile_score','v_score_text']
# NUM=['age','juv_fel_count','juv_misd_count','juv_other_count','priors_count','days_b_screening_arrest','c_days_from_compas','end']
# LABEL = 'is_recid'
# convert categorical data to ordinal data
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
data_pd = RAW_data.copy()
data_pd[CAT] = enc.fit_transform(RAW_data[CAT])
# data_pd = pd.get_dummies(RAW_data, columns=CAT, dtype=float)
# label to category
data_pd[LABEL] = data_pd[LABEL].astype('category').cat.codes

# realign data to num + cat
data_pd = data_pd[NUM + CAT + [LABEL]]

# caculate unique value of each categorical feature
cat_num = [len(data_pd[col].unique()) for col in CAT]

# normalize numerical data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_pd[NUM] = scaler.fit_transform(data_pd[NUM])

# convert data to tensor
x = torch.tensor(data_pd.drop(columns=[LABEL]).values, dtype=torch.float, device=DEVICE)  # [48842, 108]
y = torch.tensor(data_pd[LABEL].values, dtype=torch.long, device=DEVICE) # [48842]
print(x.shape, y.shape)
print(cat_num)
data_pd

torch.Size([48842, 14]) torch.Size([48842])
[9, 16, 7, 15, 6, 5, 2, 42]


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender,native-country,income
0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,4.0,1.0,4.0,7.0,3.0,2.0,1.0,39.0,0
1,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.772930,4.0,11.0,2.0,5.0,0.0,4.0,1.0,39.0,0
2,-0.776316,1.394723,0.747550,-0.144804,-0.217127,-0.034087,2.0,7.0,2.0,11.0,0.0,4.0,1.0,39.0,1
3,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,4.0,15.0,2.0,7.0,0.0,2.0,1.0,39.0,1
4,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,0.0,15.0,4.0,0.0,3.0,4.0,0.0,39.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.849254,0.640492,0.747550,-0.144804,-0.217127,-0.195490,4.0,7.0,2.0,13.0,5.0,4.0,0.0,39.0,0
48838,0.098933,-0.334178,-0.419335,-0.144804,-0.217127,-0.034087,4.0,11.0,2.0,7.0,0.0,4.0,1.0,39.0,1
48839,1.411808,-0.357510,-0.419335,-0.144804,-0.217127,-0.034087,4.0,11.0,6.0,1.0,4.0,4.0,0.0,39.0,0
48840,-1.213941,0.111984,-0.419335,-0.144804,-0.217127,-1.648120,4.0,11.0,4.0,1.0,3.0,4.0,1.0,39.0,0


In [12]:
class K_graph(torch.nn.Module):
    def __init__(self, NUM, CAT, LABEL, cat_num):
        super(K_graph, self).__init__()
        '''
        num_cols: number of numerical columns
        cat_cols: number of categorical columns
        label_cols: number of label columns
        cat_num: number of unique value of each categorical columns
        '''
        self.hidden_dim = 128
        
        # order: num -> cat -> label
        self.num_cols = len(NUM)
        self.cat_cols = len(CAT)
        self.label_cols = len(LABEL)
        self.number_of_columns = self.num_cols + self.cat_cols 
        
        
        # numerical feature
        self.num_embeddings = torch.nn.ModuleList([torch.nn.Linear(1, self.hidden_dim) for i in range(self.num_cols)])
        # categorical feature
        self.cat_embeddings = torch.nn.ModuleList([torch.nn.Embedding(cat_num[i], self.hidden_dim) for i in range(self.cat_cols)])
        
        self.prediction = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_dim * self.number_of_columns, self.hidden_dim),
            torch.nn.ReLU(),
            torch.nn.LayerNorm(self.hidden_dim),
            torch.nn.Linear(self.hidden_dim, self.label_cols + 1)
        )
        
        # feature importance learning
        self.feature_importance_learners = torch.nn.ModuleList([torch.nn.Sequential(
            torch.nn.Linear(self.hidden_dim, 1),
        ) for i in range(self.number_of_columns)])
        
        
    def forward(self, input_data, epoch = -1):
        
        # make feature embedding
        num_data = input_data[:,:self.num_cols].unsqueeze(-1).unsqueeze(-1) 
        feature_embedding_num = torch.cat([self.num_embeddings[i](num_data[:,i]) for i in range(self.num_cols)], dim=1).reshape(len(input_data), -1) # [batch_size, num_cols * hidden_dim]
        feature_embedding_num = torch.nn.ReLU()(feature_embedding_num)
        feature_embedding_num = torch.layer_norm(feature_embedding_num, feature_embedding_num.shape)
        # categorical feature
        feature_embedding_cat = torch.cat([self.cat_embeddings[i](input_data[:,self.num_cols+i].long()) for i in range(self.cat_cols)], dim=1) # [batch_size, cat_cols * hidden_dim]
        feature_embedding_cat = torch.layer_norm(feature_embedding_cat, feature_embedding_cat.shape)
        # concat
        feature_embedding = torch.cat((feature_embedding_num, feature_embedding_cat), dim=1) # [batch_size, (num_cols + cat_cols) * hidden_dim]
        # print(feature_embedding.shape)
        # feature_embedding = feature_embedding.reshape((len(input_data), self.number_of_columns, -1)) # [batch_size, (num_cols + cat_cols), hidden_dim]
        
        # # feature importance learning
        # feature_importance_ = torch.cat([self.feature_importance_learners[i](feature_embedding[:,i*self.hidden_dim:(i+1)*self.hidden_dim]) for i in range(self.number_of_columns)], dim=1) # [batch_size, num_cols + cat_cols, 1]
        # feature_importance = torch.softmax(feature_importance_, dim=1) # [batch_size, num_cols + cat_cols, 1]
        # feature_importance = torch.layer_norm(feature_importance, feature_importance.shape)
        # # print('feature_importance',feature_importance.sum(dim=0)/len(input_data))
        # feature_embedding = feature_embedding.reshape((len(input_data),self.number_of_columns, -1)) * feature_importance.unsqueeze(-1) # [batch_size, (num_cols + cat_cols) * hidden_dim]
        # feature_embedding = feature_embedding.reshape((len(input_data), -1)) # [batch_size, (num_cols + cat_cols) * hidden_dim]
        
        
        # make prediction
        prediction = self.prediction(feature_embedding)
        
        
        return prediction


In [13]:
the_model = K_graph(NUM, CAT, [LABEL], cat_num).to(DEVICE)
optimizer = torch.optim.SGD(the_model.parameters(), lr=0.001)

# optimizer.step()
data_count = 100
# random pick data
indices = torch.randperm(len(x))[:data_count]
print(indices)
train_data = x[indices]
train_label = y[indices]

for i in range(1):
    
    optimizer.zero_grad()
    output = the_model(train_data[:data_count], epoch=200)
    loss = torch.nn.functional.cross_entropy(output, train_label[:data_count])
    loss.backward()
    # print(((the_model.feature_importance_learners.grad).abs().max(dim=1)[0]))
    optimizer.step()
    
    print('-----------------------------------------')


tensor([48511, 30917, 26725, 21784,  4636, 28281, 23661, 14621, 48500, 28845,
         9172, 11413,  8184, 24427, 39428, 24792, 45015,  1472, 47274, 42164,
          618, 44699, 48374, 24245, 46329, 38705,  5791, 25825, 20397, 38938,
        32714,  6203, 17434, 31866, 15415,  8377,  2962, 13995, 19808, 37027,
        17970,  6518, 15183,  6568, 44975,  5160, 44515, 20400, 28787, 13316,
        44306, 28330, 30658, 24987, 46105, 22075, 36253, 20690, 31050, 13061,
        27015,    74, 30715, 20362, 16385,  4839,  8524, 35403, 36674, 44473,
        39099, 10081, 19897, 17738, 33455, 35643,  1034, 24978, 34058, 25843,
        44109,  6409,   945,  6433, 45752, 22484, 34206,  6658, 35371, 46258,
        39902,  8322, 47861, 10925, 40932, 47223, 12875, 35988,  2969,  2387])
-----------------------------------------


In [14]:

def train_epoch(model, optimizer, datas, batch_size, epoch):
    train_data, train_label, validation_data, validation_label = datas
    
    # slice data into batch
    train_data = torch.split(train_data, batch_size)
    train_label = torch.split(train_label, batch_size)

    # losses and metrics
    batch_loss = 0
    train_acc = MulticlassAccuracy(num_classes=2).to(DEVICE)
    train_auc = BinaryAUROC().to(DEVICE)
    valid_acc = MulticlassAccuracy(num_classes=2).to(DEVICE)
    valid_auc = BinaryAUROC().to(DEVICE)
    
    # train the model
    stepper = trange(len(train_data))
    for i in stepper:
        stepper.set_description(f'Epoch {epoch}')
        
        optimizer.zero_grad()
        output = model(train_data[i], epoch=epoch)
        loss = torch.nn.functional.cross_entropy(output, train_label[i]) * model.number_of_columns
        loss.backward()
        optimizer.step()
        batch_loss += loss.item()
        
        # metrics
        preds = output.softmax(dim=1)
        true = torch.nn.functional.one_hot(train_label[i], num_classes=2).to(DEVICE)
        train_acc.update(torch.argmax(preds, 1),true.T[1])
        train_auc.update(preds.T[0],true.T[0])
        
        # at the end of epoch, print result and validate the model
        if i == len(train_data) - 1:
            train_acc = train_acc.compute()
            train_auc = train_auc.compute()
            stepper.set_postfix({'loss': round(batch_loss/(i+1), 3), 'acc': round(train_acc.item(), 3), 'AUC': round(train_auc.item(), 3)})
            stepper.update()
        
            with torch.no_grad():
                output = model(validation_data, epoch=200)
                # loss = torch.nn.functional.cross_entropy(output, validation_label[i])
                preds = output.softmax(dim=1)
                true = torch.nn.functional.one_hot(validation_label, num_classes=2).to(DEVICE)
                valid_acc.update(torch.argmax(preds,1),true.T[1])
                valid_auc.update(preds.T[0],true.T[0])
            stepper.set_postfix({'loss': round(batch_loss/(i+1), 3), 'acc': round(train_acc.item(), 3), 'AUC': round(train_auc.item(), 3), 'val_acc': round(valid_acc.compute().item(), 3), 'val_AUC': round(valid_auc.compute().item(), 3)})



In [15]:
def overall_train(x, y):
    # hyperparameter
    epoch = 50
    batch_size = 1024

    
    # shuffle data
    indices = torch.randperm(len(x))
    x = x[indices]
    y = y[indices]
    # slice data into train and test and validation
    train_ratio = 0.7
    validation_ratio = 0.1
    train_data = x[:int(len(x)*train_ratio)]
    train_label = y[:int(len(x)*train_ratio)]
    validation_data = x[int(len(x)*train_ratio):int(len(x)*(train_ratio+validation_ratio))]
    validation_label = y[int(len(x)*train_ratio):int(len(x)*(train_ratio+validation_ratio))]
    test_data = x[int(len(x)*(train_ratio+validation_ratio)):]
    test_label = y[int(len(x)*(train_ratio+validation_ratio)):]

    # build model and optimizer
    the_model = K_graph(NUM, CAT, [LABEL], cat_num).to(DEVICE)
    optimizer = torch.optim.SGD(the_model.parameters(), lr=0.001)
    
    # train the model
    datas = (train_data, train_label, validation_data, validation_label)
    for i in range(epoch):
        train_epoch(the_model, optimizer, datas, batch_size, epoch=i+1)
    
    # test the model
    with torch.no_grad():
        output = the_model(test_data, epoch=200)
        preds = output.softmax(dim=1)
        true = torch.nn.functional.one_hot(test_label, num_classes=2).to(DEVICE)
        test_acc = MulticlassAccuracy(num_classes=2).to(DEVICE)
        test_auc = BinaryAUROC().to(DEVICE)
        test_acc.update(torch.argmax(preds,1),true.T[1])
        test_auc.update(preds.T[0],true.T[0])
        print('test_acc:', test_acc.compute().item())
        print('test_auc:', test_auc.compute().item())
        print('-----------------------------------------')

In [22]:
overall_train(x, y)

Epoch 1:   0%|          | 0/34 [00:00<?, ?it/s]

Epoch 1: 100%|██████████| 34/34 [00:00<00:00, 159.11it/s, loss=5.94, acc=0.809, AUC=0.819, val_acc=0.847, val_AUC=0.893]
Epoch 2: 100%|██████████| 34/34 [00:00<00:00, 164.89it/s, loss=4.57, acc=0.849, AUC=0.902, val_acc=0.847, val_AUC=0.904]
Epoch 3: 100%|██████████| 34/34 [00:00<00:00, 175.00it/s, loss=4.52, acc=0.85, AUC=0.904, val_acc=0.853, val_AUC=0.907]
Epoch 4: 100%|██████████| 34/34 [00:00<00:00, 164.62it/s, loss=4.4, acc=0.854, AUC=0.909, val_acc=0.853, val_AUC=0.909]
Epoch 5: 100%|██████████| 34/34 [00:00<00:00, 162.78it/s, loss=4.36, acc=0.855, AUC=0.91, val_acc=0.856, val_AUC=0.909]
Epoch 6: 100%|██████████| 34/34 [00:00<00:00, 188.18it/s, loss=4.32, acc=0.857, AUC=0.912, val_acc=0.855, val_AUC=0.91]
Epoch 7: 100%|██████████| 34/34 [00:00<00:00, 160.40it/s, loss=4.29, acc=0.857, AUC=0.913, val_acc=0.856, val_AUC=0.91]
Epoch 8: 100%|██████████| 34/34 [00:00<00:00, 159.08it/s, loss=4.28, acc=0.858, AUC=0.914, val_acc=0.857, val_AUC=0.91]
Epoch 9: 100%|██████████| 34/34 [00:00

test_acc: 0.8497287631034851
test_auc: 0.9133439756728756
-----------------------------------------


In [23]:
sum([0.8570989966392517,
     0.8552564382553101,
     0.857201337814331,
     0.854437530040741,
     0.8497287631034851,
     ])/5

0.8547446131706238

In [24]:
sum([0.9126779092790669,
     0.91077120905494,
     0.9158374102217134,
     0.9109161840761032,
     0.9133439756728756])/5

0.9127093376609399