In [1]:
%load_ext autoreload
%autoreload 2

import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID" 
os.environ['CUDA_VISIBLE_DEVICES'] = '6'
import torch
import pandas as pd
from collections import defaultdict
from torch_geometric.data import InMemoryDataset, Data

import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.nn import GATConv
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from torch_geometric.loader import DataLoader

In [93]:
import os
from torch_geometric.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, root, node_embeddings, orders, order2nodes, order2labels=None, transform=None, pre_transform=None):
        self.root = root
        self.node_embeddings = node_embeddings
        self.orders = orders
        self.order2nodes = order2nodes
        self.order2labels = order2labels
        
        if not os.path.exists(self.processed_paths[0]):
            self.process()

        super(CustomDataset, self).__init__(root, transform, pre_transform)
        #print('self.raw_paths:', self.raw_paths)
        #print('self.processed_paths:', self.processed_paths)
        
        self.data_list = torch.load(self.processed_paths[0])
    
    
    @property
    def raw_file_names(self): # If raw data is not in the root directory, then call the download function.
        return []
    
    @property
    def processed_file_names(self): # If preprocessed data is not in the root directory, then call the process function.
        return ['task1_graphs.pt']
    
    def download(self):
        pass
    
    def process(self):
        data_list = []
        for order in self.orders:
            edges = []
            for i, node1 in enumerate(self.order2nodes[order]):
                for j, node2 in enumerate(self.order2nodes[order]):
                    if node1 != node2:
                        edges.append([i, j])
            
            #print('order:', order)
            #print('self.order2nodes[order]:', self.order2nodes[order])
            x = self.node_embeddings[self.order2nodes[order]]
            print('self.order2nodes[order]:', self.order2nodes[order])
            print('x:', x)
            edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
            y = self.order2labels[order]
            
            data = Data(x=x, edge_index=edge_index, y=y)
            data_list.append(data)
        
        #data, slices = self.collate(data_list)
        torch.save(data_list, self.processed_paths[0])
        
    def len(self):
        return len(self.data_list)
    
    def get(self, idx):
        return self.data_list[idx]
    

In [65]:
idx = [1, 2, 3]

a = torch.tensor([0, 1, 2, 3, 4, 5])

print(a[idx])

tensor([1, 2, 3])


# ToyData

In [5]:
embeddings = torch.arange(5).reshape(5,1)
order2labels = {0: 0, 1:1, 2:1}
order2nodes = {0: [0, 1], 1: [0, 2, 3], 2: [1, 2, 4]}

In [50]:
train_ds = CustomDataset(root='../data', node_embeddings=embeddings, order2labels=order2labels, order2nodes=order2nodes)

Data(x=[2, 1], edge_index=[2, 2], y=0)


In [51]:
print(train_ds[0].edge_index)

tensor([[0, 1],
        [1, 0]])


In [52]:
print(train_ds[1].edge_index)

tensor([[0, 0, 1, 1, 2, 2],
        [1, 2, 0, 2, 0, 1]])


In [54]:
print(train_ds[2].edge_index)

tensor([[0, 0, 1, 1, 2, 2],
        [1, 2, 0, 2, 0, 1]])


In [55]:
train_dl = DataLoader(train_ds, batch_size=2, shuffle=False)

In [56]:
for data in train_dl:
    print(data)

DataBatch(x=[5, 1], edge_index=[2, 8], y=[2], batch=[5], ptr=[3])
DataBatch(x=[3, 1], edge_index=[2, 6], y=[1], batch=[3], ptr=[2])


In [57]:
data1 = next(iter(train_dl))

print(data1)

DataBatch(x=[5, 1], edge_index=[2, 8], y=[2], batch=[5], ptr=[3])


In [65]:
print(data1.x)
print(data1.edge_index)
print(data1.y)
print(data1.batch)

tensor([[0],
        [2],
        [0],
        [2],
        [3]])
tensor([[0, 1, 2, 2, 3, 3, 4, 4],
        [1, 0, 3, 4, 2, 4, 2, 3]])
tensor([0, 1])
tensor([0, 0, 1, 1, 1])


# Test Custom dataset with real data

In [71]:
NUM_CUSTOMER =342039
NUM_PRODUCT = 58415

# GAT 모델 정의
class FirstGAT(torch.nn.Module):
    def __init__(self, in_channels, out_channels, heads=1, dropout=0.6):
        super(FirstGAT, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.customer_embedding = nn.Embedding(NUM_CUSTOMER, in_channels) # 342039
        self.product_embedding = nn.Embedding(NUM_PRODUCT, in_channels - 2) # 58415
        self.gat1 = GATConv(in_channels, 8, heads=heads, dropout=dropout)
        self.gat3 = GATConv(8*heads, out_channels, heads=1, concat=False, dropout=dropout)
        #self.gat_test = GATConv(in_channels, out_channels, heads=1, concat=False, dropout=dropout)
    
    def forward(self, prod_features, edge_index):
        
        customer_embeddings = self.customer_embedding(torch.arange(NUM_CUSTOMER).to(device))
        product_embeddings = self.product_embedding(torch.arange(NUM_PRODUCT).to(device))
        prod_features = torch.hstack([product_embeddings, prod_features])
        x = torch.vstack([customer_embeddings, prod_features])
        #print('shape:', x.shape)
        
        x = F.dropout(x, p=0.1, training=self.training)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=0.1, training=self.training)
        x = F.elu(self.gat3(x, edge_index))
        
        return x

In [4]:
saved_data = torch.load("/home/myeongjin/data-mining/src/training_data_save")

In [27]:
def save_model(path, first_model, second_model, opt, scheduler, loss, epoch, **kwargs):
    # kwargs 는 GAT 네트워크 파라미터 in_channels, out_channels, ...

    save_obj = {
            'GAT_params' : kwargs,
            'first_weights': first_model.state_dict(),
            'second_weights': second_model.state_dict(),
            'opt_state': opt.state_dict(),
            'scheduler_state': (scheduler.state_dict() if scheduler else None),
            'loss': loss,
            'epoch': epoch
        }

    #path = os.path.join(path, 'epo_'+str(epoch)+'loss_'+f"{loss:0.3f}")
    path = os.path.join(path, 'epo_'+str(epoch))
    torch.save(save_obj, path)

In [69]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

# 모델 생성
model1 = FirstGAT(10, 10, heads=4).to(device)
model1.train()

# data
saved_data = saved_data.to(device)
prod_features, edge_index, order2labels, order2nodes = saved_data.prod_features, saved_data.edge_index, saved_data.order2labels, saved_data.order2nodes 

node_features = model1(prod_features, edge_index)

device: cuda
shape: torch.Size([400454, 10])


In [70]:
print(node_features.shape)

torch.Size([400454, 10])


In [8]:
ts = torch.arange(2).reshape(2, 1).to('cuda')

print(ts)

tensor([[0],
        [1]], device='cuda:0')


In [12]:
#test_idx = torch.tensor([0, 1, 2])
test_idx = [0, 1, 2]

print(prod_features[test_idx])

tensor([[-0.9907,  0.6129],
        [ 0.0085, -0.0548],
        [-1.3010,  1.8149]], device='cuda:0')


In [13]:
training_orders = list(order2labels.keys())

In [82]:
nodes = torch.arange(NUM_CUSTOMER + NUM_PRODUCT).reshape(-1, 1)

In [90]:
print(nodes.shape)

torch.Size([400454, 1])


In [89]:
NUM_CUSTOMER

342039

In [92]:
print(order2nodes[326624])

[192219, 344111, 384742]


In [94]:
train_ds = CustomDataset(root='../data', node_embeddings=nodes, orders=training_orders, order2labels=order2labels, order2nodes=order2nodes)

self.order2nodes[order]: [192219, 396693, 399166]
x: tensor([[192219],
        [396693],
        [399166]])
self.order2nodes[order]: [322757, 365716, 397032]
x: tensor([[322757],
        [365716],
        [397032]])
self.order2nodes[order]: [305375, 396008]
x: tensor([[305375],
        [396008]])
self.order2nodes[order]: [210167, 383726, 355357, 368235]
x: tensor([[210167],
        [383726],
        [355357],
        [368235]])
self.order2nodes[order]: [66642, 356867, 364570]
x: tensor([[ 66642],
        [356867],
        [364570]])
self.order2nodes[order]: [67175, 383092, 365552, 370900]
x: tensor([[ 67175],
        [383092],
        [365552],
        [370900]])
self.order2nodes[order]: [78729, 375885]
x: tensor([[ 78729],
        [375885]])
self.order2nodes[order]: [225568, 387187, 357437, 381745, 381520, 360266, 397022]
x: tensor([[225568],
        [387187],
        [357437],
        [381745],
        [381520],
        [360266],
        [397022]])
self.order2nodes[order]: [293442, 3

In [87]:
print(train_ds[0])

Data(x=[3, 1], edge_index=[2, 6], y=0)


In [88]:
print(train_ds[0].x)

tensor([[192219],
        [396693],
        [399166]])


In [17]:
# before print(train_ds[0])

Data(x=[3, 3], edge_index=[2, 6], y=0)


In [74]:
train_dl = DataLoader(train_ds, batch_size=16) 

In [76]:
iterator = iter(train_dl)

data = next(iterator)

In [77]:
print(data)

DataBatch(x=[56, 3], edge_index=[2, 182], y=[16], batch=[56], ptr=[17])


In [79]:
print(data.x)

tensor([[-0.0456,  0.1405,  0.2278],
        [-0.1271,  0.1256,  0.2542],
        [ 0.2904,  0.2979,  0.1018],
        [ 1.0848, -0.2395, -0.3550],
        [ 0.3586,  0.3384,  0.2293],
        [ 0.1212,  0.1525, -0.2799],
        [ 0.1283,  0.1448, -0.2521],
        [ 0.2314,  0.1740,  0.0636],
        [ 0.2751,  0.1501,  0.1873],
        [ 0.4930,  0.3891,  0.4467],
        [ 0.3279,  0.1562,  0.1717],
        [ 0.7222,  0.3768,  0.0729],
        [ 0.7691,  0.2457,  0.3380],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.1060,  0.0815,  0.1212],
        [ 2.8266,  0.8957,  2.2240],
        [ 1.4502,  0.3533,  1.4634],
        [ 0.1340,  0.1835,  0.3068],
        [ 0.0000,  0.0000,  0.0000],
        [ 1.9344,  0.6038,  1.5723],
        [ 0.0836,  0.1247,  0.1380],
        [-0.1206,  0.0769,  0.1455],
        [ 0.0330,  0.1927,  1.0494],
        [-0.0576, -0.0126,  0.0916],
        [-0.3438, -0.2391,  0.1159],
        [ 0.0303,  0.1769,  0.9634],
        [ 0.0542,  0.0602,  0.2567],
 

In [60]:
from torch_geometric.nn.pool import global_mean_pool

class SecondGAT(torch.nn.Module):
    def __init__(self, in_channels, out_channels, heads=4, dropout=0.1):
        super(SecondGAT, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.gat1 = GATConv(in_channels, 8, heads=heads, dropout=dropout)
        self.gat3 = GATConv(8*heads, out_channels, heads=1, dropout=dropout)
        self.fc1 = nn.Linear(out_channels, 10)
        self.fc2 = nn.Linear(10, 3)
    
    def forward(self, x, edge_index, batch_idx):
        
        x = F.dropout(x, p=0.1)
        x = F.elu(self.gat1(x, edge_index))
        #x = F.elu(self.gat_test(x, edge_index))
        x = F.dropout(x, p=0.1)
        #x = F.elu(self.gat2(x, edge_index))
        #x = F.dropout(x, p=0.1, training=self.training)
        x = F.elu(self.gat3(x, edge_index))
        x = global_mean_pool(x, batch_idx)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.1)
        x = self.fc2(x)
        
        # pooling
        
        return x

In [61]:
def train(saved_data, dataloader, in_channels=10, out_channels=10, heads=8, epochs=10, batch_size=64, lr=1e-3, step_size=2, save_period=5000, save_path='saved_models/'):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    
    
    # 모델 생성
    first_model = FirstGAT(10, in_channels, heads=4).to(device)
    second_model = SecondGAT(in_channels, out_channels, heads=heads).to(device)
    

    # data
    saved_data = saved_data.to(device)
    prod_features, edge_index, order2labels, order2nodes = saved_data.prod_features, saved_data.edge_index, saved_data.order2labels, saved_data.order2nodes 


    # 손실 함수 및 옵티마이저 정의
    criterion = torch.nn.CrossEntropyLoss()
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-3)
    optimizer = torch.optim.Adam(list(first_model.parameters()) +  list(second_model.parameters()), lr=lr)
    scheduler = StepLR(optimizer, step_size=step_size, gamma=0.5) # 학습률 스케줄러 추가


    min_loss = 10
    #모델 학습
    first_model.train()
    second_model.train()
    for epoch in range(epochs):
        for i, data in enumerate(dataloader):
            node_features = first_model(prod_features, edge_index)
            
            x, edge_index, y, batch_idx = data.x, data.edge_index, data.y, data.batch
            edge_index = edge_index.to(device)
            y = torch.tensor(y).to(device)
            #print('edge device:',  edge_index.device)

            # edge_index=[2, 182], y=[16], batch=[56]
            
            out = second_model(x, edge_index, batch_idx)
            optimizer.zero_grad()
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            
            if i % 20 == 0:
                print(f'Epoch {epoch}, batch_idx {i}/{len(dataloader)+1} Loss: {loss.item()}')
                
            if loss < min_loss:
                min_loss = loss
                min_epoch = epoch
                save_model(path=save_path, first_model=first_model, second_model=second_model, opt=optimizer, scheduler=scheduler, loss=loss, epoch=epoch)
            
        scheduler.step() # 학습률 스케쥴러 업데이트
    
    print(f'min_loss: {min_loss:.3f}, epoch: {min_epoch}')

# 모델 평가 
def evaluate(model, prod_features, orders, order2nodes, edge_index, targets):
    model.eval()
    pred = model(prod_features, orders, order2nodes, edge_index).argmax(dim=1) #prod_features, orders, order2nodes, edge_index
    #print('pred:', pred)
    accuracy = (pred == targets).sum().item() / targets.size(0)
    print(f'Accuracy: {accuracy}')

In [62]:
train(first_model=model1, dataloader=train_dl, in_channels=10, out_channels=10, heads=8, epochs= 10, batch_size=64, lr=1e-2, step_size=2, save_period=10, save_path='/home/myeongjin/data-mining/saved_models/second_step_test')

device: cuda


Epoch 0, batch_idx 0/37153 Loss: 1.135193109512329
Epoch 0, batch_idx 20/37153 Loss: 1.156882405281067
Epoch 0, batch_idx 40/37153 Loss: 1.114025592803955
Epoch 0, batch_idx 60/37153 Loss: 1.1904520988464355
Epoch 0, batch_idx 80/37153 Loss: 1.152680516242981
Epoch 0, batch_idx 100/37153 Loss: 1.1646422147750854
Epoch 0, batch_idx 120/37153 Loss: 1.0752894878387451
Epoch 0, batch_idx 140/37153 Loss: 1.120540976524353
Epoch 0, batch_idx 160/37153 Loss: 1.1264289617538452
Epoch 0, batch_idx 180/37153 Loss: 1.0834499597549438
Epoch 0, batch_idx 200/37153 Loss: 1.0945990085601807
Epoch 0, batch_idx 220/37153 Loss: 1.030975580215454
Epoch 0, batch_idx 240/37153 Loss: 1.063046932220459
Epoch 0, batch_idx 260/37153 Loss: 1.1156682968139648
Epoch 0, batch_idx 280/37153 Loss: 1.1577715873718262
Epoch 0, batch_idx 300/37153 Loss: 1.0089219808578491
Epoch 0, batch_idx 320/37153 Loss: 1.0914503335952759
Epoch 0, batch_idx 340/37153 Loss: 1.0673242807388306
Epoch 0, batch_idx 360/37153 Loss: 0.9993

KeyboardInterrupt: 