In [1]:
import argparse
import os
import torch
import gensim
import dgl
import dgl.function as fn
import torch.optim as optim
import math
import time
import numpy as np
import discord_notify as dn
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import nn
from encoder import DocuEncoder, ClassEncoder, DocumentTokenizer
from layer import GCN
from classifier import TextClassifier
from preprocessor import TaxoDataManager, DocumentManager
from gensim.test.utils import datapath
from gensim.models import word2vec

Using backend: pytorch
[07:39:51] /opt/dgl/src/runtime/tensordispatch.cc:43: TensorDispatcher: dlopen failed: /root/mambaforge/envs/text-classifier/lib/python3.7/site-packages/dgl/tensoradapter/pytorch/libtensoradapter_pytorch_1.10.0.so: cannot open shared object file: No such file or directory


In [2]:
# --- Configuration ----------------------------------------------------------
DATA_ROOT = 'data/'    # root folder for the pretrained embedding, training data and checkpoints
TOKEN_LENGTH = 500     # handed to DocumentTokenizer and used as the per-document token count in create_dataset
CLASS_LENGTH = 768     # second element of the dimension pair passed to TextClassifier
BATCH_SIZE = 4         # mini-batch size for both DataLoaders

# --- Shared model components -------------------------------------------------
word2vec_model = word2vec.Word2Vec.load(os.path.join(DATA_ROOT, 'pretrained/embedding'))
document_tokenizer = DocumentTokenizer(DATA_ROOT, TOKEN_LENGTH)
dim = word2vec_model.wv.vector_size
# NOTE(review): the three `dim` arguments and the `2` are presumably the
# input/hidden/output sizes and the layer count of the GCN - confirm in layer.py.
gcn = GCN(dim, dim, dim, 2, nn.ReLU())
class_encoder = ClassEncoder(gcn, word2vec_model)

# --- Progress notifications ---------------------------------------------------
# SECURITY: a webhook URL is a credential - anyone holding it can post to the
# channel. Prefer supplying it through the DISCORD_WEBHOOK_URL environment
# variable. The literal below is kept only so existing runs keep working; it
# has been committed to source, so it should be rotated and then removed.
notifier = dn.Notifier(os.environ.get(
    'DISCORD_WEBHOOK_URL',
    'https://discord.com/api/webhooks/917284193036275712/2Da9DmvQjYugyP8pzvB4AzPMqVEizyVipHYLDPE79ZySU2aPGL3imH-YdcqkiUZxf_ku'))

def create_dataset(data_name, document_file, token_length, taxo_manager, num_val=None):
    """Build the tensor dataset for one corpus and optionally split off a validation set.

    For every document the input row is its token ids followed by a per-class
    0/1 mask, and the target is a ``(num_classes, 1)`` float tensor. For each
    class index returned in ``non_negative``: if it is also in ``positive`` the
    target is set to 1, otherwise the mask entry is set to 0 (presumably so the
    classifier ignores that class for this document - TODO confirm against
    TextClassifier).

    Args:
        data_name: Corpus name; selects ``training_data/<data_name>/`` under DATA_ROOT.
        document_file: Document file name handed to DocumentManager.
        token_length: Number of token ids per document; used to shape the input rows.
        taxo_manager: Loaded TaxoDataManager; supplies the class graph.
        num_val: If given, number of samples to split off for validation.

    Returns:
        A ``TensorDataset`` when ``num_val`` is None, otherwise the two-element
        result of ``random_split`` in the order (validation, training).

    Raises:
        ValueError: If the document manager yields no documents.
    """
    elapsed_start = time.time()
    training_data_dir = os.path.join(DATA_ROOT, f'training_data/{data_name}/')
    training_document_manager = DocumentManager(
        document_file, training_data_dir, f'{data_name}_train',
        document_tokenizer.Tokenize, taxo_manager, force_token_reload=True)
    training_document_manager.load_tokens()
    training_document_manager.load_dicts()

    # There is no module-level `graph`; take the class graph from the taxonomy
    # manager that the caller passes in instead of relying on a stray global.
    num_classes = len(taxo_manager.get_graph().nodes())
    training_document_ids = training_document_manager.get_ids()

    # Collect per-document rows and stack once at the end; concatenating onto a
    # growing tensor inside the loop copies all previous rows every iteration.
    input_rows = []
    target_rows = []
    for document_id in training_document_ids:
        tokens = torch.tensor(training_document_manager.get_tokens(document_id), dtype=torch.int32).reshape(-1)
        positive, non_negative = training_document_manager.get_output_label(document_id)
        output = torch.zeros(num_classes, 1)
        mask = torch.ones(num_classes, dtype=torch.int32)

        for j in non_negative:
            if j in positive:
                output[j][0] = 1
            else:
                mask[j] = 0

        input_rows.append(torch.cat((tokens, mask)))
        target_rows.append(output)

    if not input_rows:
        raise ValueError(f'no documents found for {data_name!r} in {document_file!r}')

    # The reshape keeps the original check that every row is exactly
    # token_length + num_classes wide.
    train_x = torch.stack(input_rows).reshape(-1, num_classes + token_length)
    train_y = torch.stack(target_rows)

    dataset = TensorDataset(train_x, train_y)
    num_dataset = len(dataset)

    if num_val is not None:
        dataset = random_split(dataset, [num_val, num_dataset - num_val])

    notifier.send(f'{num_dataset}개 데이터셋 생성 완료. 걸린 시간: {round(time.time() - elapsed_start, 2)}.')

    return dataset
    
    

def train_coreclass_epoch(text_classifier, train_dataloader, loss_function, optimizer, update_period=8):
    """Run one supervised training epoch with gradient accumulation.

    Gradients are accumulated over ``update_period`` batches before each
    optimizer step. Any gradients left over when the loader is exhausted are
    applied as a final step instead of being discarded.

    Args:
        text_classifier: Model called on each input batch.
        train_dataloader: Iterable of ``(inputs, outputs)`` batches; must support ``len``.
        loss_function: Callable ``(predicted, outputs) -> loss tensor``.
        optimizer: Optimizer over the model's parameters.
        update_period: Number of batches accumulated per optimizer step.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    train_loss = 0.0
    pending = 0  # batches whose gradients have not been applied yet
    optimizer.zero_grad()

    for inputs, outputs in train_dataloader:
        # Same device handling as the validation helpers.
        if torch.cuda.is_available():
            inputs, outputs = inputs.cuda(), outputs.cuda()

        predicted = text_classifier(inputs)
        loss = loss_function(predicted, outputs)
        loss.backward()
        train_loss += loss.item()
        pending += 1

        if pending >= update_period:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    # Flush the trailing partial window; otherwise those gradients would be
    # thrown away by the zero_grad() at the start of the next epoch.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    return train_loss / len(train_dataloader)

def validate_coreclass_epoch(model, dataloader, criterion):
    """Compute the mean supervised loss over a validation loader.

    The model is switched to evaluation mode and autograd is disabled for the
    duration of the pass; the model's previous train/eval mode is restored
    afterwards, so callers do not need to re-enable training mode themselves.

    Args:
        model: Model called on each data batch.
        dataloader: Iterable of ``(data, labels)`` batches; must support ``len``.
        criterion: Callable ``(predicted, labels) -> loss tensor``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    valid_loss = 0.0
    was_training = model.training
    model.eval()
    try:
        # No graph is needed for validation; skipping it saves memory and time.
        with torch.no_grad():
            for data, labels in dataloader:
                if torch.cuda.is_available():
                    data, labels = data.cuda(), labels.cuda()

                target = model(data)
                loss = criterion(target, labels)
                valid_loss += loss.item()
    finally:
        model.train(was_training)

    return valid_loss / len(dataloader)

def print_epoch_result(index, train_loss, val_loss, elapsed_time):
    """Print a one-line, tab-separated summary of a finished epoch.

    Args:
        index: Zero-based epoch index; printed one-based.
        train_loss: Training loss for the epoch (rounded to 3 decimals).
        val_loss: Validation loss for the epoch (rounded to 3 decimals).
        elapsed_time: Epoch duration as measured by the caller.
    """
    # Use the function's own parameters; names such as `valid_loss` or `start`
    # do not exist in this scope.
    print(f'[{index + 1}]\ttrain loss: {round(train_loss, 3)}\tvalidation_loss: {round(val_loss, 3)}\telapsed time: {elapsed_time}')

def train_coreclass(text_classifier, epoch, train_dataloader, loss_function, optimizer, valid_dataloader, save_path):
    """Run the supervised core-class training loop.

    Moves the model to CUDA, then for each of ``epoch`` epochs trains,
    validates, checkpoints the model to ``save_path`` whenever the validation
    loss improves, and prints a per-epoch summary. A notification with the
    total wall-clock time is sent at the end.
    """
    started_at = time.time()
    text_classifier.cuda()
    best_valid_loss = np.inf

    for epoch_index in range(epoch):
        epoch_started_at = time.time()

        epoch_train_loss = train_coreclass_epoch(
            text_classifier, train_dataloader, loss_function, optimizer)
        epoch_valid_loss = validate_coreclass_epoch(
            text_classifier, valid_dataloader, loss_function)
        best_valid_loss = save_alltime_best(
            text_classifier, epoch_valid_loss, best_valid_loss, save_path)

        epoch_elapsed = time.time() - epoch_started_at
        print_epoch_result(epoch_index, epoch_train_loss, epoch_valid_loss, epoch_elapsed)

    notifier.send(f'{epoch} epoch 코어클래스 학습 완료. 걸린 시간: {round(time.time() - started_at, 2)}.')

def save_alltime_best(model, val_loss, min_val_loss, save_path):
    """Checkpoint the model when the validation loss beats the best so far.

    Args:
        model: Module whose ``state_dict()`` is saved.
        val_loss: Validation loss of the epoch that just finished.
        min_val_loss: Best (lowest) validation loss seen before this epoch.
        save_path: Destination file for ``torch.save``.

    Returns:
        The updated best validation loss.
    """
    improved = min_val_loss > val_loss
    if not improved:
        return min_val_loss

    print('Validation loss decreased. Saving the model...')
    torch.save(model.state_dict(), save_path)
    print('Model saved')
    return val_loss
    
def safe_div(a, b, epsilon=1e-8):
    """Divide ``a`` by ``b`` after raising every element of ``b`` to at least ``epsilon``."""
    floored_denominator = b.clamp(min=epsilon)
    return a / floored_denominator

def safe_log(a, epsilon=1e-8):
    """Natural log of ``a`` after raising every element to at least ``epsilon``."""
    floored = a.clamp(min=epsilon)
    return torch.log(floored)

def target_distribution(prediction):
    """Compute the self-training target from the model's predictions.

    Each prediction ``p`` and its complement ``1 - p`` are squared and divided
    by their respective sums over axis 0 (presumably the batch axis - TODO
    confirm); the result is the squared-``p`` weight's share of the two weights.
    """
    complement = 1 - prediction
    positive_weight = safe_div(prediction ** 2, prediction.sum(axis=0))
    negative_weight = safe_div(complement ** 2, complement.sum(axis=0))
    total_weight = positive_weight + negative_weight
    return safe_div(positive_weight, total_weight)

def validate_self(model, dataloader, criterion):
    """Compute the mean self-training loss over a validation loader.

    Labels in the loader are ignored; the target for each batch is derived
    from the model's own predictions via ``target_distribution``. The model is
    put in evaluation mode with autograd disabled for the pass, and its
    previous train/eval mode is restored afterwards.

    Args:
        model: Model called on each data batch.
        dataloader: Iterable of ``(data, labels)`` batches; must support ``len``.
        criterion: Callable ``(predicted, target) -> loss tensor``.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    valid_loss = 0.0
    was_training = model.training
    model.eval()
    try:
        # No graph is needed for validation; skipping it saves memory and time.
        with torch.no_grad():
            for data, _ in dataloader:
                if torch.cuda.is_available():
                    data = data.cuda()

                predicted = model(data)
                target = target_distribution(predicted)
                loss = criterion(predicted, target)
                valid_loss += loss.item()
    finally:
        model.train(was_training)

    return valid_loss / len(dataloader)

def train_self_epoch(model, train_dataloader, loss_function, optimizer, update_period):
    """Run one self-training epoch with gradient accumulation.

    Labels in the loader are ignored. For every batch the target is computed
    from the model's own predictions and treated as a constant, and the loss
    between prediction and target is back-propagated. The optimizer steps after
    every ``update_period`` batches, plus once more at the end for any
    remaining accumulated gradients.

    Args:
        model: Model being trained.
        train_dataloader: Iterable of ``(inputs, labels)`` batches; must support ``len``.
        loss_function: Callable ``(predicted, target) -> loss tensor``.
        optimizer: Optimizer over the model's parameters.
        update_period: Number of batches accumulated per optimizer step.

    Returns:
        Sum of the per-batch losses divided by the number of batches.
    """
    train_loss = 0.0
    pending = 0  # batches whose gradients have not been applied yet
    optimizer.zero_grad()

    for inputs, _ in train_dataloader:
        # Same device handling as the validation helpers.
        if torch.cuda.is_available():
            inputs = inputs.cuda()

        # Call the `model` argument; this function has no `text_classifier`.
        predicted = model(inputs)
        # The target is a fixed reference for this step, so no gradient should
        # flow back through it.
        target = target_distribution(predicted.detach())
        loss = loss_function(predicted, target)
        loss.backward()
        train_loss += loss.item()
        pending += 1

        # Count completed batches, so the first step happens after a full
        # window rather than after the very first batch.
        if pending >= update_period:
            optimizer.step()
            optimizer.zero_grad()
            pending = 0

    # Flush the trailing partial window instead of discarding its gradients.
    if pending:
        optimizer.step()
        optimizer.zero_grad()

    return train_loss / len(train_dataloader)

def kl_div_loss(predicted, target):
    """Sum over all elements of ``target * log(target / predicted)``.

    Both the division and the logarithm go through the clamped helpers
    ``safe_div`` and ``safe_log``.
    """
    ratio = safe_div(target, predicted)
    weighted_log_ratio = target * safe_log(ratio)
    return weighted_log_ratio.sum()

def train_self(text_classifier, epoch, train_dataloader, loss_function, optimizer, update_period, valid_dataloader, save_path):
    """Run the self-training loop.

    Moves the model to CUDA, then for each of ``epoch`` epochs runs a
    self-training pass, validates, checkpoints the model to ``save_path``
    whenever the validation loss improves, and prints a per-epoch summary.
    A notification with the total wall-clock time is sent at the end.
    """
    text_classifier.cuda()
    started_at = time.time()
    best_valid_loss = np.inf

    for epoch_index in range(epoch):
        epoch_started_at = time.time()

        epoch_train_loss = train_self_epoch(
            text_classifier, train_dataloader, loss_function, optimizer, update_period)
        epoch_valid_loss = validate_self(text_classifier, valid_dataloader, loss_function)
        best_valid_loss = save_alltime_best(
            text_classifier, epoch_valid_loss, best_valid_loss, save_path)

        epoch_elapsed = time.time() - epoch_started_at
        print_epoch_result(epoch_index, epoch_train_loss, epoch_valid_loss, epoch_elapsed)

    notifier.send(f'{epoch} epoch 자기 학습 완료. 걸린 시간: {round(time.time() - started_at, 2)}.')


In [3]:
def pipeline(data_name, data_filename, num_val, self_training=True,
             coreclass_epochs=20, self_epochs=5, update_period=25):
  """Train a text classifier end to end for one corpus.

  Loads the taxonomy, builds the train/validation datasets, constructs the
  classifier and optimizer, runs supervised core-class training and then,
  optionally, self-training. The best checkpoint is written to
  ``trained/text-classifier-<data_name>.pt`` under DATA_ROOT.

  Args:
    data_name: Corpus name; selects ``training_data/<data_name>/`` under DATA_ROOT.
    data_filename: Document file name passed to ``create_dataset``.
    num_val: Number of samples held out for validation.
    self_training: Whether to run the self-training stage after core-class training.
    coreclass_epochs: Number of supervised epochs.
    self_epochs: Number of self-training epochs.
    update_period: ``update_period`` forwarded to ``train_self``.
  """
  taxo_manager = TaxoDataManager(os.path.join(DATA_ROOT, f'training_data/{data_name}/'), 'taxonomy.json', data_name, word2vec_model, force_reload=True)
  taxo_manager.load_all()
  graph = taxo_manager.get_graph().to('cuda:0')
  features = taxo_manager.get_feature().cuda()

  # create_dataset returns the split in (validation, training) order.
  val_dataset, train_dataset = create_dataset(data_name, data_filename, TOKEN_LENGTH, taxo_manager, num_val)
  val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True)
  train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

  text_classifier = TextClassifier(class_encoder, DocuEncoder(DATA_ROOT), (dim, CLASS_LENGTH), TOKEN_LENGTH, graph, features, nn.Sigmoid(), False)

  # The document encoder gets its own, much smaller learning rate; the other
  # parameter groups fall back to the optimizer-wide default.
  optimizer = optim.AdamW([
    {'params': text_classifier.document_encoder.parameters(), 'lr': 5e-5},
    {'params': text_classifier.class_encoder.parameters()},
    {'params': text_classifier.weight}], lr=4e-3)

  save_path = os.path.join(DATA_ROOT, f'trained/text-classifier-{data_name}.pt')
  # torch.save does not create missing directories; make sure the target exists
  # before spending hours on training.
  os.makedirs(os.path.dirname(save_path), exist_ok=True)

  train_coreclass(text_classifier, coreclass_epochs, train_dataloader, torch.nn.BCELoss(reduction='sum'), optimizer, val_dataloader, save_path)
  if self_training:
    train_self(text_classifier, self_epochs, train_dataloader, kl_div_loss, optimizer, update_period, val_dataloader, save_path)


In [4]:
# pipeline('amazon', 'amazon-coreclass-45000.jsonl', 5000)

In [5]:
# Train on the DBPEDIA corpus, holding out 5000 documents for validation.
pipeline(data_name='DBPEDIA', data_filename='DBPEDIA-coreclass-45000.jsonl', num_val=5000)

170 words not in the training set!
Label dictionary is loaded
Calculating tokens from document...
Agent
{'Agent': [1], 'Device': [2], 'Event': [3], 'Place': [4], 'Species': [5], 'Sports Season': [6], 'Topical Concept': [7], 'Unit Of Work': [8], 'Work': [9], 'Actor': [10], 'Artist': [11], 'Athlete': [12], 'Boxer': [13], 'British Royalty': [14], 'Broadcaster': [15], 'Cleric': [16], 'Coach': [17], 'Comics Character': [18], 'Company': [19], 'Educational Institution': [20], 'Fictional Character': [21], 'Gridiron Football Player': [22], 'Group': [23], 'Motorcycle Rider': [24], 'Musical Artist': [25], 'Organisation': [26], 'Organisation Member': [27], 'Person': [28], 'Politician': [29], 'Presenter': [30], 'Racing Driver': [31], 'Scientist': [32], 'Sports League': [33], 'Sports Manager': [34], 'Sports Team': [35], 'Volleyball Player': [36], 'Winter Sport Player': [37], 'Wrestler': [38], 'Writer': [39], 'Engine': [40], 'Natural Event': [41], 'Olympics': [42], 'Race': [43], 'Societal Event': [44

TypeError: 'NoneType' object is not iterable