In [1]:
from dataset.negsampling_dataset import NegSamplingDataset
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import treebank
from abc import abstractmethod
from nltk import sent_tokenize
import torch.nn as nn
import pandas as pd
import numpy as np
import itertools
import argparse
import easydict
import random
import pickle
import MeCab
import torch
import json
import sys
import os
import re

In [2]:
def config_parser(args):
    print('file path is ' + str(args.file_path))
    with open(args.config_path, 'rb') as f:
        config = easydict.EasyDict(json.load(f))
    config.model = args.model
    config.file_path = args.file_path
    config.dataset_path = args.dataset_path
    config.device = torch.device(args.device)
    return config

args = argparse.ArgumentParser(description='nlp embedding')
args.add_argument('-m', '--model', default='neg-sampling', type=str,
                  help='which model to use')
args.add_argument('-cp', '--config-path', default='config.json', type=str,
                  help='config file path (default: None)')
args.add_argument('-fp', '--file-path', default='D:\\data\\text\\news-articles\\kbanker_articles_subtitles.csv', type=str,
                  help='path to latest checkpoint (default: None)')
args.add_argument('-dp', '--dataset-path', default='data\\nlp_dataset.pkl', type=str,
                  help='if there is a pickled dataset')
args.add_argument('-d', '--device', default='cuda:0', type=str,
                  help='indices of GPUs to enable (default: all)')
sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname('__file__'))))
config = config_parser(args.parse_args())
config['neg_sample_size'] = 5
config['file_path'] = 'D:\\data\\text\\torch-dataset\\kbanker_articles_processed.pkl'
config['file_path'] = 'E:\\data\\text\\news-articles\\kbanker_articles_subtitles.csv'
config['file_path'] = 'treebank'
config['model'] = 'fast-text'

file path is C:\Users\sylim2357\AppData\Roaming\jupyter\runtime\kernel-7edcf06f-a52e-42db-b563-28527c26a594.json


In [3]:
def ngram(w, n):
    word = '<' + w + '>'
    if len(word) <= 3:
        return [word]
    else:
        ngram = []
        for i in range(n, len(word)+1):
            ngram += [word[i-n:i]]

        return ngram + [word]

In [68]:
# from w2v.utils import pre_process_raw_article, mecab_tokenize
from utils import pre_process_raw_article, mecab_tokenize
from torch.utils.data import Dataset
from nltk import sent_tokenize
from abc import abstractmethod
import pandas as pd
import collections
import itertools

class FastTextDataset(NegSamplingDataset):
    """Fast Text Dataset.

    Args:
        config (dict): hyperparameters
        word_frequency (dict): word index - word frequency map for negative sampling

    """

    def __init__(self, config):
        if 'pkl' in config.file_path:
            with open(config.file_path, 'rb') as f:
                corpus = pickle.load(f)[:1000]
        elif config.file_path == 'treebank':
            corpus = treebank.sents()[:5]
        else:
            articles = pd.read_csv(config.file_path, encoding='utf-8')['article'].dropna().values
            #pre process
            corpus = self.pre_process(articles)
            
        ngram_corpus = self.fast_text_pre_process(corpus)
        self.ngram_word_to_idx, self.ngram_idx_to_word, _ = self.construct_word_idx(ngram_corpus)
        #construct word matrix
        self.word_to_idx, self.idx_to_word, self.word_frequency = self.construct_word_idx(corpus)
        #make dataset
        self.x, self.y = self.construct_dataset(corpus, config)
    
    def fast_text_pre_process(self, corpus):
        return [[ngram(w, 3) for w in s] for s in corpus]

    def construct_word_idx(self, corpus):
        print('constructing word matrix')
        corpus_flatten = list(itertools.chain.from_iterable(corpus))
        if isinstance(corpus_flatten[0], list):
            word_frequency = collections.Counter(itertools.chain.from_iterable(corpus_flatten))
        else:
            word_frequency = collections.Counter(corpus_flatten)
        word_frequency = {word: word_frequency[word]**(3/4) for idx, word in enumerate(word_frequency)}
        word_to_idx = {word: idx for idx, word in enumerate(word_frequency)}
        idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

        return word_to_idx, idx_to_word, word_frequency
    
    def neg_sample(self, word_contxt, config):
        word_universe = self.word_to_idx.keys() - set(word_contxt)
        word_distn = np.array([self.word_frequency[idx] for idx in word_universe])
        word_distn = word_distn / word_distn.sum()
        
        return np.random.choice(a=list(word_universe), size=config.neg_sample_size*config.window_size*2, p=word_distn)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx): idx = idx.tolist()
        pos = torch.Tensor([self.ngram_word_to_idx[c] for c in ngram(self.x[0][idx], 3)]).long()
        neg = [torch.Tensor([self.ngram_word_to_idx[c] for c in ngram(w, 3)]).long() for w in self.x[1][idx]]
        label = torch.Tensor([self.ngram_word_to_idx[c] for c in ngram(self.y[idx], 3)]).long()
        
#         return (pos,neg), label
        return (self.word_to_idx[self.x[0][idx]], [self.word_to_idx[c] for c in self.x[1][idx]]), self.y[idx]

In [69]:
fast_txt_dataset = FastTextDataset(config)

constructing word matrix
constructing word matrix
constructing training dataset


In [70]:
class FastTextEmbeddingModule(nn.Module):
    def __init__(self, idx_to_word, ngram_word_to_idx, config):
        super().__init__()
        self.idx_to_word = idx_to_word
        self.ngram_word_to_idx = ngram_word_to_idx
        self.embedding = nn.Embedding(len(self.ngram_word_to_idx), config.embed_dim).float()
        self.embedding.weight.data.uniform_(-1,1)

        self.tanh = nn.Tanh()

    def forward(self, x):
        word = self.idx_to_word[x]
        ngram_x = ngram(word, 3)
        subind = torch.Tensor([self.ngram_word_to_idx[c] for c in ngram_x]).long()
#         print(ngram_x[0])
#         print(self.ngram_word_to_idx[ngram_x[0]])
#         print(torch.Tensor([self.ngram_word_to_idx[c] for c in ngram_x]).long())
        embedded = self.embedding(subind).view(len(subind), -1)
        embedded.retain_grad()
        net = embedded.mean(axis=0) #수정하기
        return net

In [71]:
for i in fast_txt_dataset:
    print(i)
    print(FastTextEmbeddingModule(fast_txt_dataset.idx_to_word, fast_txt_dataset.ngram_word_to_idx, config)(i[0][0]))
    break

((0, [5, 21, 78, 75]), ',')
tensor([-0.0322, -0.0222, -0.0154, -0.0116, -0.3570,  0.0091,  0.4104, -0.2832,
        -0.0693, -0.0354, -0.2976,  0.1623,  0.1971, -0.4275,  0.3107,  0.2935,
        -0.3346,  0.2601, -0.2888, -0.1285], grad_fn=<MeanBackward1>)


In [80]:
def collate_fn(data):
    seqs, labels = zip(*data)
    return seqs, labels

dataloader = DataLoader(fast_txt_dataset, batch_size=config.batch_size, \
                        shuffle=False, num_workers=0, collate_fn=collate_fn)

target_emb = FastTextEmbeddingModule(fast_txt_dataset.idx_to_word, fast_txt_dataset.ngram_word_to_idx, config).to(config.device)
context_emb = FastTextEmbeddingModule(fast_txt_dataset.idx_to_word, fast_txt_dataset.ngram_word_to_idx, config).to(config.device)
sigmoid = nn.Sigmoid()
similar = nn.CosineSimilarity()

criterion = nn.BCELoss()
optimizer = torch.optim.SGD(list(target_emb.parameters()) + list(context_emb.parameters()), lr=5e-3, momentum=0.9)

for epoch in range(10):
    print('Epoch ' + str(epoch))
    for i, sample in enumerate(dataloader):
        print(np.array(sample[0])[:,1].astype('float64'))
        pos_idx = torch.Tensor(np.array(sample[0])[:,0].astype('float64')).long().to(config.device)
        neg_idx = torch.Tensor(np.array(sample[0])[:,1].astype('float64')).long().to(config.device)
#         neg_idx = np.array(sample[0])[:,1].to(config.device)
        target_idx = sample[1].long().to(config.device)
        pos_label = 1
        neg_label = 0
        
        pos = context_emb(pos_idx)
        neg = context_emb(neg_idx)
        target = target_emb(target_idx)
        print(pos_idx)
#         print(pos)
#         print(neg)
#         print(target)[]
#         print(pos.shape, target.shape)
#         print(similar(pos,target))
        pred_pos = sigmoid(similar(pos, target))
        pred_neg = sigmoid(similar(neg, target.unsqueeze(1).expand_as(neg)))

        pos_loss = criterion(pred_pos, pos_label)
        neg_loss = torch.sum(criterion(pred_neg, neg_label.unsqueeze(1).expand_as(pred_neg)))

        pos_loss.retain_grad()
        neg_loss.retain_grad()
        
        loss = pos_loss + neg_loss
        loss.retain_grad()
        
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        
        if i % 100 == 99:
            print(i, loss.item())
            if i % 100000 == 99:
                with open('./checkpoints/neg_sample_checkpoint_epoch' + str(epoch) + '_' + str(i) + '.pkl', 'wb') as f:
                    pickle.dump(target_emb, f)
                print('./checkpoints/neg_sample_checkpoint_epoch' + str(epoch) + '_' + str(i) + '.pkl saved')

Epoch 0


ValueError: setting an array element with a sequence.