In [178]:
from torch.utils.data import Dataset, DataLoader
from nltk import sent_tokenize
import pandas as pd
import numpy as np
import itertools
import easydict
import MeCab
import torch
import json
import re

In [188]:
# Input corpus (CSV read via its 'article' column) and the JSON hyper-parameter file.
FILE_PATH = 'D:\\data\\text\\news-articles\\kbanker_articles.csv'
CONFIG_PATH = 'config.json'
# Pin the encoding: without it open() falls back to the platform's locale
# default, so the same config file could decode differently per machine.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    # EasyDict exposes the parsed JSON keys as attributes (e.g. args.window_size).
    args = easydict.EasyDict(json.load(f))

In [185]:
# Text pre-processing helpers
def pre_process_raw_article(article):
    """Normalize a raw article string.

    The substitutions below are applied in order:
      1. curly double quotes become '"'
      2. curly single quotes become "'"
      3. parenthesized spans (up to the first ')') are removed
      4. any character that is not a Hangul syllable, a straight quote,
         an ASCII letter or digit, '.', '?', '!' or whitespace becomes a space
      5. a '.' that is directly followed by a non-digit gets a space after it
      6. runs of two or more whitespace characters collapse to one space

    Args:
        article (str): raw article text.

    Returns:
        str: the normalized text.
    """
    # Raw strings are used for every pattern: the previous plain literals
    # relied on invalid escape sequences such as '\(' and '\s', which Python
    # reports as Deprecation/SyntaxWarning.
    replacements = (
        (r'[“”]', '"'),
        (r'[‘’]', "'"),
        (r'\([^)]*\)', ''),
        (r'''[^가-힣'"A-Za-z0-9.\s?!]''', ' '),
        # The earlier pattern also had a leading lookahead in front of the
        # '.', but that lookahead inspected the '.' itself and therefore always
        # succeeded; dropping it leaves the matches unchanged.
        # NOTE(review): a lookbehind (non-digit BEFORE the '.') may have been
        # intended - confirm before tightening.
        (r'\.(?=[^0-9])', '. '),
        (r'\s\s+', ' '),
    )

    for pattern, replacement in replacements:
        article = re.sub(pattern, replacement, article)

    return article

def mecab_tokenize(sentence, tagger=None):
    """Split a sentence into the surface forms reported by MeCab.

    Args:
        sentence (str): text to analyse.
        tagger: optional object exposing ``parse(str)``. When omitted, one
            ``MeCab.Tagger`` is created on first use and reused by later calls.

    Returns:
        list[str]: the text before the first tab of every non-empty output
        line, in order, excluding the ``EOS`` marker line.
    """
    if tagger is None:
        # Reuse one tagger instead of constructing a new one per sentence.
        tagger = getattr(mecab_tokenize, '_tagger', None)
        if tagger is None:
            tagger = mecab_tokenize._tagger = MeCab.Tagger()

    # Guard against a parse() that yields nothing.
    parsed = tagger.parse(sentence) or ''

    tokens = []
    for line in parsed.split('\n'):
        # Skip blank lines and the end-of-sentence marker. Only an exact 'EOS'
        # line is skipped, so tokens that merely contain "EOS" are kept.
        if not line or line == 'EOS':
            continue
        # Everything before the first tab is the surface form; splitting on
        # the tab (not on commas) keeps surfaces that themselves contain ','.
        tokens.append(line.split('\t', 1)[0])
    return tokens

In [207]:
class NLPCorpusDataset(Dataset):
    """Sliding-window next-token dataset built from a CSV of articles.

    Every sample is a pair ``(context, target)``: ``context`` is a list of
    ``window_size`` consecutive tokens taken from one sentence and ``target``
    is a one-element list holding the token that immediately follows.
    """

    def __init__(self, csv_file, root_dir, window_size=None):
        """
        Args:
            csv_file (string): Path to a UTF-8 CSV file with an ``article``
                column; rows with a missing article are dropped.
            root_dir (string): Stored as ``self.root_dir``; not otherwise
                used by this class.
            window_size (int, optional): Number of context tokens per sample.
                Defaults to ``args.window_size`` from the notebook config.
        """
        if window_size is None:
            window_size = args.window_size
        self.window_size = window_size
        self.root_dir = root_dir

        articles = pd.read_csv(csv_file, encoding='utf-8')['article'].dropna().values
        # Clean each article, split it into sentences, then tokenize lazily so
        # the intermediate article/sentence lists are never held in full.
        sentences = itertools.chain.from_iterable(
            sent_tokenize(pre_process_raw_article(article)) for article in articles
        )
        self.corpus = [mecab_tokenize(sentence) for sentence in sentences]

        # Vocabulary lookup tables. The words are sorted so the ids are the
        # same on every run; set iteration order is not reproducible.
        vocabulary = sorted(set(itertools.chain.from_iterable(self.corpus)))
        self.word_to_idx = {word: idx for idx, word in enumerate(vocabulary)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

        # Training pairs: context windows and their next-token labels.
        self.x, self.y = self._build_windows(self.corpus, window_size)

    @staticmethod
    def _build_windows(corpus, window_size):
        """Slide a window over every sentence and collect (context, target) lists."""
        contexts, targets = [], []
        for sentence in corpus:
            # Sentences with at most window_size tokens yield no samples.
            for start in range(len(sentence) - window_size):
                contexts.append(sentence[start:start + window_size])
                targets.append([sentence[start + window_size]])
        return contexts, targets

    def __len__(self):
        # Must equal the number of indexable samples; counting sentences here
        # would disagree with __getitem__, which indexes the window lists.
        return len(self.x)

    def __getitem__(self, idx):
        # Accept tensor indices by converting them to plain Python values.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return self.x[idx], self.y[idx]

In [208]:
# Build the dataset from the configured CSV path.
nlp_dataset = NLPCorpusDataset(
    csv_file=FILE_PATH,
    root_dir='.',
)

In [238]:
def collate_fn(data):
    """Transpose a batch of samples into per-field tuples.

    Args:
        data: sequence of samples, each an iterable of fields
            (for example ``(x, y)`` pairs).

    Returns:
        tuple: one inner tuple per field position, e.g. ``(xs, ys)``;
        an empty tuple when ``data`` is empty.
    """
    # Materialize the result: a bare zip object is a one-shot iterator, so a
    # stored batch could be consumed only once and could not be indexed.
    return tuple(zip(*data))

# Two samples per batch, in dataset order (shuffle=False), assembled by collate_fn.
dataloader = DataLoader(
    nlp_dataset,
    batch_size=2,
    shuffle=False,
    num_workers=0,
    collate_fn=collate_fn,
)

In [239]:
# Fetch only the first batch for inspection. next(iter(...)) raises
# StopIteration when the loader yields nothing, instead of silently leaving
# `a` undefined (or holding a stale value from an earlier run).
a = next(iter(dataloader))

[(['본점', '필수', '인력', '연수원', '에'], ['분산']), (['필수', '인력', '연수원', '에', '분산'], ['근무'])]
