# Yelp 2018 데이터 전처리

In [2]:
import re
import json
import sys
import pickle
import numpy as np
from collections import defaultdict

from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from contractions import contraction_map
from stopwords import stopwords

ps = PorterStemmer()

* 문장, 단어 토크나이징
* 줄임말 복원
* 특수문자 제거
* 스테밍

In [2]:
def sentence_tokenizer(document):
    """Split a document into sentences and lower-case each one.

    Sentence boundaries come from NLTK's ``sent_tokenize``; the result is a
    list with one lower-cased string per detected sentence.
    """
    lowered_sentences = []
    for raw_sentence in sent_tokenize(document):
        lowered_sentences.append(raw_sentence.lower())
    return lowered_sentences


def expand_contractions(sentence, contraction_mapping):
    """Replace every known contraction in ``sentence`` with its expansion.

    Args:
        sentence: Text to process. Matching is case-sensitive, so the keys of
            ``contraction_mapping`` must use the same casing as the text.
        contraction_mapping: Dict mapping a contraction (e.g. ``"can't"``) to
            its expanded form (e.g. ``"cannot"``).

    Returns:
        The sentence with each contraction replaced. The input is returned
        unchanged when the mapping has no usable keys.
    """
    # Build the pattern from the mapping that was actually passed in, so the
    # keys we match on are guaranteed to be the keys we look up afterwards.
    # Longest keys go first: regex alternation takes the first alternative
    # that matches, and "can't" must not pre-empt "can't've".
    alternatives = sorted(
        (key for key in contraction_mapping if key), key=len, reverse=True
    )
    if not alternatives:
        return sentence

    # Keys are literal text, not regexes; escape them so characters such as
    # "." cannot match unrelated input.
    contractions_pattern = re.compile(
        "({})".format("|".join(re.escape(key) for key in alternatives))
    )

    def expand_match(contraction):
        # Every match is one of the keys above, so direct indexing is safe.
        return contraction_mapping[contraction.group(0)]

    return contractions_pattern.sub(expand_match, sentence)


def remove_special_character(sentence_list):
    """Strip non-alphanumeric characters and mask digit runs in each sentence.

    For every string in ``sentence_list``, every character other than ASCII
    letters, digits and the space character is deleted first; afterwards each
    run of consecutive digits is replaced by the literal token ``NUM``.
    Returns a new list with one cleaned string per input sentence.
    """
    non_alnum = re.compile("[^a-zA-Z0-9 ]")
    digit_run = re.compile("[0-9]+")

    cleaned = []
    for text in sentence_list:
        stripped = non_alnum.sub("", text)
        cleaned.append(digit_run.sub("NUM", stripped))
    return cleaned


def tokenize_and_stemming(sentence):
    """Tokenize a sentence, drop stopwords, and stem the remaining tokens.

    Tokens come from NLTK's ``word_tokenize``. A token is skipped when it is
    contained in the module-level ``stopwords`` collection; every other token
    is passed through the module-level stemmer ``ps``. Returns the stems in
    their original order.
    """
    stems = []
    for token in word_tokenize(sentence):
        if token in stopwords:
            continue
        stems.append(ps.stem(token))
    return stems


# Read up to `review_num` reviews (one JSON object per line) and turn each
# into `[stars, tokens_of_sentence_1, tokens_of_sentence_2, ...]`.
documents = []
num = 0  # number of reviews processed so far
review_num = 1000000

# The context manager closes the file even if a line fails to parse; the
# explicit encoding avoids depending on the platform default.
with open("dataset/review.json", encoding="utf-8") as dataset:
    for data in dataset:
        if num >= review_num:
            break

        # Tolerate blank lines (e.g. a trailing newline at the end of file).
        if not data.strip():
            continue

        review = json.loads(data)
        stars = review["stars"]
        document = review["text"]
        expanded_sents = [
            expand_contractions(sentence, contraction_map)
            for sentence in sentence_tokenizer(document)
        ]
        sentence_list = remove_special_character(expanded_sents)
        documents.append(
            [stars] + [tokenize_and_stemming(sentence) for sentence in sentence_list]
        )

        num += 1

        # Report progress every 100 reviews; `num` is the processed count.
        if num % 100 == 0:
            sys.stdout.write("\rProcessing... %d/%d" % (num, review_num))

print("\rDone. Number of documents: %d" % (len(documents)))

* 토크나이징 된 단어 개수 카운트
* 5번 미만으로 발생한 단어는 "UNK"로 대체

In [None]:
# Count how often each token occurs over all sentences of all documents.
# document[0] is the star rating, so the sentences start at index 1.
word_count = defaultdict(int)

for document in documents:
    for sentence in document[1:]:
        for word in sentence:
            word_count[word] += 1

# Replace every token seen fewer than 5 times with the "UNK" placeholder,
# keeping the star rating at the front of each document.
cleaned_documents = []

for document in documents:
    sentence_list = [
        ["UNK" if word_count[word] < 5 else word for word in sentence]
        for sentence in document[1:]
    ]
    cleaned_documents.append([document[0]] + sentence_list)

# Persist the cleaned corpus. The previous code dumped `cleaned_document`
# (a name that is never defined), which raised NameError.
with open("cleaned_documents", "wb") as f:
    pickle.dump(cleaned_documents, f)

* 단어와 숫자를 일대일 대응시키는 딕셔너리 생성

In [None]:
# Map every token to a unique positive integer id. Looking up a token that is
# not in the map yields 0 (the defaultdict default).
vocab_map = defaultdict(int)

# Ids start at 1: id 0 is kept free so that a real token can never share the
# value 0 that the zero-initialised document arrays use for empty positions.
num = 1

# `cleaned_documents` is the UNK-replaced corpus built above; the previously
# referenced name `unk_documents` is not defined anywhere in this file.
for document in cleaned_documents:
    for sentence in document[1:]:
        for word in sentence:
            # Membership test instead of a truthiness test on the stored id:
            # a falsy id would otherwise be re-assigned on every occurrence.
            if word not in vocab_map:
                vocab_map[word] = num
                num += 1

* 단어를 정수로 변환하여 ndarray로 생성

In [17]:
def make_dataset(inputs, max_sent_num, max_word_num, vocab=None):
    """Shuffle documents, split them 80/10/10 and encode them as int arrays.

    Args:
        inputs: Sequence of documents, each shaped
            ``[label, sentence_1, sentence_2, ...]`` where every sentence is
            a list of tokens. The sequence itself is not modified.
        max_sent_num: Number of sentences kept per document; extra sentences
            are dropped, missing ones stay zero.
        max_word_num: Number of tokens kept per sentence; extra tokens are
            dropped, missing ones stay zero.
        vocab: Optional token -> integer id mapping. Defaults to the
            module-level ``vocab_map``. Tokens missing from the mapping are
            encoded as 0.

    Returns:
        ``(train_data, valid_data, test_data, train_label, valid_label,
        test_label)``. Each ``*_data`` is an ``int32`` array of shape
        ``(n_docs, max_sent_num, max_word_num)``; each ``*_label`` is the
        list of the corresponding documents' first elements.
    """
    if vocab is None:
        vocab = vocab_map

    # Shuffle a plain list copy. Documents are ragged (different numbers of
    # sentences and words), and np.array() on ragged nested lists raises a
    # ValueError on recent NumPy versions.
    shuffled = list(inputs)
    np.random.shuffle(shuffled)

    # First 10% -> validation, next 10% -> test, remainder -> training.
    idx = int(len(shuffled) * 0.1)
    valid = shuffled[:idx]
    test = shuffled[idx:idx * 2]
    train = shuffled[idx * 2:]

    def make_array(data):
        labels = [doc[0] for doc in data]
        array = np.zeros((len(data), max_sent_num, max_word_num), dtype='int32')
        for i, doc in enumerate(data):
            # doc[0] is the label; keep at most max_sent_num sentences.
            for j, sent in enumerate(doc[1:max_sent_num + 1]):
                for k, word in enumerate(sent[:max_word_num]):
                    # .get() avoids inserting unknown tokens into a defaultdict.
                    array[i, j, k] = vocab.get(word, 0)
        return array, labels

    train_data, train_label = make_array(train)
    valid_data, valid_label = make_array(valid)
    test_data, test_label = make_array(test)

    return train_data, valid_data, test_data, train_label, valid_label, test_label

# Each document is encoded with this many sentences and words per sentence.
max_sent_num = 20
max_word_num = 20

# `cleaned_documents` is the UNK-replaced corpus; the previously passed name
# `unk_documents` is not defined anywhere in this file.
train_data, valid_data, test_data, train_label, valid_label, test_label = make_dataset(
    cleaned_documents, max_sent_num, max_word_num
)

* ndarray를 TFRecord로 저장

In [None]:
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer

# The five class values (1 through 5) that the binarizer is fitted on.
label = list(range(1, 6))

# Fit once at module level; make_tfrecord reuses this fitted instance.
lb = LabelBinarizer()
lb.fit(label)

def make_tfrecord(fname, array, label):
    """Write documents and their labels to a TFRecord file.

    Args:
        fname: Path of the TFRecord file to create.
        array: Iterable of integer NumPy arrays, one per document; each is
            flattened before being stored under the ``'document'`` key.
        label: Sequence of labels, paired with ``array`` element by element.
            Each label is encoded with the module-level ``lb`` binarizer and
            stored under the ``'label'`` key.
    """
    def int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

    # TensorFlow 2.x no longer has `tf.python_io`; prefer `tf.io` and fall
    # back to the legacy location for old 1.x installations.
    try:
        writer_cls = tf.io.TFRecordWriter
    except AttributeError:
        writer_cls = tf.python_io.TFRecordWriter

    data_cnt = len(label)

    # The context manager closes the writer even if serialization fails
    # midway, so the file handle is never leaked.
    with writer_cls(fname) as writer:
        for i, data in enumerate(zip(array, label)):
            encoded_label = lb.transform([data[1]])[0]
            feature = {
                'document': int64_feature(data[0].flatten()),
                'label': int64_feature(encoded_label)
            }
            features = tf.train.Features(feature=feature)
            example = tf.train.Example(features=features)
            writer.write(example.SerializeToString())

            if (i > 0) and (i % 100 == 0):
                sys.stdout.write("\rProcessing... %d/%d" % (i, data_cnt))

    print("\rComplete.")

# Serialize each split to its own TFRecord file: train, then valid, then test.
for _fname, _data, _label in (
    ("train.tfrecord", train_data, train_label),
    ("valid.tfrecord", valid_data, valid_label),
    ("test.tfrecord", test_data, test_label),
):
    make_tfrecord(_fname, _data, _label)