In [1]:
from collections import defaultdict
from pprint import pprint
from konlpy.tag import Twitter
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from scipy.io import mmwrite, mmread

# Part-of-speech tagger shared by every tokenization step in this notebook.
# NOTE(review): newer KoNLPy releases appear to expose this tagger under the
# name Okt -- confirm against the installed version.
twitter = Twitter()

review_corpus_file = '../naver_movie_review/review_train.txt'

# Each line is tab-separated; column 1 is read as the review text and
# column 2 as its rating. Ratings are kept as the stripped strings.
docs, ratings = [], []
with open(review_corpus_file, encoding='utf-8') as f:
    for line in f:
        fields = line.split('\t')
        docs.append(fields[1].strip())
        ratings.append(fields[2].strip())

print('num docs = %d' % len(docs))

# Flatten every review into 'lexeme/POS' strings, keeping only nouns,
# adjectives and verbs as tagged by the POS tagger.
tokens = [
    lex + '/' + tag
    for doc in docs
    for lex, tag in twitter.pos(doc)
    if tag in ('Noun', 'Adjective', 'Verb')
]

# Corpus-wide frequency of each distinct token.
token_counter = Counter(tokens)
print('num of tokens: %d' % len(token_counter))

num docs = 150000
num of tokens: 55059


In [2]:
# Peek at the first ten collected tokens (each formatted as 'lexeme/POS-tag').
tokens[:10]

['더빙/Noun',
 '진짜/Noun',
 '짜증/Noun',
 '나네/Verb',
 '목소리/Noun',
 '흠/Noun',
 '포스터/Noun',
 '보고/Noun',
 '초딩/Noun',
 '영화/Noun']

In [3]:
# Report how many distinct tokens would survive each candidate frequency
# threshold. Counter keys are unique, so counting the qualifying
# frequencies gives the number of surviving tokens.
for min_count in (2, 3, 5, 10):
    n_surviving = sum(1 for freq in token_counter.values() if freq >= min_count)
    print('num of tokens (min_count = %d): %d' % (min_count, n_surviving))

num of tokens (min_count = 2): 31758
num of tokens (min_count = 3): 24396
num of tokens (min_count = 5): 17643
num of tokens (min_count = 10): 11093


In [4]:
# Tokens seen at least 3 times in the training corpus.
# Despite the name, this is a set (used only for membership tests).
token_dict = set(word for word, freq in token_counter.items() if freq >= 3)

def custom_tokenizer(doc):
    """Tokenize one review into 'lexeme/POS' strings found in token_dict.

    Args:
        doc: Review text handed to the POS tagger.

    Returns:
        List of 'lexeme/POS' tokens, in order of occurrence, restricted to
        those present in token_dict.
    """
    tagged = (lex + '/' + tag for lex, tag in twitter.pos(doc))
    return [token for token in tagged if token in token_dict]

# Term-count features over the training reviews, tokenized by
# custom_tokenizer. The float thresholds are document-frequency
# proportions: terms in fewer than 0.5% or more than 95% of the
# reviews are dropped from the vocabulary.
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    min_df=0.005,
    max_df=0.95,
)
x_sparse = vectorizer.fit_transform(docs)

# Dense copy of the count matrix, used for row-wise inspection and export.
x_dense = x_sparse.todense()

# Mapping from each kept term to its column index in the count matrix.
vocab2int = vectorizer.vocabulary_

In [5]:
# Number of terms kept in the fitted vectorizer's vocabulary.
len(vocab2int)

279

In [6]:
# Show the sparse document-term matrix summary (shape, dtype, stored elements).
x_sparse

<150000x279 sparse matrix of type '<class 'numpy.int64'>'
	with 578711 stored elements in Compressed Sparse Row format>

In [7]:
# Display the full term -> column-index mapping of the fitted vectorizer.
vocab2int

{'가/Verb': 0,
 '가슴/Noun': 1,
 '가장/Noun': 2,
 '가족/Noun': 3,
 '감/Noun': 4,
 '감독/Noun': 5,
 '감동/Noun': 6,
 '같/Adjective': 7,
 '같다/Adjective': 8,
 '같아/Adjective': 9,
 '같은/Adjective': 10,
 '개/Noun': 11,
 '개봉/Noun': 12,
 '개인/Noun': 13,
 '거/Noun': 14,
 '건/Noun': 15,
 '건지/Verb': 16,
 '걸/Noun': 17,
 '것/Noun': 18,
 '게/Noun': 19,
 '결말/Noun': 20,
 '계속/Noun': 21,
 '공감/Noun': 22,
 '공포/Noun': 23,
 '괜찮/Adjective': 24,
 '굿/Noun': 25,
 '그/Noun': 26,
 '그냥/Noun': 27,
 '그런/Adjective': 28,
 '극장/Noun': 29,
 '급/Noun': 30,
 '기/Noun': 31,
 '기대/Noun': 32,
 '기분/Noun': 33,
 '기억/Noun': 34,
 '긴장감/Noun': 35,
 '꼭/Noun': 36,
 '끝/Noun': 37,
 '나/Noun': 38,
 '나름/Noun': 39,
 '나오는/Verb': 40,
 '나온/Verb': 41,
 '나와/Verb': 42,
 '나왔/Verb': 43,
 '난/Noun': 44,
 '남는/Verb': 45,
 '남자/Noun': 46,
 '내/Noun': 47,
 '내내/Noun': 48,
 '내용/Noun': 49,
 '너무/Noun': 50,
 '넘/Verb': 51,
 '년/Noun': 52,
 '눈/Noun': 53,
 '눈물/Noun': 54,
 '느낌/Noun': 55,
 '는/Verb': 56,
 '다른/Noun': 57,
 '다시/Noun': 58,
 '대박/Noun': 59,
 '대사/Noun': 60,
 '대한/Noun': 61,
 '더/Noun

In [8]:
# First ten ratings, still in the string form read from the corpus file.
ratings[:10]

['0', '1', '0', '0', '1', '0', '0', '0', '1', '1']

In [9]:
# Dense feature row for the review at index 1 (one column per vocabulary term).
x_dense[1]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [41]:
# Number of rows in the dense feature matrix; serves as the loop bound
# when the training vectors are written out to CSV.
row = len(x_dense)

In [46]:
# Export the training features: one line per review, every term count
# followed by a comma, then the review's rating as the final field.
review_vector_file = '../naver_movie_review/review.csv'
with open(review_vector_file, 'w', encoding='utf-8') as fw:
    for i in range(row):
        feature_prefix = ''.join(str(x) + ',' for x in x_dense[i].flat)
        fw.write(feature_prefix)
        fw.write(ratings[i] + '\n')

In [57]:
from operator import itemgetter

# (term, column index) pairs ordered by ascending column index.
sorted_vocab2int = sorted(vocab2int.items(), key=itemgetter(1))

In [60]:
# Write one vocabulary term per line, in the order of sorted_vocab2int,
# so line order matches the feature column order.
token_dictionary_file = '../naver_movie_review/token.dic'
with open(token_dictionary_file, 'w', encoding='utf-8') as fd:
    fd.writelines(term + '\n' for term, _ in sorted_vocab2int)

In [64]:
test_file = '../naver_movie_review/review_test.txt'

# NOTE: this rebinds `docs` and `ratings`; from here on they hold the
# contents of the test file rather than the training corpus.
# Each line is tab-separated; column 1 is read as the review text and
# column 2 as its rating (kept as a stripped string).
docs, ratings = [], []
with open(test_file, encoding='utf-8') as f:
    for line in f:
        fields = line.split('\t')
        docs.append(fields[1].strip())
        ratings.append(fields[2].strip())

In [66]:
# Sanity check: number of review texts currently held in `docs`.
len(docs)

50000

In [68]:
# Sanity check: number of ratings currently held in `ratings`
# (should match the number of review texts).
len(ratings)

50000

In [81]:
test_csv = '../naver_movie_review/test.csv'

# Vectorize the test reviews with the vectorizer that was fitted on the
# training corpus. The training CSV holds term *counts* produced by that
# vectorizer; building 0/1 presence flags by hand here would put the test
# features on a different scale and bypass the vectorizer's own
# preprocessing. transform() reuses the fitted vocabulary, so the columns
# line up with the training export.
x_test = vectorizer.transform(docs).toarray()

# Same line layout as the training CSV: every term count followed by a
# comma, then the review's rating as the final field.
with open(test_csv, 'w', encoding='utf-8') as ft:
    for i, counts in enumerate(x_test):
        ft.write(''.join(str(x) + ',' for x in counts))
        ft.write(ratings[i])
        ft.write('\n')