In [1]:
import json
import sys
import pickle
import csv
import numpy as np
from collections import defaultdict
from soyspacing.countbase import RuleDict, CountSpace
from soynlp.word import WordExtractor
from soynlp.tokenizer import MaxScoreTokenizer
from gensim.models import word2vec

In [2]:
class Sentences:
    """Re-iterable stream of the ``'review'`` strings stored in a JSON corpus.

    The file at ``fname`` is read as UTF-8 and parsed with ``json.load``; it is
    expected to contain a sequence of objects that each have a ``'review'`` key.
    Every call to ``iter()`` re-reads the file, so an instance can be consumed
    more than once.

    Attributes:
        fname: Path of the JSON corpus file.
        length: Cached record count; ``0`` until ``len()`` has been called.
    """

    def __init__(self, fname):
        self.fname = fname
        self.length = 0

    def _load(self):
        """Parse the corpus file and return the decoded JSON content."""
        with open(self.fname, encoding='utf-8') as f:
            return json.load(f)

    def __iter__(self):
        """Yield each record's review text with surrounding whitespace stripped."""
        # The file is fully parsed (and closed) before the first item is
        # yielded, so no file handle stays open while the consumer iterates.
        for record in self._load():
            yield record['review'].strip()

    def __len__(self):
        """Return the number of records, reading the file only while uncached."""
        if self.length == 0:
            # len() on the parsed content replaces a per-record counting loop.
            self.length = len(self._load())
        return self.length

In [3]:
# Training corpus; Sentences re-reads this file and yields each record's review text.
corpus_fname = "./data/train_data.json"
sentences = Sentences(corpus_fname)

# NOTE(review): both thresholds are handed straight to soynlp's WordExtractor;
# their exact filtering semantics are not visible in this notebook — confirm in the soynlp docs.
word_extractor = WordExtractor(min_cohesion_forward=0.05, 
                               min_right_branching_entropy=0.0)

# Fit the extractor on the review stream, then persist it to ./word_extractor.
word_extractor.train(sentences)
word_extractor.save("./word_extractor")

training was done. used memory 1.654 Gbse memory 2.007 Gb


In [22]:
# Run extraction on the trained extractor. The result is used below as a mapping
# (via .items()) whose values expose a `cohesion_forward` attribute.
words = word_extractor.extract()

all cohesion probabilities was computed. # words = 294179
all branching entropies was computed # words = 337767
all accessor variety was computed # words = 337767


In [26]:
# Map every extracted word to its forward-cohesion score; these scores are
# what the max-score tokenizer is configured with.
scores = {key: value.cohesion_forward for key, value in words.items()}

tokenizer = MaxScoreTokenizer(scores=scores)

In [27]:
# Path of a previously saved spacing-correction model.
model_fname = './corpus.model'

# Load the saved model into a fresh CountSpace instance.
# NOTE(review): json_format=False presumably selects a non-JSON on-disk format — confirm in the soyspacing docs.
spacing_model = CountSpace()
spacing_model.load_model(model_fname, json_format=False)

In [28]:
def spacing_tokenizing(sent, spacing_model, tokenizer,
                       force_abs_threshold=0.3,
                       nonspace_threshold=-0.3,
                       space_threshold=0.3,
                       min_count=10):
    """Correct the spacing of ``sent`` and split the result into tokens.

    Args:
        sent: Raw sentence text.
        spacing_model: Object whose ``correct(doc=..., ...)`` method returns a
            2-tuple; the first item is the spacing-corrected sentence.
        tokenizer: Object with a ``tokenize(text)`` method.
        force_abs_threshold: Forwarded unchanged to ``spacing_model.correct``.
        nonspace_threshold: Forwarded unchanged to ``spacing_model.correct``.
        space_threshold: Forwarded unchanged to ``spacing_model.correct``.
        min_count: Forwarded unchanged to ``spacing_model.correct``.

    Returns:
        Whatever ``tokenizer.tokenize`` returns for the corrected sentence.
    """
    # The second element returned by correct() is not needed here.
    sent_corrected, _ = spacing_model.correct(
        doc=sent,
        verbose=False,
        force_abs_threshold=force_abs_threshold,
        nonspace_threshold=nonspace_threshold,
        space_threshold=space_threshold,
        min_count=min_count,
    )

    return tokenizer.tokenize(sent_corrected)

In [41]:
# Tokenize every review in the training corpus.
# NOTE: this rebinds `sentences` from the Sentences stream to a list of token lists.
sentences = []

# Decode explicitly as UTF-8 (as the Sentences class does) so the result does
# not depend on the platform's default encoding.
with open("./data/train_data.json", encoding='utf-8') as json_file:
    train = json.load(json_file)

# The file is already closed here; the long-running loop only needs the parsed data.
num_sent = len(train)
for i in range(num_sent):
    sentences.append(spacing_tokenizing(train[i]['review'], spacing_model, tokenizer))
    if i % 10000 == 0:
        # '\r' rewrites the same output line to show progress in place.
        sys.stdout.write('\rtokenizing... {}/{}'.format(i, num_sent))

tokenizing... 690000/700000

In [42]:
# Cache the tokenized corpus so later sessions can skip tokenization.
# The context manager closes the file even if pickling fails.
with open('sentences', 'wb') as f:
    pickle.dump(sentences, f)

# To restore the cache instead of writing it:
# with open('sentences', 'rb') as f:
#     sentences = pickle.load(f)

In [45]:
# Corpus-wide frequency of each token; unseen tokens default to a count of 0.
word_count = defaultdict(int)

for tokens in sentences:
    for token in tokens:
        word_count[token] = word_count[token] + 1

In [None]:
# Copy of the tokenized corpus in which every token seen fewer than 5 times
# is replaced by the placeholder "UNK".
unk_sentences = [
    [word if word_count[word] >= 5 else "UNK" for word in sentence]
    for sentence in sentences
]

In [2]:
# To (re)write the cache instead of reading it:
# with open('unk_sentences', 'wb') as f:
#     pickle.dump(unk_sentences, f)

# Restore the UNK-replaced corpus from its cache; the context manager closes
# the file even if unpickling fails.
# SECURITY: pickle.load can execute arbitrary code — only load a file this
# notebook produced itself.
with open('unk_sentences', 'rb') as f:
    unk_sentences = pickle.load(f)

In [None]:
import logging

# Configure root logging at INFO level with a "time : level : message" format.
# NOTE(review): presumably enabled so gensim's training progress is shown — confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

print('Training model...')
# Train on the UNK-replaced corpus with size=200 and window=10.
# NOTE(review): min_count=1 presumably keeps every token because rare words were
# already collapsed to "UNK" upstream — confirm.
# NOTE(review): `size=` and `init_sims` look like pre-4.0 gensim API names —
# confirm against the installed gensim version.
word2vec_model = word2vec.Word2Vec(unk_sentences, size=200, min_count=1, window=10)

# NOTE(review): replace=True presumably normalizes vectors in place and prevents further training — confirm.
word2vec_model.init_sims(replace=True)

# File name under which the trained model is saved (and later reloaded).
model_name = "200f_5mw_10ct"
word2vec_model.save(model_name)

In [3]:
# Reload the saved Word2Vec model from the file name used when it was trained and saved.
w2v_model = word2vec.Word2Vec.load('200f_5mw_10ct')

In [4]:
# Word -> integer index. Lookups of unknown words fall back to 0.
vocab_map = defaultdict(lambda: 0)

# Start at 1 so index 0 is never given to a real word. 0 is the value used to
# zero-pad sequences, and the embedding matrix is sized len(vocab_map) + 1.
num = 1

for sentence in unk_sentences:
    for word in sentence:
        # Explicit membership test: a truthiness check on the stored index would
        # treat index 0 as "unseen" and re-assign that word on its next occurrence.
        if word not in vocab_map:
            vocab_map[word] = num
            num += 1

len(vocab_map)

59203

In [None]:
# One more row than there are vocabulary entries.
# NOTE(review): the extra row is presumably so index 0 can act as the padding
# row (sequences are zero-padded later) — verify against how vocab_map assigns indices.
vocab_size = len(vocab_map) + 1
# NOTE(review): must match the dimensionality the Word2Vec model was trained with — confirm.
embedding_dim = 200
embedding = np.zeros((vocab_size, embedding_dim))

# Copy each word's vector into the row given by its vocabulary index;
# rows that no word maps to stay all-zero.
# NOTE(review): indexing the model directly (w2v_model[key]) may be version-specific in gensim — confirm.
for key, value in vocab_map.items():
    embedding[value] = w2v_model[key]

In [75]:
# Persist the embedding matrix to disk in NumPy's binary array format.
np.save("embedding_matrix", embedding)

In [19]:
# Build train.csv: one row per review, laid out as
# [rating, idx_1, ..., idx_n, 0, ..., 0], padded to the longest review.
x_train = []
sequence_length = len(max(unk_sentences, key=len))

# Decode explicitly as UTF-8 (as the Sentences class does) so the result does
# not depend on the platform's default encoding.
with open("./data/train_data.json", encoding='utf-8') as json_file:
    train = json.load(json_file)

rating_list = [record["rating"] for record in train]

# zip() would silently truncate on a mismatch and hide a stale or misaligned
# unk_sentences cache, so check the alignment explicitly.
assert len(rating_list) == len(unk_sentences), \
    "ratings and tokenized sentences are misaligned: {} vs {}".format(
        len(rating_list), len(unk_sentences))

for rating, sentence in zip(rating_list, unk_sentences):
    sentence_to_num = [vocab_map[word] for word in sentence]

    # Right-pad with zeros; the pad is empty for the longest sentence.
    zero_pad = [0] * (sequence_length - len(sentence_to_num))
    full_row = [rating] + sentence_to_num + zero_pad

    x_train.append(full_row)

# newline='' lets the csv module control line endings itself; without it,
# extra blank rows can appear on platforms that translate newlines.
with open("train.csv", "w", newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerows(x_train)