In [2]:
import os
import datetime
import numpy as np
import pandas as pd
import gensim
import MeCab
from jamo import h2j, j2hcj


In [44]:
class Word2Vec(object):
    """
    Build, save, load and apply gensim Word2Vec dictionaries over several
    tokenization schemes.

    <Parameters>
        - word2vec_size : dimensionality of each word vector
        - window : maximum distance between the current and predicted word
        - min_count : words occurring fewer than this many times are ignored
        - flag : tokenization scheme, one of four kinds.
          1) W  : word-level tokens from MeCab morphological analysis
          2) WT : MeCab word-level tokens joined with their tag ("word/TAG")
          3) JM : jamo-level tokens
          4) LN : chunks of N characters, N = 1, 2, ... (e.g. 'L2', 'L10')
    <Function>
        - make_input(sentence_list) : tokenize every sentence according to flag
        - make_w2v_dic(sentence_list) : train a model and return (model, vectors)
        - save_w2v_dic(sentence_list, path) : train a model and save it in path
        - load_w2v_dic(dic_path) : load a saved model matching size and flag
        - find_vectors(sentence_list, max_seq_size, dic_path) : embed sentences
    """
    def __init__(self, word2vec_size=300, window=5, min_count=2, flag='W'):
        self.word2vec_size = word2vec_size
        self.window = window
        self.min_count = min_count
        self.flag = flag
        # MeCab tagger is created lazily and reused; see parse_sentence.
        self._tagger = None

    def _input_param(self):
        """
        Derive the `param` argument that parse_sentence expects for self.flag.

        Returns True/False (append tag or not) for 'W*' flags, the chunk
        length N for 'LN' flags and '' for every other flag.
        Raises ValueError when an 'L' flag carries no positive integer N.
        """
        flag = self.flag
        if flag.startswith('W'):
            return len(flag) > 1 and flag[1] == 'T'
        if flag.startswith('L'):
            digits = flag[1:]  # whole suffix, so multi-digit N such as 'L10' works
            if not digits.isdigit() or int(digits) < 1:
                raise ValueError("flag 'LN' needs a positive integer N, got " + repr(flag))
            return int(digits)
        return ''

    def parse_sentence(self, text, param):
        """
        Tokenize one sentence according to self.flag.

        text  : the sentence; anything that is not a str yields False.
        param : for 'W*' flags a truthy value appends '/TAG' to each token;
                for 'L*' flags it is the chunk length; ignored otherwise.

        Returns the list of tokens (possibly empty), or False for non-str input.
        """
        if not isinstance(text, str):
            return False

        text = ''.join(text.split())  # remove all whitespace
        words_array = []

        if self.flag.startswith('W'):
            # Building a tagger is costly, so keep one per instance.
            tagger = getattr(self, '_tagger', None)
            if tagger is None:
                tagger = MeCab.Tagger()
                self._tagger = tagger

            for info in tagger.parse(text).split('\n'):
                if info == 'EOS' or info == '':
                    continue
                # Each line is "surface<TAB>feature,feature,...". Split on the
                # tab first so a surface that is itself a comma is kept intact.
                surface, tab, features = info.partition('\t')
                if param and tab:
                    words_array.append(surface + '/' + features.split(',')[0])
                else:
                    words_array.append(surface)
        elif self.flag == 'JM':
            words_array = list(j2hcj(h2j(text)))
        elif self.flag.startswith('L'):
            n = param
            words_array = [text[i:i+n] for i in range(0, len(text), n)]

        return words_array

    def make_input(self, sentence_list):
        """
        Tokenize every sentence in sentence_list.

        Entries that are not str, or that produce no tokens, are dropped, so
        the result can be shorter than sentence_list.
        Returns a list of token lists.
        """
        param = self._input_param()  # depends only on flag, so compute once
        input_x = []
        for text in sentence_list:
            result = self.parse_sentence(text=text, param=param)
            if isinstance(result, list) and len(result) > 0:
                input_x.append(result)
        return input_x

    def make_w2v_dic(self, sentence_list):
        """
        Train a gensim Word2Vec model on sentence_list.

        Returns (model, model.wv), or False when sentence_list is not a list.
        """
        if not isinstance(sentence_list, list):
            return False

        # gensim takes an iterable of token lists. The sentences have different
        # lengths, so they are passed as a plain list, not a NumPy array.
        input_x = self.make_input(sentence_list)
        # NOTE: `size` is the gensim < 4 keyword (renamed `vector_size` in gensim 4).
        model = gensim.models.Word2Vec(min_count=self.min_count, window=self.window, size=self.word2vec_size)
        model.build_vocab(input_x)
        model.train(input_x, total_examples=model.corpus_count, epochs=model.epochs)
        word_vectors = model.wv

        return model, word_vectors

    def save_w2v_dic(self, sentence_list, path):
        """
        Train a model on sentence_list and save it inside directory `path` as
        '<YYYYmmddHHMM>_<word2vec_size>_<flag>.bin'.

        `path` is treated as a directory, the same way load_w2v_dic treats
        dic_path, so a missing trailing separator is added.
        Raises TypeError when sentence_list is not a list.
        """
        if not isinstance(sentence_list, list):
            raise TypeError('sentence_list must be a list, got ' + type(sentence_list).__name__)

        file_name = datetime.datetime.now().strftime('%Y%m%d%H%M')+'_'+str(self.word2vec_size)+'_'+self.flag
        SAVE_NAME = os.path.join(path, file_name)

        model, word_vectors = self.make_w2v_dic(sentence_list)
        model.save(SAVE_NAME+'.bin')
        # Report only after the save has succeeded.
        print("Dictionary is saved : "+SAVE_NAME)

    def load_w2v_dic (self, dic_path, filetype='bin'):
        """
        Load the newest saved model in dic_path whose file name ends with
        '_<word2vec_size>_<flag>.<filetype>'.

        Raises FileNotFoundError when no such file exists.
        Returns the loaded gensim model.
        """
        allFileNames = os.listdir(dic_path)
        print(allFileNames)

        # Match the exact suffix written by save_w2v_dic; a substring search
        # for the size would also hit digits inside the timestamp prefix.
        suffix = '_' + str(self.word2vec_size) + '_' + self.flag + '.' + filetype
        modelNames = sorted(fn for fn in allFileNames if fn.endswith(suffix))
        if not modelNames:
            raise FileNotFoundError('no file ending with ' + repr(suffix) + ' in ' + repr(dic_path))

        # Names start with a timestamp, so the last one in sorted order is the newest.
        model_name = modelNames[-1]
        model = gensim.models.Word2Vec.load(os.path.join(dic_path, model_name))
        print(model_name+' is loaded')
        return model

    def find_vectors(self, sentence_list, max_seq_size, dic_path):
        """
        Convert sentences into a padded array of word vectors.

        Each sentence is tokenized with make_input, cut to max_seq_size tokens
        and padded with zero vectors. Tokens missing from the dictionary get a
        small random vector (normal, scale 1e-2).

        Returns an array of shape (n_kept_sentences, max_seq_size, word2vec_size);
        sentences dropped by make_input have no row.
        """
        # load model
        w2v_dic_model = self.load_w2v_dic(dic_path=dic_path, filetype='bin')
        w2v_dic = w2v_dic_model.wv

        # make input list
        parse_list = self.make_input(sentence_list)

        # Start from zeros so padding needs no extra work.
        input_x_vec = np.zeros((len(parse_list), max_seq_size, self.word2vec_size))
        for row, word_array in enumerate(parse_list):
            for col, w in enumerate(word_array[:max_seq_size]):
                if w in w2v_dic:
                    input_x_vec[row, col] = w2v_dic[w]
                else:
                    input_x_vec[row, col] = np.random.normal(scale=1e-2, size=self.word2vec_size)

        return input_x_vec
        

In [4]:
# Usage example

sentence_list = []

# Load data - sentences only
dataDir = 'data/uplusInput/train'
allFileNames = os.listdir(dataDir)

# Keep every entry whose name does not start with '.'.
filePaths = [os.path.join(dataDir, fname) for fname in allFileNames if fname[0] != '.']

# Collect the 'content' column of each CSV file.
for fpn in filePaths:
    if not fpn.endswith('.csv'):
        continue
    datas = pd.read_csv(fpn)
    input_x = datas['content']
    sentence_list.extend(list(input_x))

targetDir = 'data/movieReview/formmated'
allFileNames = os.listdir(targetDir)
filesize = len(allFileNames)

# Use only the listing slice from index testSize+1 up to testSize*2,
# where testSize is one tenth of the number of entries.
testSize = int(filesize / 10)
subNames = allFileNames[testSize + 1:testSize * 2]

for fname in subNames:
    if fname[0] == '.':
        continue

    data_path = os.path.join(targetDir, fname)
    try:
        corpus = pd.read_csv(data_path, quotechar="'", header=None, encoding="utf-8")
        # Column 5 of the header-less file is appended to the sentences.
        contents = np.array(corpus[5][:])
        sentence_list.extend(list(contents))
    except Exception as e:
        # Best effort: report the file that failed and continue with the next.
        print(fname + ' : ' + str(e))

In [45]:
# Settings for the example dictionary.
word2vec_size = 150  # vector size per token
window = 5           # passed to Word2Vec as the context window
min_count = 2        # passed to Word2Vec as the minimum word count
SAVE_PATH = './data/wordDic/'

# 'L2' selects the 2-character chunk tokenization.
nletterW2V = Word2Vec(
    flag='L2',
    word2vec_size=word2vec_size,
    window=window,
    min_count=min_count,
)

In [24]:
# Train a dictionary on sentence_list and save it under SAVE_PATH.
nletterW2V.save_w2v_dic(sentence_list,path=SAVE_PATH)

Dictionary is saved : ./data/wordDic/201804241053_150_L2


In [46]:
# Load a saved dictionary from SAVE_PATH that matches this instance's flag and vector size.
nletterW2V.load_w2v_dic(dic_path=SAVE_PATH)

['201804241053_150_L2.bin.trainables.syn1neg.npy', '201804241053_150_L2.bin', '201804231620_150_WT.bin', '201804231704_150_JM.bin', '201804241016_150_L1.bin', '201804231649_150_W.bin', '201804241053_150_L2.bin.wv.vectors.npy']
./data/wordDic/
./data/wordDic/201804241053_150_L2.bin
201804241053_150_L2.bin is loaded


<gensim.models.word2vec.Word2Vec at 0x7f1902523d30>