# Generate Embedding Model
## 본 과정에서 불러오는 Embedding 모델을 만드는 과정을 담고 있습니다. 

### Download Dataset
추후 추가될 수 있습니다. 

In [52]:
# import 및 사전 준비
from nltk.tokenize import RegexpTokenizer
from nltk.tag import pos_tag

# Word tokenizer: keeps runs of regex "word" characters and drops everything
# else. The pattern is a raw string so that "\w" reaches the regex engine
# untouched instead of being parsed as an (invalid) string escape, which
# triggers a warning on recent Python versions.
tokenizer = RegexpTokenizer(r"[\w]+")
# Shared word -> POS-tag mapping; every dataset processed in this notebook
# adds its tokens to it.
pos_dict = {}

# Reusable helper: tokenize and POS-tag a text, for use with several datasets.
def tokenize_postag(txt, tokenizer, pos_dict):
    """Tokenize ``txt``, POS-tag the tokens and record them in ``pos_dict``.

    Args:
        txt: Raw text to process.
        tokenizer: Object whose ``tokenize(txt)`` method returns the tokens.
        pos_dict: Mapping updated in place with ``token -> tag`` entries. A
            token that is already present gets its tag overwritten.

    Returns:
        None; the result is delivered through ``pos_dict``.
    """
    tagged_tokens = pos_tag(tokenizer.tokenize(txt))
    # The tagged tokens are (token, tag) pairs, which dict.update accepts as-is.
    pos_dict.update(tagged_tokens)

---

In [53]:
# json 파일 불러오기
import json
# Load the aligned transcripts. The encoding is stated explicitly because
# open() otherwise falls back to a platform-dependent default, while JSON
# files are conventionally UTF-8.
with open("test-transcripts-aligned.json", encoding="utf-8") as json_data:
    data = json.load(json_data)

# txt_dataset (list) holds only the 'utterance' field (the content part) of
# every entry stored under each top-level key of the JSON file.
txt_dataset = [
    entry['utterance']
    for entries in data.values()
    for entry in entries
]


In [54]:
# Tag every transcript utterance and merge the result into the shared pos_dict.
for utterance in txt_dataset:
    tokenize_postag(utterance, tokenizer, pos_dict)

---

In [55]:
# nltk.book: download the "book" collection quietly, then star-import
# nltk.book so that the names it exports become globals of this notebook
# (the next cell looks up text1 .. text9 through globals()).
import nltk
nltk.download("book", quiet=True)
from nltk.book import * 

In [72]:
# One vocabulary list (the keys of text.vocab()) per book, for text1 .. text9.
# The texts are fetched by name from the notebook globals.
book_dataset = [
    list(globals()[f"text{idx}"].vocab().keys())  # e.g. ['Moby', 'Dick', '[', '}', ...]
    for idx in range(1, 10)
]

In [82]:
# Clean each vocabulary in place:
#   * drop every token that is not purely alphabetic (str.isalpha() is False),
#   * lowercase tokens written entirely in upper case,
#   * keep all other tokens (including mixed-case ones) unchanged.
# Rebuilding the list in a single pass avoids calling list.remove() once per
# rejected token, which would rescan the list each time.
for book in book_dataset:
    # Slice assignment keeps the same list object inside book_dataset.
    book[:] = [
        token.lower() if token.isupper() else token
        for token in book
        if token.isalpha()
    ]

In [83]:
# POS-tag each book vocabulary and merge the (word, tag) pairs into pos_dict;
# a word that is already present gets its tag overwritten.
for vocabulary in book_dataset:
    pos_dict.update(pos_tag(vocabulary))

---

In [86]:
# nltk.corpus.gutenberg: list the file ids of the corpus, then load the raw
# text of each file in that order.
corpus_name = nltk.corpus.gutenberg.fileids()
corpus_list = [nltk.corpus.gutenberg.raw(file_id) for file_id in corpus_name]

In [90]:
# Tokenize and tag every Gutenberg text, merging the tags into pos_dict.
for raw_text in corpus_list:
    tokenize_postag(raw_text, tokenizer, pos_dict)

---

### Grouping by POS tag, save them

In [91]:
import numpy as np

# Split the words of pos_dict into four groups according to their tag.
# A tag is matched by substring, in this priority order: "VB", "JJ", "NN".
# Words whose tag contains none of these go to the remainder group (RT).
tag_buckets = {"VB": [], "JJ": [], "NN": []}
remainder = []

for word, tag in pos_dict.items():
    for marker, bucket in tag_buckets.items():
        if marker in tag:
            bucket.append(word)
            break
    else:
        # No marker matched this tag.
        remainder.append(word)

VB_list = np.array(tag_buckets["VB"])
NN_list = np.array(tag_buckets["NN"])
JJ_list = np.array(tag_buckets["JJ"])
RT_list = np.array(remainder)

# Store the four word arrays together in one "pos_list" archive.
grouped_words = {
    "VB_list": VB_list,
    "JJ_list": JJ_list,
    "NN_list": NN_list,
    "RT_list": RT_list,
}
np.savez("pos_list", **grouped_words)

### Embed word vector, save it
품사별로 나눠서 embed하지 않고 한 번에 embed

In [92]:
from gensim.models.fasttext import FastText
# Embedding dimensionality used by the FastText models in this notebook.
vector_size = 128

# All words of pos_dict are embedded together, passed as one single "sentence".
# NOTE(review): gensim's documentation mentions that texts longer than 10000
# words are truncated by the optimized training routines -- confirm whether
# that matters for a vocabulary of this size.
all_words = list(pos_dict.keys())
word_vector = FastText(
    [all_words],
    vector_size=vector_size,
    window=3,
    min_count=1,
    workers=1,
)
word_vector.save("word_vector.model")

### Generate ptxt codebook, codebook embedding model, save it

In [94]:
from CodebookModule import Codebook_HorizontalSliding
# Rule that turns a word array into its codebook dictionary.
PW_rule = Codebook_HorizontalSliding
dic_codebook = {}

# POS groups in processing order, and the file each group's model is saved to
# (any group not listed here is saved as "RT_pwd.model").
pos_groups = {'NN_list': NN_list, 'VB_list': VB_list, 'JJ_list': JJ_list, 'RT_list': RT_list}
model_files = {
    'NN_list': "NN_pwd.model",
    'VB_list': "VB_pwd.model",
    'JJ_list': "JJ_pwd.model",
}

for pos_name, pos_list in pos_groups.items():
    # presumably maps each encoded key to the word(s) it stands for; only
    # keys() and item lookup are relied on here -- verify against CodebookModule
    dict_pos = PW_rule(pos_list)

    # Embed the encoded keys, again as one single "sentence".
    encoded_pos_vec = FastText(
        [list(dict_pos.keys())],
        vector_size=vector_size,
        window=3,
        min_count=1,
        workers=1,
    )
    try:
        encoded_pos_vec.save(model_files.get(pos_name, "RT_pwd.model"))
    except FileNotFoundError:
        # Best effort: report and still build the codebook for this group.
        print(f"No such list or model: {pos_list}")

    # One codebook row per encoded key: the word_vector embedding looked up
    # via dict_pos, horizontally stacked with the key's own embedding.
    dic_codebook[pos_name] = [
        np.hstack((word_vector.wv[dict_pos[code]], encoded_pos_vec.wv[code]))
        for code in encoded_pos_vec.wv.index_to_key
    ]

NN_codebook = dic_codebook['NN_list']
VB_codebook = dic_codebook['VB_list']
JJ_codebook = dic_codebook['JJ_list']
RT_codebook = dic_codebook['RT_list']

In [95]:
# Store the four per-POS codebooks together in one "pos_codebook" archive.
codebooks = {
    "VB_codebook": VB_codebook,
    "JJ_codebook": JJ_codebook,
    "NN_codebook": NN_codebook,
    "RT_codebook": RT_codebook,
}
np.savez("pos_codebook", **codebooks)