# Generate Embedding Model
## 본 과정에서 불러오는 Embedding 모델을 만드는 과정을 담고 있습니다. 

### Download Dataset
데이터셋 다운로드 방법은 추후 추가될 수 있습니다. 현재는 `test-transcripts-aligned.json` 파일이 작업 디렉터리에 있어야 합니다.

In [1]:
# Load the aligned-transcript JSON file.
import json

# Read explicitly as UTF-8 so that parsing does not depend on the platform's
# default locale encoding.
with open("test-transcripts-aligned.json", encoding="utf-8") as json_data:
    data = json.load(json_data)

# txt_dataset (list): only the 'utterance' (content) field of every entry
# stored under every top-level key of the JSON object.
txt_dataset = [entry['utterance'] for entries in data.values() for entry in entries]

# Merge all utterances into a single string. A space is used as the separator
# so that the last word of one utterance cannot fuse with the first word of
# the next one; str.join also avoids repeated string re-allocation.
txt_data = ' '.join(txt_dataset)
# Special case: replace non-breaking spaces (U+00A0) with regular spaces.
txt_data = txt_data.replace('\xa0', ' ')


### Tokenize, POS tag

In [2]:
# RegexpTokenizer: tokenizes text using a regular expression.
from nltk.tokenize import RegexpTokenizer
# Tokenize on runs of \w (word characters: letters, digits, underscore), so
# punctuation and other special characters are dropped. The pattern is a raw
# string so the backslash reaches the regex engine unchanged instead of
# relying on Python's deprecated handling of the invalid escape "\w".
tokenizer = RegexpTokenizer(r"[\w]+")
tokens = tokenizer.tokenize(txt_data)
# Bare expression: displays the token list as the notebook cell output.
tokens

['I',
 'll',
 'pour',
 'this',
 'pestilence',
 'into',
 'his',
 'ear',
 'So',
 'will',
 'I',
 'make',
 'the',
 'net',
 'that',
 'will',
 'enmesh',
 'them',
 'all',
 'It',
 's',
 'an',
 'adult',
 'Iago',
 'who',
 'says',
 'that',
 'in',
 'Othello',
 'And',
 'it',
 's',
 'grownups',
 'that',
 'Machiavelli',
 'was',
 'writing',
 'about',
 'when',
 'he',
 'wrote',
 'The',
 'Prince',
 'his',
 'book',
 'about',
 'manipulating',
 'others',
 'and',
 'seizing',
 'power',
 'Notice',
 'he',
 'titled',
 'the',
 'book',
 'The',
 'Prince',
 'not',
 'The',
 'Little',
 'Prince',
 'The',
 'Little',
 'Prince',
 'is',
 'actually',
 'by',
 'somebody',
 'else',
 'if',
 'you',
 'don',
 't',
 'know',
 'that',
 'But',
 'in',
 'our',
 'American',
 'lives',
 'the',
 'real',
 'era',
 'of',
 'intrigue',
 'and',
 'manipulation',
 'for',
 'most',
 'of',
 'us',
 'is',
 'not',
 'adulthood',
 'It',
 's',
 'adolescence',
 'when',
 'our',
 'social',
 'circle',
 'is',
 'at',
 'its',
 'most',
 'constricting',
 'Today',
 '

In [3]:
# POS tagging: the result is a list of (token, tag) tuples.
from nltk.tag import pos_tag
pos_tokens = pos_tag(tokens)

# Collapse the pairs into a token -> tag dictionary. A token that occurs more
# than once keeps the tag of its last occurrence.
pos_dict = dict(pos_tokens)
# Bare expression: displays the dictionary as the notebook cell output.
pos_dict

{'I': 'PRP',
 'll': 'VBP',
 'pour': 'VB',
 'this': 'DT',
 'pestilence': 'NN',
 'into': 'IN',
 'his': 'PRP$',
 'ear': 'NN',
 'So': 'IN',
 'will': 'MD',
 'make': 'VB',
 'the': 'DT',
 'net': 'JJ',
 'that': 'DT',
 'enmesh': 'VB',
 'them': 'PRP',
 'all': 'DT',
 'It': 'PRP',
 's': 'VBZ',
 'an': 'DT',
 'adult': 'NN',
 'Iago': 'NNP',
 'who': 'WP',
 'says': 'VBZ',
 'in': 'IN',
 'Othello': 'NNP',
 'And': 'CC',
 'it': 'PRP',
 'grownups': 'NNS',
 'Machiavelli': 'NNP',
 'was': 'VBD',
 'writing': 'VBG',
 'about': 'IN',
 'when': 'WRB',
 'he': 'PRP',
 'wrote': 'VBD',
 'The': 'DT',
 'Prince': 'NNP',
 'book': 'NN',
 'manipulating': 'VBG',
 'others': 'NNS',
 'and': 'CC',
 'seizing': 'VBG',
 'power': 'NN',
 'Notice': 'NNP',
 'titled': 'VBN',
 'not': 'RB',
 'Little': 'JJ',
 'is': 'VBZ',
 'actually': 'RB',
 'by': 'IN',
 'somebody': 'NN',
 'else': 'RB',
 'if': 'IN',
 'you': 'PRP',
 'don': 'VBP',
 't': 'JJ',
 'know': 'VBP',
 'But': 'CC',
 'our': 'PRP$',
 'American': 'JJ',
 'lives': 'NNS',
 'real': 'JJ',
 'era

### Grouping by POS tag, save them

In [4]:
import numpy as np

# Tag marker -> words whose tag contains that marker. Markers are tested in
# this order (VB, JJ, NN) and the first match wins.
grouped_words = {"VB": [], "JJ": [], "NN": []}
# Words whose tag contains none of the markers above.
remaining_words = []

for word, tag in pos_dict.items():
    for marker, bucket in grouped_words.items():
        if marker in tag:
            bucket.append(word)
            break
    else:
        # No marker matched: the word goes to the catch-all group.
        remaining_words.append(word)

VB_list = np.array(grouped_words["VB"])
NN_list = np.array(grouped_words["NN"])
JJ_list = np.array(grouped_words["JJ"])
RT_list = np.array(remaining_words)

# Persist the four groups in one archive, keyed by the keyword names below.
np.savez(
    "pos_list",
    VB_list=VB_list,
    JJ_list=JJ_list,
    NN_list=NN_list,
    RT_list=RT_list,
)

### Embed word vector, save it
품사별로 나누어 embed하지 않고, 전체 고유 단어를 한 번에 embed합니다.

In [5]:
from gensim.models.fasttext import FastText
vector_size = 128
# Every unique token (the keys of pos_dict) is passed to FastText as one
# single "sentence"; the words are not split per POS group here.
all_words = list(pos_dict)
word_vector = FastText(
    [all_words],
    vector_size=vector_size,
    window=3,
    min_count=1,
    workers=1,
)
word_vector.save("word_vector.model")

### Generate ptxt codebook, codebook embedding model, save it

In [6]:
from CodebookModule import Codebook_HorizontalSliding
PW_rule = Codebook_HorizontalSliding
dic_codebook = {}

# File each POS group's FastText model is saved to. Any group name that is
# not listed here falls back to the RT file.
model_files = {
    'NN_list': "NN_pwd.model",
    'VB_list': "VB_pwd.model",
    'JJ_list': "JJ_pwd.model",
}
pos_groups = {'NN_list': NN_list, 'VB_list': VB_list, 'JJ_list': JJ_list, 'RT_list': RT_list}

for pos_name, pos_list in pos_groups.items():
    # NOTE(review): PW_rule presumably returns a mapping from each generated
    # code ("pwd") to its source word -- the word_vector.wv[dict_pos[pwd]]
    # lookup below relies on that; confirm against CodebookModule.
    dict_pos = PW_rule(pos_list)
    # As with the word model, all codes of the group form one "sentence".
    code_sentences = [list(dict_pos.keys())]
    encoded_pos_vec = FastText(
        code_sentences,
        vector_size=vector_size,
        window=3,
        min_count=1,
        workers=1,
    )
    try:
        encoded_pos_vec.save(model_files.get(pos_name, "RT_pwd.model"))
    except FileNotFoundError:
        print(f"No such list or model: {pos_list}")

    # One codebook row per code: the source word's vector followed by the
    # code's own vector, concatenated with np.hstack.
    dic_codebook[pos_name] = [
        np.hstack((word_vector.wv[dict_pos[pwd]], encoded_pos_vec.wv[pwd]))
        for pwd in encoded_pos_vec.wv.index_to_key
    ]

NN_codebook, VB_codebook, JJ_codebook, RT_codebook = (
    dic_codebook[group] for group in ('NN_list', 'VB_list', 'JJ_list', 'RT_list')
)