# MIND Utils Generation
많은 뉴스 추천법들이 워드 임베딩, 뉴스 vertical embedding, 뉴스 subvertical embedding, 유저 id 임베딩을 활용하므로 단어 사전, vertical 사전, subvertical 사전과 user id 사전을 만들어 각각을 문자열에서 색인으로 변환할 필요가 있다.   
To use pretrained word embeddings, an embedding matrix is generated as the initial weight of the word embedding layer.   
- word_dict.pkl: 뉴스 제목에 있는 단어들을 색인으로 변환한다.
- word_dict_all.pkl: 뉴스 제목과 요약에 있는 단어들을 색인으로 변환한다.
- embedding.npy: `word_dict.pkl`에 있는 단어들의 사전 훈련된 워드 임베딩 행렬
- embedding_all.npy: `word_dict_all.pkl`에 있는 단어들의 사전 훈련된 워드 임베딩 행렬
- vert_dict.pkl: convert news verticals into indexes.
- subvert_dict.pkl: convert news subverticals into indexes.
- uid2index.pkl: 유저 id들을 색인으로 변환한다.

In [1]:
import sys
import os
import pickle
import pandas as pd
import numpy as np
import scrapbook as sb
from collections import Counter
from tqdm import tqdm
from tempfile import TemporaryDirectory
from recommenders.datasets.mind import download_and_extract_glove, download_mind, extract_mind, load_glove_matrix, word_tokenize
from recommenders.datasets.download_utils import unzip_file

In [2]:
# Run configuration: the MIND size passed to download_mind, and the embedding
# width later passed to load_glove_matrix.
mind_type = 'demo'
word_emb_dim = 300 # should be in [50, 100, 200, 300]

# Everything is downloaded into and generated under a temporary directory
# (it is cleaned up at the end of the notebook via tmpdir.cleanup()).
tmpdir = TemporaryDirectory()
data_path = tmpdir.name
output_path = os.path.join(data_path, 'utils')

# Download both archives, then unpack each into its own sub-directory while
# keeping the zip files in place.
train_zip, valid_zip = download_mind(size=mind_type, dest_path=data_path)
for archive, split in ((train_zip, 'train'), (valid_zip, 'valid')):
    unzip_file(archive, os.path.join(data_path, split), clean_zip_file=False)

# Directory that receives the generated dictionaries and embedding matrices.
os.makedirs(output_path, exist_ok=True)

100%|█████████████████████████████████████| 17.0k/17.0k [00:08<00:00, 2.11kKB/s]
100%|█████████████████████████████████████| 9.84k/9.84k [00:05<00:00, 1.82kKB/s]


## Prepare utils of news
- word dictionary
- vertical dictionary
- subvertical dictionary

In [3]:
# Column names are supplied explicitly for news.tsv; only the four columns
# used to build the dictionaries are actually loaded.
news_columns = ['newid', 'vertical', 'subvertical', 'title','abstract',
                'url', 'entities in title', 'entities in abstract']
kept_columns = ['vertical', 'subvertical', 'title', 'abstract']
news_tsv = os.path.join(data_path, 'train', 'news.tsv')

news = pd.read_table(news_tsv, names=news_columns, usecols=kept_columns)
news.head()

Unnamed: 0,vertical,subvertical,title,abstract
0,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the..."
1,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...
2,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi..."
3,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re..."
4,weather,weathertopstories,It's been Orlando's hottest October ever so fa...,There won't be a chill down to your bones this...


In [11]:
# Vertical -> index. Unique verticals are numbered in order of first
# appearance, starting at 1 (index 0 is never assigned).
news_vertical = news.vertical.drop_duplicates().reset_index(drop=True)
vert_dict_inv = news_vertical.to_dict()
vert_dict = {name: idx + 1 for idx, name in vert_dict_inv.items()}

# Subvertical -> index, built the same way. The inverse mapping must come
# from the de-duplicated series: using the raw `news.subvertical` column
# would key it by row number, producing sparse indexes as large as the number
# of news rows instead of 1..len(subvert_dict).
news_subvertical = news.subvertical.drop_duplicates().reset_index(drop=True)
subvert_dict_inv = news_subvertical.to_dict()
subvert_dict = {name: idx + 1 for idx, name in subvert_dict_inv.items()}

# Replace the raw strings with token lists.
news.title = news.title.apply(word_tokenize)
news.abstract = news.abstract.apply(word_tokenize)

# word_cnt counts title tokens only; word_cnt_all counts title + abstract.
word_cnt = Counter()
word_cnt_all = Counter()

# Iterate the two columns directly instead of doing a label lookup per row.
token_rows = zip(news.title, news.abstract)
for i, (title_tokens, abstract_tokens) in enumerate(tqdm(token_rows, total=len(news))):
    word_cnt.update(title_tokens)
    word_cnt_all.update(title_tokens)
    if i < 3:
        # Preview for the first three rows: counts after adding the title.
        print('a', word_cnt_all)
    word_cnt_all.update(abstract_tokens)
    if i < 3:
        # Preview for the first three rows: counts after adding the abstract.
        print('b', word_cnt_all)

# Word -> index, numbered from 1 in the order words were first counted.
word_dict = {word: idx for idx, word in enumerate(word_cnt, start=1)}
word_dict_all = {word: idx for idx, word in enumerate(word_cnt_all, start=1)}

  6%|██                                  | 1568/26740 [00:00<00:03, 7833.49it/s]

a Counter({',': 2, 'prince': 2, 'the': 1, 'brands': 1, 'queen': 1, 'elizabeth': 1, 'charles': 1, 'and': 1, 'philip': 1, 'swear': 1, 'by': 1})
b Counter({',': 4, 'the': 3, 'prince': 2, 'and': 2, 'brands': 1, 'queen': 1, 'elizabeth': 1, 'charles': 1, 'philip': 1, 'swear': 1, 'by': 1, 'shop': 1, 'notebooks': 1, 'jackets': 1, 'more': 1, 'that': 1, 'royals': 1, 'can': 1, 't': 1, 'live': 1, 'without': 1, '.': 1})
a Counter({'the': 5, ',': 4, 'prince': 2, 'and': 2, 'of': 2, 's': 2, 'brands': 1, 'queen': 1, 'elizabeth': 1, 'charles': 1, 'philip': 1, 'swear': 1, 'by': 1, 'shop': 1, 'notebooks': 1, 'jackets': 1, 'more': 1, 'that': 1, 'royals': 1, 'can': 1, 't': 1, 'live': 1, 'without': 1, '.': 1, 'cost': 1, 'trump': 1, 'aid': 1, 'freeze': 1, 'in': 1, 'trenches': 1, 'ukraine': 1, 'war': 1})
b Counter({'the': 7, ',': 5, '.': 4, 'of': 4, 'prince': 2, 'and': 2, 's': 2, 'in': 2, 'ukraine': 2, 'war': 2, 'to': 2, 'brands': 1, 'queen': 1, 'elizabeth': 1, 'charles': 1, 'philip': 1, 'swear': 1, 'by': 1, '

100%|███████████████████████████████████| 26740/26740 [00:03<00:00, 7569.31it/s]


In [14]:
# Pickle each lookup table into output_path under its own file name.
news_dicts = {
    'vert_dict.pkl': vert_dict,
    'subvert_dict.pkl': subvert_dict,
    'word_dict.pkl': word_dict,
    'word_dict_all.pkl': word_dict_all,
}

for file_name, mapping in news_dicts.items():
    with open(os.path.join(output_path, file_name), 'wb') as f:
        pickle.dump(mapping, f)

## Prepare embedding matrices
- embedding.npy : numpy array 파일
- embedding_all.npy

In [15]:
# Obtain the GloVe files, using data_path as the destination; the returned
# path is what load_glove_matrix receives in the next cell.
# NOTE(review): presumably this downloads and unpacks the archive under
# data_path -- confirm against the helper's implementation.
glove_path = download_and_extract_glove(data_path)

100%|███████████████████████████████████████| 842k/842k [03:19<00:00, 4.22kKB/s]


In [16]:
# Build one embedding matrix per vocabulary: title-only words first, then
# title + abstract words. Each call also returns the collection of
# vocabulary words found in GloVe, whose length is reported in utils_state.
embedding_matrix, exist_word = load_glove_matrix(
    glove_path, word_dict, word_emb_dim
)
embedding_all_matrix, exist_all_word = load_glove_matrix(
    glove_path, word_dict_all, word_emb_dim
)

# Write both matrices into output_path as .npy files.
matrices_to_save = (
    ('embedding.npy', embedding_matrix),
    ('embedding_all.npy', embedding_all_matrix),
)
for file_name, matrix in matrices_to_save:
    np.save(os.path.join(output_path, file_name), matrix)

400000it [00:04, 92361.28it/s] 
400000it [00:05, 78075.08it/s] 


## Prepare uid2index.pkl

In [17]:
# User id -> index, numbered from 1 in order of first appearance in the
# training behaviors file.
uid2index = {}

behaviors_path = os.path.join(data_path, 'train', 'behaviors.tsv')
with open(behaviors_path, 'r') as f:
    for line in tqdm(f):
        # The second tab-separated field of each line is taken as the user id.
        fields = line.strip('\n').split('\t')
        uid = fields[1]
        # setdefault leaves ids that were already seen untouched.
        uid2index.setdefault(uid, len(uid2index) + 1)

22034it [00:00, 721356.38it/s]


In [19]:
# Write the user-id lookup into output_path, the same 'utils' directory that
# holds every other generated dictionary and embedding matrix, rather than
# the root of data_path.
with open(os.path.join(output_path, 'uid2index.pkl'), 'wb') as f:
    pickle.dump(uid2index, f)

In [20]:
# Summary of what was generated: each entry records the size of one of the
# collections built above.
tracked = (
    ('vert_num', vert_dict),
    ('subvert_num', subvert_dict),
    ('word_num', word_dict),
    ('word_num_all', word_dict_all),
    ('embedding_exist_num', exist_word),
    ('embedding_exist_num_all', exist_all_word),
    ('uid2index', uid2index),
)
utils_state = {label: len(collection) for label, collection in tracked}

# Record the summary in the notebook through scrapbook.
sb.glue('utils_state', utils_state)

# Remove the temporary directory together with everything written into it.
tmpdir.cleanup()