In [1]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
!pwd

/content


In [3]:
%cd /gdrive/MyDrive/TAVE_NLP/GloVe

/gdrive/MyDrive/TAVE_NLP/GloVe


In [4]:
# coding: utf-8
import sys
import os
sys.path.append('..')
try:
    import urllib.request
except ImportError:
    raise ImportError('Use Python3!')
import pickle
import numpy as np


url_base = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'
key_file = {
    'train':'ptb.train.txt',
    'test':'ptb.test.txt',
    'valid':'ptb.valid.txt'
}
save_file = {
    'train':'ptb.train.npy',
    'test':'ptb.test.npy',
    'valid':'ptb.valid.npy'
}
vocab_file = 'ptb.vocab.pkl'

dataset_dir = os.path.dirname(os.path.abspath('/gdrive/MyDrive/TAVE_NLP/GloVe'))


def _download(file_name):
    file_path = dataset_dir + '/' + file_name
    if os.path.exists(file_path):
        return

    print('Downloading ' + file_name + ' ... ')

    try:
        urllib.request.urlretrieve(url_base + file_name, file_path)
    except urllib.error.URLError:
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
        urllib.request.urlretrieve(url_base + file_name, file_path)

    print('Done')


def load_vocab():
    vocab_path = dataset_dir + '/' + vocab_file

    if os.path.exists(vocab_path):
        with open(vocab_path, 'rb') as f:
            word_to_id, id_to_word = pickle.load(f)
        return word_to_id, id_to_word

    word_to_id = {}
    id_to_word = {}
    data_type = 'train'
    file_name = key_file[data_type]
    file_path = dataset_dir + '/' + file_name

    _download(file_name)

    words = open(file_path).read().replace('\n', '<eos>').strip().split()

    for i, word in enumerate(words):
        if word not in word_to_id:
            tmp_id = len(word_to_id)
            word_to_id[word] = tmp_id
            id_to_word[tmp_id] = word

    with open(vocab_path, 'wb') as f:
        pickle.dump((word_to_id, id_to_word), f)

    return word_to_id, id_to_word


def load_data(data_type='train'):
    '''
        :param data_type: データの種類：'train' or 'test' or 'valid (val)'
        :return:
    '''
    if data_type == 'val': data_type = 'valid'
    save_path = dataset_dir + '/' + save_file[data_type]

    word_to_id, id_to_word = load_vocab()

    if os.path.exists(save_path):
        corpus = np.load(save_path)
        return corpus, word_to_id, id_to_word

    file_name = key_file[data_type]
    file_path = dataset_dir + '/' + file_name
    _download(file_name)

    words = open(file_path).read().replace('\n', '<eos>').strip().split()
    corpus = np.array([word_to_id[w] for w in words])

    np.save(save_path, corpus)
    return corpus, word_to_id, id_to_word


if __name__ == '__main__':
    for data_type in ('train', 'val', 'test'):
        load_data(data_type)


In [15]:
# <eos>, <unk> 토큰 제거 함수
def remove_token(id_corpus, word_to_id):
  eos_id=word_to_id['<eos>']
  unk_id=word_to_id['<unk>']
  
  new_id_corpus=[]
  
  for id in id_corpus:
    if (eos_id == id) or (unk_id == id):
      pass
    else:
      new_id_corpus.append(id)
  
  return new_id_corpus



# GloVe 라이브러리 입력을 위해 단어로 된 corpus 생성 함수
def id_to_word_transfer(id_corpus, word_to_id, id_to_word):
  
  id_corpus=remove_token(id_corpus, word_to_id)

  word_corpus=[]
  for id in id_corpus:
    if id_to_word[id] != None:
      word_corpus.append(id_to_word[id])

  return word_corpus

In [38]:
id_corpus, word_to_id, id_to_word = load_data('train') 
id_corpus = id_corpus.tolist()

word_corpus=id_to_word_transfer(id_corpus, word_to_id, id_to_word)
word_corpus[:20]

['aer',
 'banknote',
 'berlitz',
 'calloway',
 'centrust',
 'cluett',
 'fromstein',
 'gitano',
 'guterman',
 'hydro-quebec',
 'ipo',
 'kia',
 'memotec',
 'mlx',
 'nahb',
 'punts',
 'rake',
 'regatta',
 'rubens',
 'sim']

In [39]:
# 토큰 잘 제거됐는지 확인
print('<eos>' in word_corpus)
print('<unk>' in word_corpus)

False
False


In [52]:
# shape 확인
word_corpus=np.array(word_corpus)
word_corpus.shape

(842501,)

In [59]:
# row number 설정
num_row=int(np.sqrt(word_corpus.shape[0]))
num_row

917

In [60]:
# row number by row number 로 reshape
word_corpus=word_corpus[:num_row*num_row]
word_corpus=word_corpus.reshape(num_row, num_row)
word_corpus.shape

(917, 917)

In [61]:
# 입력을 위해 다시 list type으로 변경
word_corpus=word_corpus.tolist()
type(id_corpus)

list

In [44]:
pip install glove_python_binary



In [45]:
from glove import Corpus, Glove

In [62]:
corpus=Corpus()
corpus.fit(word_corpus, window=5)
glove=Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=20, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

Performing 20 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


In [66]:
glove.most_similar('consumer')

[('futures', 0.8868279985934512),
 ('contracts', 0.8340962663477058),
 ('construction', 0.8040763858887952),
 ('corporate', 0.7973266660082352)]

In [68]:
corpus.dictionary

{'aer': 0,
 'banknote': 1,
 'berlitz': 2,
 'calloway': 3,
 'centrust': 4,
 'cluett': 5,
 'fromstein': 6,
 'gitano': 7,
 'guterman': 8,
 'hydro-quebec': 9,
 'ipo': 10,
 'kia': 11,
 'memotec': 12,
 'mlx': 13,
 'nahb': 14,
 'punts': 15,
 'rake': 16,
 'regatta': 17,
 'rubens': 18,
 'sim': 19,
 'snack-food': 20,
 'ssangyong': 21,
 'swapo': 22,
 'wachter': 23,
 'pierre': 24,
 'N': 25,
 'years': 26,
 'old': 27,
 'will': 28,
 'join': 29,
 'the': 30,
 'board': 31,
 'as': 32,
 'a': 33,
 'nonexecutive': 34,
 'director': 35,
 'nov.': 36,
 'mr.': 37,
 'is': 38,
 'chairman': 39,
 'of': 40,
 'n.v.': 41,
 'dutch': 42,
 'publishing': 43,
 'group': 44,
 'rudolph': 45,
 'and': 46,
 'former': 47,
 'consolidated': 48,
 'gold': 49,
 'fields': 50,
 'plc': 51,
 'was': 52,
 'named': 53,
 'this': 54,
 'british': 55,
 'industrial': 56,
 'conglomerate': 57,
 'form': 58,
 'asbestos': 59,
 'once': 60,
 'used': 61,
 'to': 62,
 'make': 63,
 'kent': 64,
 'cigarette': 65,
 'filters': 66,
 'has': 67,
 'caused': 68,
 'hi