### WordNet 맛보기

In [1]:
! pip install nltk



In [2]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wousi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import wordnet

In [4]:
wordnet.synsets('car')

[Synset('car.n.01'),
 Synset('car.n.02'),
 Synset('car.n.03'),
 Synset('car.n.04'),
 Synset('cable_car.n.01')]

In [5]:
# Look up the synset named 'car.n.01' and display its definition text.
car = wordnet.synset('car.n.01')
car.definition()

'a motor vehicle with four wheels; usually propelled by an internal combustion engine'

In [6]:
car.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

In [11]:
# Hypernyms (broader terms):
# the semantic is-a hierarchy that relates this word to other words.
# Show the first hypernym path of the 'car' synset.
car.hypernym_paths()[0] 

[Synset('entity.n.01'),
 Synset('physical_entity.n.01'),
 Synset('object.n.01'),
 Synset('whole.n.02'),
 Synset('artifact.n.01'),
 Synset('instrumentality.n.03'),
 Synset('container.n.01'),
 Synset('wheeled_vehicle.n.01'),
 Synset('self-propelled_vehicle.n.01'),
 Synset('motor_vehicle.n.01'),
 Synset('car.n.01')]

In [12]:
# Fetch the '.n.01' synset of four words, then score 'car' against 'novel'
# with WordNet path similarity.
car = wordnet.synset('car.n.01')
novel = wordnet.synset('novel.n.01')
dog = wordnet.synset('dog.n.01')
motorcycle = wordnet.synset('motorcycle.n.01')
car.path_similarity(novel)

0.05555555555555555

In [13]:
car.path_similarity(dog)

0.07692307692307693

In [15]:
car.path_similarity(motorcycle)

0.3333333333333333

## 2.3 통계 기반 기법

In [16]:
# Lower-case the sample sentence and put a space before each period so that
# the period becomes a separate token when splitting on spaces.
text = 'You say goodbye and I say hello.'.lower().replace('.', ' .')
text

'you say goodbye and i say hello .'

In [17]:
# Tokenize by splitting on single spaces.
words = text.split(' ')
words

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']

In [18]:
# Give every distinct word an integer id in order of first appearance, and
# keep the reverse (id -> word) lookup in step with it.
word_to_id = {}
id_to_word = {}
for word in words:
    if word in word_to_id:
        # Already numbered on an earlier occurrence.
        continue
    new_id = len(word_to_id)
    word_to_id[word] = new_id
    id_to_word[new_id] = word

In [19]:
id_to_word

{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}

In [20]:
word_to_id

{'you': 0, 'say': 1, 'goodbye': 2, 'and': 3, 'i': 4, 'hello': 5, '.': 6}

In [21]:
id_to_word[1]

'say'

In [23]:
word_to_id['hello']

5

In [24]:
import numpy as np 
# Replace each word by its id and store the sequence as a NumPy array.
corpus = np.array([word_to_id[w] for w in words])
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

In [27]:
def preprocess(text):
    """Convert raw text into a corpus of word ids plus vocabulary lookups.

    The text is lower-cased, a space is inserted before every '.' so that
    the period becomes its own token, and the result is split on single
    spaces. Ids are assigned in order of first appearance, starting at 0.

    Args:
        text: Input string.

    Returns:
        A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
        1-D NumPy array of word ids in text order, ``word_to_id`` maps each
        word to its id, and ``id_to_word`` is the inverse mapping.
    """
    text = text.lower()
    text = text.replace('.', ' .')
    words = text.split(' ')
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    # Build the list first: handing np.array a bare generator produces a
    # 0-d object array wrapping the generator, not an array of ids.
    corpus = np.array([word_to_id[w] for w in words])
    return corpus, word_to_id, id_to_word

In [28]:
# Run preprocess on the sample sentence and unpack its three results.
text = 'You say goodbye and I say hello.'
corpus , word_to_id, id_to_word = preprocess(text)

## 2.3.2. 단어의 분산표현(distributional representation)
- 분포 가설: 단어의 의미는 주변 단어에 의해 형성된다.
- 단어 자체에는 의미가 없고, 그 단어가 사용된 맥락(context)이 의미를 형성한다는 것이다.
- 맥락: 주변에 놓인 단어를 가리킨다.

In [29]:
# Co-occurrence matrix
import sys
# Add the parent directory to the import path (presumably where the
# `common` package lives — confirm against the repository layout).
sys.path.append('..')
import numpy as np
# NOTE: this import rebinds `preprocess`, replacing the function of the same
# name defined earlier in this notebook.
from common.util import preprocess
text = 'You say goodbye and I say hello.'
corpus , word_to_id, id_to_word = preprocess(text)

In [30]:
# Show the id sequence and the id -> word lookup.
print(corpus)
print(id_to_word)

[0 1 2 3 4 1 5 6]
{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}


In [33]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    """Build a co-occurrence count matrix from a corpus of word ids.

    For every position in ``corpus``, each neighbour lying within
    ``window_size`` positions on either side adds one to the entry
    ``[word id, neighbour id]``.

    Args:
        corpus: Sequence of integer word ids.
        vocab_size: Number of rows and columns of the result.
        window_size: How many positions to inspect on each side.

    Returns:
        An int32 NumPy array of shape ``(vocab_size, vocab_size)``.
    """
    n_tokens = len(corpus)
    counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for pos, center_id in enumerate(corpus):
        for offset in range(1, window_size + 1):
            # Left neighbour first, then right; skip whichever falls
            # outside the corpus.
            for neighbour_pos in (pos - offset, pos + offset):
                if 0 <= neighbour_pos < n_tokens:
                    counts[center_id, corpus[neighbour_pos]] += 1
    return counts

In [34]:
# NOTE(review): the vocabulary has 7 ids (0-6); passing 8 leaves an all-zero
# final row and column in the result. len(word_to_id) would size it exactly.
create_co_matrix(corpus,8)

array([[0, 1, 0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [36]:
def cos_similarity(x, y, eps=1e-8):
    """Return the cosine similarity of two vectors.

    Each vector is divided by the square root of (its sum of squares plus
    ``eps``), and the dot product of the two scaled vectors is returned.
    The ``eps`` term keeps the divisor non-zero for an all-zero vector.

    Args:
        x: First vector (NumPy array).
        y: Second vector (NumPy array).
        eps: Small constant added to the sum of squares before the sqrt.

    Returns:
        The dot product of the two scaled vectors.
    """
    x_scale = np.sqrt(np.sum(x ** 2) + eps)
    y_scale = np.sqrt(np.sum(y ** 2) + eps)
    return np.dot(x / x_scale, y / y_scale)