In [8]:
import numpy as np
from gensim.corpora.dictionary import Dictionary
from sklearn.decomposition import PCA
import random

with open('raw_sentences.txt', 'r') as f:
    corpus = f.read().split()

In [2]:
def q21_cooccur_matrix(filename='raw_sentences.txt', window=2):
    '''
    Arguments
    filename: the filename of an English article
    window: context window，定義「上下文」的範圍
    Returns
    vocab: 將單字對應到 id
    inv_vocab: 將 id 對應到單字
    cooccur_matrix: NxN np.ndarray，代表 co-occurrence matrix
    '''
    with open(filename, 'r') as f:
        corpus = f.read().split()
    vocab = Dictionary([corpus], prune_at=float('inf'))
    corpus = vocab.doc2idx(corpus)
    inv_vocab = vocab.token2id
    cooccur_matrix = np.zeros([len(vocab)]*2)
    for i in range(window, len(corpus)-window):
        cooccur_matrix[ corpus[i], corpus[i-window:i] ] += 1
        cooccur_matrix[ corpus[i], corpus[i+1:i+window+1] ] += 1
    return vocab, inv_vocab, cooccur_matrix

vocab, inv_vocab, cooccur_matrix = q21_cooccur_matrix()

In [3]:
def q22_word_vectors(cooccur_matrix, dim=10):
    '''
    Arguments
    cooccur_matrix: co-occurrence matrix with shape (N, N)
    dim: PCA的維度，預設10維
    Returns
    word_vectors: word vector matrix with shape (N, 10)
    '''
    return PCA(dim).fit_transform(cooccur_matrix)

word_vectors = q22_word_vectors(cooccur_matrix)

In [9]:
def q23_similarity(word, wv=word_vectors, vocab=vocab, inv_vocab=inv_vocab):
    '''
    Arguments
    word: 要找的單字
    wv: word vector matrix with shape (N, 10)
    vocab: vocabulary
    inv_vocab: inverse vocabulary
    Returns
    ret: 長度為3的list，每個元素都是 tuple (word, similarity)
    '''
    wid = inv_vocab[word]
    norm = np.linalg.norm(wv, axis=1)
    dist = np.dot(wv, wv[wid]) / norm / norm[wid]
    sim = 1 - dist
    ret_id = np.argpartition(dist, 3)[:3]
    return [(vocab[i], sim[i]) for i in ret_id]

word = random.choice(corpus)
print(word)
for word, sim in q23_similarity(word):
    print(word, sim)

that
into 1.9781645994259374
during 1.9695623932155994
set 1.969410576564934


In [11]:
class WordVector(object):
    def __init__(self, filename, window=2, dim=10):
        self.vocab, self.inv_vocab, cooccur_matrix = q21_cooccur_matrix(filename, window)
        self.wv = q22_word_vectors(cooccur_matrix)
    def __getitem__(self, word):
        return self.inv_vocab[word]
    def most_similar(self, word):
        return q23_similarity(word, wv=self.wv, vocab=self.vocab, inv_vocab=self.inv_vocab)

wv = WordVector('raw_sentences.txt')
print(wv['office'])
for word, sim in wv.most_similar('office'):
    print(word, sim)

547
that 1.959660751700665
like 1.964935416358181
it 1.9585093742259674
