# natural language processing from scratch

# Make corpus

## import setting

In [1]:
from collections import Counter, defaultdict
import numpy as np

## split sentence into word

In [2]:
text = "You say goodbye and I say hello."

In [3]:
# note that space should have to be between first `'` and `(`
text=text.lower()
text=text.replace('.',' .')
splitted_text=text.split(" ")
print(splitted_text)

['you', 'say', 'goodbye', 'and', 'i', 'say', 'hello', '.']


## convert each word to id

In [4]:
word_to_id={}
for w in splitted_text:
    if not w in word_to_id:
        new_id=len(word_to_id)
        word_to_id[w] = new_id

id_to_word = {i:w for i,w in word_to_id.items()}

In [5]:
id_to_word

{'.': 6, 'and': 3, 'goodbye': 2, 'hello': 5, 'i': 4, 'say': 1, 'you': 0}

## make corpus

In [6]:
corpus = np.array([word_to_id[w] for w in splitted_text])
corpus

array([0, 1, 2, 3, 4, 1, 5, 6])

## create co-occurence matrix

In [7]:
def create_co_matrix(corpus, vocab_size, window_size=1):
    corpus_size = len(corpus)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)
    
    for c_idx, w_idx in enumerate(corpus):
        for i in range(1, window_size + 1):
            left_idx = c_idx-i
            right_idx = c_idx+i
            if not left_idx < 0:
                left_w_idx = corpus[left_idx]
                co_matrix[w_idx, left_w_idx] += 1
            if not right_idx >= corpus_size:
                right_w_idx = corpus[right_idx]
                co_matrix[w_idx, right_w_idx] += 1
    return co_matrix

In [8]:
co_matrix = create_co_matrix(corpus,len(word_to_id))

## cosine similarity

In [9]:
def cosine_similarity(xs, ys, eps=1e-8):
    # to prevent zero division we add eps to denominator
    normed_xs = xs/(np.linalg.norm(xs)+eps)
    normed_ys = ys/(np.linalg.norm(ys)+eps)
    cos_sim = np.dot(normed_xs,normed_ys)
    return cos_sim

In [10]:
you_vector = co_matrix[word_to_id["you"]]
i_vector = co_matrix[word_to_id["i"]]
cosine_similarity(you_vector, i_vector)

0.7071067691154799

In [11]:
cosine_similarity(np.ones(5),np.ones(5))

0.999999991055728