In [1]:
import numpy as np # numeric arrays: one-hot vectors and embedding matrices below

# Example sentence used by the one-hot encoding cell; note "very" occurs twice.
a_sentence = "I just want to encode this very very long sentence"

In [2]:
"""
One Hot Encoding

Ex. 
alls well ends well -> [1,0,0,0],[0,1,0,0],[0,0,1,0],[0,1,0,0]

Every vector has one column per distinct word plus one trailing column that
is reserved for unknown (out-of-vocabulary) words.
"""
# Map each distinct word to a column index, in order of first appearance.
vocab = dict()
for word in a_sentence.split(" "):
    if word not in vocab:
        vocab[word] = len(vocab)

# Vector width: all known words plus the spare column for unknown words.
word_count = len(vocab) + 1
# The spare column is the LAST valid index. The previous fallback used
# `word_count` itself, which is one past the end of np.zeros(word_count) and
# would raise IndexError as soon as an unknown word was encoded.
unknown_idx = word_count - 1

sentence_vec = []
for word in a_sentence.split(" "):
    encoded_word = np.zeros(word_count)
    one_idx = vocab.get(word, unknown_idx)
    encoded_word[one_idx] = 1
    sentence_vec.append(encoded_word)
# Last expression of the cell: one row per word of the sentence.
np.stack(sentence_vec)

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])

In [3]:
# Show the word -> column-index mapping; indices follow first appearance in a_sentence.
vocab

{'I': 0,
 'just': 1,
 'want': 2,
 'to': 3,
 'encode': 4,
 'this': 5,
 'very': 6,
 'long': 7,
 'sentence': 8}

In [7]:
# Compare naive whitespace splitting with NLTK's tokenizer on one sentence.
# Splitting on spaces leaves punctuation attached to the neighbouring word.
print("----split----")
b_sentence = "I love this raccoon, this raccoon is very fun"
whitespace_tokens = b_sentence.split(" ")
for token in whitespace_tokens:
    print(token)

print("----NLTK----")
from nltk import word_tokenize
nltk_tokens = word_tokenize(b_sentence)
for token in nltk_tokens:
    print(token)

----split----
I
love
this
raccoon,
this
raccoon
is
very
fun
----NLTK----
I
love
this
raccoon
,
this
raccoon
is
very
fun


In [4]:
"""
Bag of Words

Each column stands for one vocabulary word; the entry is 1 when the word
occurs in the sentence and 0 otherwise.
Vocabulary (alphabetical): by, sea, sells, she, shells, shore, the

Ex. 
sentence 1 => she sells sea shells by the sea shells => [1, 1, 1, 1, 1, 0, 1]
sentence 2 => she sells sea shells by the sea shore => [1, 1, 1, 1, 1, 1, 1]

Count Vectorizer

Same columns, but each entry is the number of times the word occurs.

Ex. 
sentence 1 => she sells sea shells by the sea shells => [1, 2, 1, 1, 2, 0, 1]
sentence 2 => she sells sea shells by the sea shore => [1, 2, 1, 1, 1, 1, 1]
"""
from sklearn.feature_extraction.text import CountVectorizer

sentence_1 = "she sells sea shells by the sea shells"
sentence_2 = "she sells sea shells by the sea shore"
corpus = [sentence_1, sentence_2]

# Learn the vocabulary from both sentences so "shore" gets a column even
# though it never occurs in sentence_1.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)

# Count the words of sentence_1 only; the dense array is the cell's output.
sentence_1_counts = vectorizer.transform([sentence_1])
sentence_1_counts.toarray()

array([[1, 2, 1, 1, 2, 0, 1]], dtype=int64)

In [4]:
"""
Glove Embeddings

Download the file here: https://nlp.stanford.edu/projects/glove/  (https://nlp.stanford.edu/data/glove.6B.zip)
If the previous website is down: 
https://www.kaggle.com/datasets/rtatman/glove-global-vectors-for-word-representation?resource=download&select=glove.6B.50d.txt

Set the GLOVE_PATH environment variable to the location of your local copy of
glove.6B.50d.txt. When it is not set, the original author's path is used.
"""
import os

# Location of the GloVe text file; overridable so the notebook is not tied to
# one machine's directory layout.
glove_path = os.environ.get(
    "GLOVE_PATH",
    "C:\\Users\\arthur\\Documents\\Data Science Club\\cxc-2022\\NLP\\glove.6B.50d.txt",
)
if not os.path.isfile(glove_path):
    # Same exception type open() would raise, with an actionable message.
    raise FileNotFoundError(
        "GloVe file not found at {!r}. Download glove.6B.50d.txt and set the "
        "GLOVE_PATH environment variable to its location.".format(glove_path)
    )

# word -> float32 vector. Each line is parsed as the word followed by the
# whitespace-separated components of its vector.
glove_dict = dict()
with open(glove_path,'r', encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        word = values[0]
        vector = np.asarray(values[1:], np.float32)
        glove_dict[word]=vector

In [5]:
# Stack every loaded vector into one 2-D array (one row per word) and report
# its dimensions as a sanity check on the load.
all_vectors = list(glove_dict.values())
sampled_dict_arr = np.stack(all_vectors)
print(sampled_dict_arr.shape)

(400000, 50)


In [6]:
# Words whose embeddings are compared in the following cells.
target_words = ["rabbit", "dog", "animal", "banana", "orange", "fruit"]

# Row i of target_words_vec is the GloVe vector of target_words[i].
# A word missing from glove_dict raises KeyError here.
target_rows = [glove_dict[name] for name in target_words]
target_words_vec = np.stack(target_rows)

In [8]:
# Sanity check: one row per target word, one column per embedding component.
target_words_vec.shape

(6, 50)

In [40]:
from scipy.spatial import distance

# Key the stacked vectors by their word so comparisons are written with names
# instead of positional indices; reordering target_words can then no longer
# silently mislabel the printed results.
word_to_vec = dict(zip(target_words, target_words_vec))


def cosine_between(word, anchor):
    """Return the cosine distance between the vectors of `word` and `anchor`.

    Both arguments must be entries of `target_words`; an unknown word raises
    KeyError. Smaller values mean the two vectors point in more similar
    directions.
    """
    return distance.cosine(word_to_vec[word], word_to_vec[anchor])


# Distance of every animal/fruit word to the category word "animal".
rabbit_animal = cosine_between("rabbit", "animal")
dog_animal = cosine_between("dog", "animal")

banana_animal = cosine_between("banana", "animal")
orange_animal = cosine_between("orange", "animal")

# Distance of the same words to the category word "fruit".
rabbit_fruit = cosine_between("rabbit", "fruit")
dog_fruit = cosine_between("dog", "fruit")

banana_fruit = cosine_between("banana", "fruit")
orange_fruit = cosine_between("orange", "fruit")

print("----Animals----")
print("Rabbit {} || Dog {} || Banana {} || Orange {}".format(rabbit_animal, dog_animal, banana_animal, orange_animal))

print("----Fruits----")
print("Rabbit {} || Dog {} || Banana {} || Orange {}".format(rabbit_fruit, dog_fruit, banana_fruit, orange_fruit))

----Animals----
Rabbit 0.4237109422683716 || Dog 0.2747744917869568 || Banana 0.6638819873332977 || Orange 0.6706958115100861
----Fruits----
Rabbit 0.5733278393745422 || Dog 0.5328371524810791 || Banana 0.2818593978881836 || Orange 0.4039539098739624


In [48]:
"""
BERT Vectorizer

Tokenize sentence_1 with the pretrained 'bert-base-uncased' tokenizer, first
as a bare list of token ids and then as the full dictionary of model inputs.
"""
import torch
from transformers import BertTokenizer, BertModel

pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

# encode(): only the list of token ids.
token_ids = tokenizer.encode(sentence_1)
print(token_ids)

# encode_plus(): the same ids together with the token type ids and the
# attention mask. The result is kept in tokenized_text.
print("--encode_plus--")
tokenized_text = tokenizer.encode_plus(sentence_1)
print(tokenized_text)

[101, 2016, 15187, 2712, 10986, 2011, 1996, 2712, 10986, 102]
--encode_plus--
{'input_ids': [101, 2016, 15187, 2712, 10986, 2011, 1996, 2712, 10986, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
