## Testing Basic Language Models

#### Test Words

In [3]:
# Word pairs whose similarity is scored in the GloVe and BERT cells.
word_pairs = [
    ('learn', 'learning'),
    ('india', 'indian'),
    ('fame', 'famous'),
]

In [24]:
# One extra pair of words, compared on its own (separately from word_pairs).
target, user = "apartment", "condo"

#### GloVe Pretrained Vector Embedding Model

In [1]:
from gensim.models import KeyedVectors
from gensim.downloader import load

In [9]:
# Fetch the pretrained 'glove-wiki-gigaword-50' vectors via gensim's downloader.
glove_model = load('glove-wiki-gigaword-50')

# Score every (first, second) pair with the loaded vectors' similarity() method.
for first_word, second_word in word_pairs:
    similarity = glove_model.similarity(first_word, second_word)
    print(f"Similarity between '{first_word}' and '{second_word}' using GloVe: {similarity:.3f}")

Similarity between 'learn' and 'learning' using GloVe: 0.802
Similarity between 'india' and 'indian' using GloVe: 0.865
Similarity between 'fame' and 'famous' using GloVe: 0.589


In [25]:
# Same GloVe comparison as for word_pairs, applied to the standalone target/user pair.
similarity = glove_model.similarity(target, user)
glove_message = f"Similarity between '{target}' and '{user}' using GloVe: {similarity:.3f}"
print(glove_message)

Similarity between 'apartment' and 'condo' using GloVe: 0.741


#### Pretrained BERT

In [2]:
from transformers import BertTokenizer, BertModel
import torch
 
# The encoder and its tokenizer are both loaded from the same checkpoint id,
# so the vocabulary used for tokenization matches the model weights.
model_name = 'bert-base-uncased'
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Score each word pair by the cosine similarity of the two words' [CLS] vectors.
for pair in word_pairs:
    # The two words of a pair are tokenized as a batch of two sequences.
    # padding=True is required here: without it, words that split into a
    # different number of word pieces produce ragged input_ids and building
    # the 'pt' tensor raises an error. For equal-length inputs it is a no-op.
    tokens = tokenizer(pair, padding=True, return_tensors='pt')
    with torch.no_grad():  # inference only; no gradients are needed
        outputs = model(**tokens)

    # Position 0 of every sequence is taken as the [CLS] embedding; row 0 is
    # the first word of the pair and row 1 the second.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    similarity = torch.nn.functional.cosine_similarity(cls_embedding[0], cls_embedding[1], dim=0)

    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using BERT: {similarity:.3f}")

Similarity between 'learn' and 'learning' using BERT: 0.930
Similarity between 'india' and 'indian' using BERT: 0.957
Similarity between 'fame' and 'famous' using BERT: 0.956


In [26]:
# Compare the standalone target/user pair using BERT [CLS] embeddings.
# padding=True keeps the two-sequence batch rectangular even when the two
# words split into a different number of word pieces; without it, building
# the 'pt' tensor fails for such pairs.
tokens = tokenizer((target, user), padding=True, return_tensors='pt')
with torch.no_grad():  # inference only; no gradients are needed
    outputs = model(**tokens)

# Position 0 of every sequence is taken as the [CLS] embedding; row 0 belongs
# to `target` and row 1 to `user`.
cls_embedding = outputs.last_hidden_state[:, 0, :]

similarity = torch.nn.functional.cosine_similarity(cls_embedding[0], cls_embedding[1], dim=0)

print(f"Similarity between '{target}' and '{user}' using BERT: {similarity:.3f}")

Similarity between 'apartment' and 'condo' using BERT: 0.984


#### Sentence Transformer

In [27]:
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer
# NOTE: this rebinds `model`, which up to this point referred to the BERT
# encoder; re-running the BERT cells after this line would call the
# SentenceTransformer object instead.
sbert_checkpoint = 'all-MiniLM-L6-v2'
model = SentenceTransformer(sbert_checkpoint)

In [30]:
# Candidate sentences, each compared against the test sentence below.
sentences = [
    "Do you take any supplements or medications?",
    "How are you feeling?",
    "You should do jumping jacks for the pain.",
]


test = "Are you on any blood thinning medications?"
print('Test sentence:', test)
# encode() is given a one-element list, so take the single resulting vector.
test_vec = model.encode([test])[0]


for candidate in sentences:
    candidate_vec = model.encode([candidate])[0]
    # scipy's cosine() is a distance; subtracting it from 1 gives the similarity.
    cosine_sim = 1 - distance.cosine(test_vec, candidate_vec)
    print(f'\n{candidate}\nSimilarity Score = {cosine_sim} ')

Test sentence: Are you on any blood thinning medications?

Do you take any supplements or medications?
Similarity Score = 0.6106548491315501 

How are you feeling?
Similarity Score = 0.23295672830552505 

You should do jumping jacks for the pain.
Similarity Score = 0.08043158294056607 
