<a href="https://colab.research.google.com/github/vriadi/CS614-Gen-AI-with-LLMs/blob/main/Topic1_cosine_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict  # specialized dictionary
import string # functions for working with strings

# Toy corpus of three sentences; each string is treated as one document by the
# bag-of-words and TF-IDF steps later in the notebook.
corpus = [
    "Hi everyone my name is Vanessa. I'm interested in this Gen AI LLM Class",
    "Main reason is because this Gen AI LLM would be useful in my product that I'm developing",
    "Another reason is that this is a trendy topic.",
]

In [None]:
def preprocess(text):
    """Normalise a sentence and split it into word tokens.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted (so "I'm" becomes "im"), and the result is split on whitespace.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        A list of lower-case tokens with ASCII punctuation removed.
    """
    # Translation table mapping each punctuation mark to None (i.e. delete it);
    # punctuation is not needed for building the vocab or counting word freq.
    drop_punctuation = str.maketrans('', '', string.punctuation)
    lowered = text.lower()
    # Tokenize: one token per whitespace-separated word.
    return lowered.translate(drop_punctuation).split()

# Tokenise every document in the corpus with preprocess().
proc_corpus = list(map(preprocess, corpus))
print(proc_corpus)
print("Corpus length is", len(proc_corpus))

[['hi', 'everyone', 'my', 'name', 'is', 'vanessa', 'im', 'interested', 'in', 'this', 'gen', 'ai', 'llm', 'class'], ['main', 'reason', 'is', 'because', 'this', 'gen', 'ai', 'llm', 'would', 'be', 'useful', 'in', 'my', 'product', 'that', 'im', 'developing'], ['another', 'reason', 'is', 'that', 'this', 'is', 'a', 'trendy', 'topic']]
Corpus length is 3


In [None]:
# Build vocabulary: the distinct tokens across all tokenised documents,
# stored as a sorted list so every word has a stable position.
vocab = sorted(set().union(*proc_corpus))
print("Vocab:", vocab)
print("Vocab length is", len(vocab))

Vocab: ['a', 'ai', 'another', 'be', 'because', 'class', 'developing', 'everyone', 'gen', 'hi', 'im', 'in', 'interested', 'is', 'llm', 'main', 'my', 'name', 'product', 'reason', 'that', 'this', 'topic', 'trendy', 'useful', 'vanessa', 'would']
Vocab length is 27


In [None]:
def create_bow(sentence, vocab):
    """Build a bag-of-words count vector for one tokenised sentence.

    Args:
        sentence: Iterable of tokens (e.g. the output of ``preprocess``).
        vocab: Sequence of vocabulary words; position ``i`` of the result
            counts occurrences of ``vocab[i]``.

    Returns:
        A list of ints with ``len(vocab)`` entries. Tokens that are not in
        ``vocab`` are ignored.
    """
    # Map each word to its position once, instead of scanning the vocab list
    # twice per token (``in`` + ``.index``). ``setdefault`` keeps the first
    # position for a repeated vocab entry, matching ``list.index``.
    position = {}
    for idx, word in enumerate(vocab):
        position.setdefault(word, idx)

    vector = [0] * len(vocab)
    for word in sentence:
        idx = position.get(word)
        if idx is not None:
            vector[idx] += 1
    return vector

# Vectorise every tokenised document against the shared vocabulary.
bow_vectors = [create_bow(sent, vocab) for sent in proc_corpus]
print("BOW Vectors:")
for vector in bow_vectors:
    print(vector)
# Each vector has one slot per vocabulary word, so report len(vocab) rather
# than reading the loop variable after the loop (NameError on an empty corpus).
print("Vector length is", len(vocab))

BOW Vectors:
[0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]
[0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1]
[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0]
Vector length is 27


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Let scikit-learn learn the vocabulary and build the count matrix straight
# from the raw (un-tokenised) corpus strings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print("Vocabulary:", vectorizer.get_feature_names_out())
print("BoW Representation:", X.toarray(), sep="\n")
# One column per vocabulary entry, so the column count is the vector length.
print("Vector length is", X.shape[1])

Vocabulary: ['ai' 'another' 'be' 'because' 'class' 'developing' 'everyone' 'gen' 'hi'
 'in' 'interested' 'is' 'llm' 'main' 'my' 'name' 'product' 'reason' 'that'
 'this' 'topic' 'trendy' 'useful' 'vanessa' 'would']
BoW Representation:
[[1 0 0 0 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 0 0 0 1 0]
 [1 0 1 1 0 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 0 1 0 1]
 [0 1 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 1 1 1 1 1 0 0 0]]
Vector length is 25


In [None]:
# Compare vocab: which hand-built tokens are absent from CountVectorizer's vocabulary?
# (Per the scikit-learn docs, the default token_pattern only keeps tokens of
# two or more alphanumeric characters and treats punctuation as a separator.)
vocab_manual = set(vocab)
vocab_sklearn = set(vectorizer.get_feature_names_out())

# Iteration order of a set of strings can differ between interpreter runs, so
# sort the difference to keep the printed list reproducible.
missing_in_sklearn = sorted(vocab_manual - vocab_sklearn)
print("Missing in sklearn:", missing_in_sklearn)

Missing in sklearn: ['a', 'im']


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF model on the raw corpus strings.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)
# Report the vocabulary learned by the TF-IDF vectorizer itself; the earlier
# code read it from the CountVectorizer (`vectorizer`), which only works when
# that cell has already run and happens to share the same settings.
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("Count Vocabulary:", len(tfidf_vectorizer.get_feature_names_out()))
print("TF-IDF Representation:")
print(X_tfidf.toarray())

Vocabulary: ['ai' 'another' 'be' 'because' 'class' 'developing' 'everyone' 'gen' 'hi'
 'in' 'interested' 'is' 'llm' 'main' 'my' 'name' 'product' 'reason' 'that'
 'this' 'topic' 'trendy' 'useful' 'vanessa' 'would']
Count Vocabulary: 25
TF-IDF Representation:
[[0.24559104 0.         0.         0.         0.3229227  0.
  0.3229227  0.24559104 0.3229227  0.24559104 0.3229227  0.19072335
  0.24559104 0.         0.24559104 0.3229227  0.         0.
  0.         0.19072335 0.         0.         0.         0.3229227
  0.        ]
 [0.22190169 0.         0.29177405 0.29177405 0.         0.29177405
  0.         0.22190169 0.         0.22190169 0.         0.17232645
  0.22190169 0.29177405 0.22190169 0.         0.29177405 0.22190169
  0.22190169 0.17232645 0.         0.         0.29177405 0.
  0.29177405]
 [0.         0.41166084 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.48626704
  0.         0.         0.         0.         0.         0.31307868

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the hand-built BOW vectors
# (entry [i][j] compares document i with document j).
cosine_sim_bow = cosine_similarity(bow_vectors)

# Show the similarity matrix under its heading.
print("Cosine Similarity Matrix (BOW):", cosine_sim_bow, sep="\n")

Cosine Similarity Matrix (BOW):
[[1.         0.51856298 0.24174689]
 [0.51856298 1.         0.36563621]
 [0.24174689 0.36563621 1.        ]]


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the TF-IDF document vectors
# (entry [i][j] compares document i with document j).
cosine_sim_tfidf = cosine_similarity(X_tfidf)

# Show the similarity matrix under its heading.
print("Cosine Similarity Matrix (TF_IDF):", cosine_sim_tfidf, sep="\n")

Cosine Similarity Matrix (TF_IDF):
[[1.         0.33821868 0.13911372]
 [0.33821868 1.         0.26464039]
 [0.13911372 0.26464039 1.        ]]


# Word2Vec

In [None]:
!pip install scipy numpy gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.2/38.2 MB[0m 

In [None]:
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.utils import simple_preprocess
import gensim.models

In [None]:
# NOTE: despite its name, `wv` holds the training corpus here, not word
# vectors; the trained vectors are reached through `model.wv` afterwards.
wv = api.load('text8')   # wv = api.load('20-newsgroups')
# download the corpus and return it opened as an iterable
# other datasets available at - https://github.com/piskvorky/gensim-data

# min_count=3: per the gensim docs, words seen fewer than 3 times are left out
# of the vocabulary. All other hyper-parameters keep their defaults.
model = Word2Vec(wv,min_count=3)  # train a model from the corpus



In [None]:
# NOTE: this rebinds `vocab`, which earlier cells use for the bag-of-words
# vocabulary; re-run the notebook from the top rather than re-running those
# cells after this one.
vocab = list(model.wv.key_to_index.keys())
# Printing every key of a vocabulary trained on a full corpus floods the
# output, so show its size and a short preview instead.
print("Word2Vec vocab size:", len(vocab))
print(vocab[:20])



In [None]:
# Nearest neighbours of "car" in the trained embedding, returned as
# (word, similarity) pairs.
model.wv.most_similar("car")

[('driver', 0.7839148044586182),
 ('truck', 0.7359927296638489),
 ('cars', 0.731209397315979),
 ('taxi', 0.7097084522247314),
 ('motorcycle', 0.7059168815612793),
 ('vehicle', 0.6812731623649597),
 ('automobile', 0.6628857851028442),
 ('racing', 0.6575387716293335),
 ('passenger', 0.6567563414573669),
 ('cab', 0.6382613778114319)]

In [None]:
# Similarity score between the vectors of two words that are not expected to
# be related; the recorded run produced a value close to zero.
model.wv.similarity('car', 'university')

-0.058388777

In [None]:
# Five closest words to "dog" (topn=5 limits the length of the result list).
model.wv.most_similar(positive=['dog'], topn=5)

[('cat', 0.8239580392837524),
 ('hound', 0.7858039736747742),
 ('pig', 0.7593693137168884),
 ('hamster', 0.7485567331314087),
 ('cow', 0.7462452054023743)]

In [None]:
# Ask the model which word in the list fits least well with the others.
print(model.wv.doesnt_match(['dog', 'cat', 'pig', 'horse', 'university', 'goat']))

university


In [None]:
# Same nearest-neighbour query for a more specialised term, limited to five results.
model.wv.most_similar(positive=['astronomy'], topn=5)

[('astrophysics', 0.7999925017356873),
 ('astronomical', 0.7518278956413269),
 ('optics', 0.7371143102645874),
 ('physics', 0.7364563941955566),
 ('cosmology', 0.7260266542434692)]