Jaccard Similarity

In [None]:
#Text embedding
sentence_1 = "mountain is very beautiful to see"
sentence_2 = "gargeous to see mountain"
def jaccard_similarity(sentence_1, sentence_2):
  intersection_sen = len(set.intersection(*[set(sentence_1),set(sentence_2)]))
  union_sen = len(set.union(*[set(sentence_1),set(sentence_2)]))
  return intersection_sen/float(union_sen)

sentence_1.split()
sentence_2.split()
print(jaccard_similarity(sentence_1, sentence_2))

0.6470588235294118


In [None]:
!pip install spacy
import spacy
import numpy as np
# Load a spaCy language model (you need to download one first, e.g., 'en_core_web_sm')
nlp = spacy.load('en_core_web_sm')



In [None]:
from math import sqrt

def squared_sen(x):
    return round(sqrt(sum([a*a for a in x])), 3)

def vector_magnitudes(vectors):
    return [squared_sen(vector) for vector in vectors]

embeddings_1 = [nlp(sentence).vector for sentence in sentence_1]
magnitudes_1 = vector_magnitudes(embeddings_1)
print(magnitudes_1)

embeddings_2 = [nlp(sentence).vector for sentence in sentence_2]
magnitudes_2 = vector_magnitudes(embeddings_2)
print(magnitudes_2)


[9.252, 8.921, 8.985, 8.541, 7.91, 7.577, 9.138, 8.541, 13.141, 9.138, 8.636, 13.141, 8.44, 9.627, 8.043, 7.49, 13.141, 8.113, 9.627, 7.577, 8.985, 7.91, 9.138, 8.245, 8.985, 7.869, 13.141, 7.91, 8.921, 13.141, 8.636, 9.627, 9.627]
[8.695, 7.577, 8.043, 8.695, 9.627, 8.921, 8.985, 8.636, 13.141, 7.91, 8.921, 13.141, 8.636, 9.627, 9.627, 13.141, 9.252, 8.921, 8.985, 8.541, 7.91, 7.577, 9.138, 8.541]


Euclidean Distance

In [None]:
from math import sqrt

def euclidean_distance(vec1, vec2):
    return round(sqrt(sum((a - b) ** 2 for a, b in zip(vec1, vec2))), 3)

embeddings_1 = [nlp(sentence).vector for sentence in sentence_1]
embeddings_2 = [nlp(sentence).vector for sentence in sentence_2]

distances = [euclidean_distance(vec1, vec2) for vec1, vec2 in zip(embeddings_1, embeddings_2)]
print(distances)


[8.679, 9.23, 7.545, 10.457, 8.267, 9.23, 8.49, 10.543, 0.0, 10.142, 8.042, 0.0, 8.704, 0.0, 7.013, 15.654, 15.919, 6.362, 6.51, 9.171, 7.724, 9.105, 0.0, 9.338]


Vector Converting

In [None]:
# vectorize converting

import spacy
from sklearn.feature_extraction.text import CountVectorizer

# Load a spaCy language model (you need to download one first, e.g., 'en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

#Text embedding
sentence_1 = "mountain is very beautiful to see"
sentence_2 = "gargeous to see mountain"

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the sentences
vectorizer.fit([sentence_1, sentence_2])

# Transform the sentences into vectors
vector_1 = vectorizer.transform([sentence_1])
vector_2 = vectorizer.transform([sentence_2])

# Print the vectors
print("Vector for sentence 1:")
print(vector_1.toarray())
print("\nVector for sentence 2:")
print(vector_2.toarray())


Vector for sentence 1:
[[1 0 1 1 1 1 1]]

Vector for sentence 2:
[[0 1 0 1 1 1 0]]


cosine similarity

In [None]:
#cosine similarity

def cosine_similarity(vec1, vec2):
  dot_product = sum(a * b for a, b in zip(vec1, vec2))
  magnitude_vec1 = sqrt(sum(a * a for a in vec1))
  magnitude_vec2 = sqrt(sum(b * b for b in vec2))
  return dot_product / (magnitude_vec1 * magnitude_vec2)
vector1 = embeddings_1[0]
vector2 = embeddings_2[0]
similarity = cosine_similarity(vector1, vector2)
print("Cosine Similarity:", similarity)

Cosine Similarity: 0.5337411579499725


bag of words

In [None]:

sentence_1 = "mountain is very beautiful to see"
sentence_2 = "gargeous to see mountain"
vocab = set(sentence_1.split() + sentence_2.split())

def create_bow_vector(sentence, vocab):
  vector = [0] * len(vocab)
  for word in sentence.split():
    if word in vocab:
      vector[list(vocab).index(word)] += 1
  return vector

bow_vector_1 = create_bow_vector(sentence_1, vocab)
bow_vector_2 = create_bow_vector(sentence_2, vocab)

print("Bag-of-Words Vector for sentence 1:", bow_vector_1)
print("Bag-of-Words Vector for sentence 2:", bow_vector_2)


Bag-of-Words Vector for sentence 1: [1, 1, 1, 1, 1, 0, 1]
Bag-of-Words Vector for sentence 2: [0, 0, 1, 1, 1, 1, 0]


Continous Bag of words

In [None]:
# continos bag of words

def generate_context_word_pairs(corpus, window_size):
  context_word_pairs = []
  for sentence in corpus:
    for i, word in enumerate(sentence):
      context = []
      # Get words before the target word
      for j in range(i - window_size, i):
        if j >= 0:
          context.append(sentence[j])
      # Get words after the target word
      for j in range(i + 1, i + window_size + 1):
        if j < len(sentence):
          context.append(sentence[j])
      if context:  # Only add pairs with non-empty context
        context_word_pairs.append((context, word))

  return context_word_pairs

# Example usage:
corpus = [
    ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"],
    ["a", "cat", "sat", "on", "the", "mat"]
]

window_size = 2
context_word_pairs = generate_context_word_pairs(corpus, window_size)

for context, word in context_word_pairs:
  print(f"Context: {context}, Word: {word}")

# Now you can use these context-word pairs to train a CBOW model.
# This would involve creating a vocabulary, mapping words to indices,
# and building a neural network that predicts the target word given its context.


Context: ['quick', 'brown'], Word: the
Context: ['the', 'brown', 'fox'], Word: quick
Context: ['the', 'quick', 'fox', 'jumps'], Word: brown
Context: ['quick', 'brown', 'jumps', 'over'], Word: fox
Context: ['brown', 'fox', 'over', 'the'], Word: jumps
Context: ['fox', 'jumps', 'the', 'lazy'], Word: over
Context: ['jumps', 'over', 'lazy', 'dog'], Word: the
Context: ['over', 'the', 'dog'], Word: lazy
Context: ['the', 'lazy'], Word: dog
Context: ['cat', 'sat'], Word: a
Context: ['a', 'sat', 'on'], Word: cat
Context: ['a', 'cat', 'on', 'the'], Word: sat
Context: ['cat', 'sat', 'the', 'mat'], Word: on
Context: ['sat', 'on', 'mat'], Word: the
Context: ['on', 'the'], Word: mat


Text clustering

In [None]:
k-means clustering for above programing

from sklearn.cluster import KMeans

# Assuming you have your embeddings in a list called 'embeddings'
# For example, using the spaCy embeddings from your previous code:
embeddings = [nlp(sentence).vector for sentence in [sentence_1, sentence_2]]

# Choose the number of clusters (k)
k = 2

# Create a KMeans object
kmeans = KMeans(n_clusters=k)

# Fit the model to your embeddings
kmeans.fit(embeddings)

# Get the cluster assignments for each sentence
labels = kmeans.labels_
print("Cluster Assignments:", labels)

# Get the cluster centers
centers = kmeans.cluster_centers_
print("Cluster Centers:", centers)


Cluster Assignments: [0 1]
Cluster Centers: [[-0.13056597 -0.10324321  0.04716581 -0.17158373 -0.91761899 -0.29834393
   0.71635962  0.03759912 -0.10324943  0.3631146   0.09690655 -0.76376706
  -0.16356631  0.56602079 -0.66687679  0.13439052  0.34797394 -0.700562
  -0.29152271  0.22376704 -0.16747396  0.09409789 -0.78555465 -0.07938088
  -0.08531129 -0.64172858  0.72307581  0.22541553 -0.19762675  0.35831079
  -0.09355607  0.4766798   0.3932066  -0.08393378  0.49032846 -0.25784597
  -0.34539744 -0.32916462  0.39670733  0.30233672  0.02041661 -0.31373152
   0.08226699  0.48066464  0.37098348 -0.08990838  0.09107318 -0.1868359
   0.39018884  0.01386564 -0.57389456  0.08593389 -0.32066861 -0.3436034
  -0.1747634   0.55364507 -0.08726767  0.3616676  -0.15308623 -0.03415532
  -0.07015631 -0.28114811 -0.27613759 -0.02510474  0.35842291  0.49779788
  -0.18734144 -0.62526011  1.11536539  0.12866455  0.02784591 -0.48017403
  -0.56910115  0.48310483 -0.38741454  0.01555627  0.00234882 -0.5560624

  super()._check_params_vs_input(X, default_n_init=10)
