# Bag-of-Words Approach

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Toy corpus: three short sentences fed to the vectorizer examples below.
corpus = [
    "I love machine learning",
    "Machine learning is fascinating",
    "I love programming",
]

In [2]:
# Step 1: Count Vectorizer
# Learn the vocabulary from `corpus` and build the document-term count matrix
# (one row per document, one column per vocabulary term).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Convert to a dense array so the counts can be printed and divided element-wise.
word_counts = X.toarray()

# Normalize each row by its total so every non-empty document sums to 1.
# A document that produces no tokens has a row total of 0; a plain division
# would turn that row into NaN and emit a RuntimeWarning, so those rows are
# left as all zeros instead.
row_totals = word_counts.sum(axis=1, keepdims=True)
normalized_counts = np.divide(
    word_counts,
    row_totals,
    out=np.zeros(word_counts.shape, dtype=float),
    where=row_totals != 0,
)

In [3]:
# Show the raw count matrix first, then its row-normalized counterpart.
for heading, matrix in (
    ("Word counts:\n", word_counts),
    ("Normalized word counts:\n", normalized_counts),
):
    print(heading, matrix)

Word counts:
 [[0 0 1 1 1 0]
 [1 1 1 0 1 0]
 [0 0 0 1 0 1]]
Normalized word counts:
 [[0.         0.         0.33333333 0.33333333 0.33333333 0.        ]
 [0.25       0.25       0.25       0.         0.25       0.        ]
 [0.         0.         0.         0.5        0.         0.5       ]]


# TF-IDF (Term Frequency-Inverse Document Frequency)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
# Step 2: TF-IDF Vectorizer
# Fit the vectorizer on the corpus and weight the same documents in one call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Densify the result before printing so the full matrix is displayed.
tfidf_dense = tfidf_matrix.toarray()
print("TF-IDF Matrix:\n", tfidf_dense)

TF-IDF Matrix:
 [[0.         0.         0.57735027 0.57735027 0.57735027 0.        ]
 [0.5628291  0.5628291  0.42804604 0.         0.42804604 0.        ]
 [0.         0.         0.         0.60534851 0.         0.79596054]]


# Word2Vec Embeddings

In [6]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
# Sample corpus: the same three sentences as in the bag-of-words section,
# declared again here ahead of tokenization.
corpus = [
    "I love machine learning",
    "Machine learning is fascinating",
    "I love programming",
]

In [8]:
# Step 3: Tokenize sentences
# Lower-case each sentence, then split it into word tokens (one token list per sentence).
# NOTE(review): word_tokenize presumably needs NLTK tokenizer data to be installed
# beforehand; no download step is visible in this notebook -- confirm on a fresh kernel.
tokenized_corpus = list(map(word_tokenize, map(str.lower, corpus)))

In [9]:
# Train Word2Vec model on the tokenized sentences.
#   vector_size=100 -> each word is embedded as a 100-dimensional vector
#   window=5        -> context window passed to the model
#   min_count=1     -> keep words even if they occur only once in this tiny corpus
#   sg=0            -> CBOW training (sg=1 would select skip-gram)
# Reproducibility: training is stochastic and multi-threaded by default, so the
# vectors can differ between runs. The seed is spelled out (1 is gensim's default)
# and training is limited to a single worker, which gensim's documentation
# requires for deterministic results. Across interpreter launches the docs
# additionally call for a fixed PYTHONHASHSEED.
model = Word2Vec(
    tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
    seed=1,
    workers=1,
)

In [10]:
# Example: fetch the trained vector for the token 'machine' from the model's
# keyed vectors and print it.
keyed_vectors = model.wv
word_embedding = keyed_vectors['machine']
print("Word embedding for 'machine':", word_embedding)

Word embedding for 'machine': [-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.088955