# Assignment 2

Name: Vivek Mule
Roll: 381072
PRN: 22420145

Perform bag-of-words approach (count occurrence, normalized count occurrence), TF-IDF on data. Create embeddings using Word2Vec

In [1]:
!pip install nltk gensim scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [2]:
#imports
import nltk
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vivek\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# sample dataset

documents = [
    "Using NLTK for text processing",
    "NLTK provides easy-to-use interfaces",
    "Pyton is great for NLP tasks",
    "NLP includes tokenization, stemming, and more"
]

print(documents)


['Using NLTK for text processing', 'NLTK provides easy-to-use interfaces', 'Pyton is great for NLP tasks', 'NLP includes tokenization, stemming, and more']


In [4]:
#Bag-of-Words (Count Occurrence)

count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(documents)

print("Vocabulary:")
print(count_vectorizer.get_feature_names_out())

print("\nBag-of-Words (Count Occurrence):")
print(bow_counts.toarray())

Vocabulary:
['and' 'easy' 'for' 'great' 'includes' 'interfaces' 'is' 'more' 'nlp'
 'nltk' 'processing' 'provides' 'pyton' 'stemming' 'tasks' 'text' 'to'
 'tokenization' 'use' 'using']

Bag-of-Words (Count Occurrence):
[[0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1]
 [0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0]
 [0 0 1 1 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0]]


In [None]:
# Bag-of-Words (Normalized Count Occurrence) (L2 Normalization)
from sklearn.preprocessing import normalize

count_vectorizer_norm = CountVectorizer()
bow_counts = count_vectorizer_norm.fit_transform(documents)
bow_normalized = normalize(bow_counts, norm='l2', axis=1)

print("Vocabulary:")
print(count_vectorizer_norm.get_feature_names_out())

print("\nNormalized Bag-of-Words (L2 Normalization):")
print(bow_normalized.toarray())

Vocabulary:
['ai' 'and' 'are' 'fascinating' 'field' 'is' 'learning' 'machine' 'nlp'
 'of' 'part' 'python' 'the' 'transforming' 'using' 'we' 'world']

Normalized Bag-of-Words (L2 Normalization):
[[0.40824829 0.         0.         0.40824829 0.40824829 0.40824829
  0.         0.         0.40824829 0.40824829 0.         0.
  0.         0.         0.         0.         0.        ]
 [0.37796447 0.37796447 0.37796447 0.         0.         0.
  0.         0.         0.37796447 0.         0.         0.
  0.37796447 0.37796447 0.         0.         0.37796447]
 [0.40824829 0.         0.         0.         0.         0.40824829
  0.40824829 0.40824829 0.         0.40824829 0.40824829 0.
  0.         0.         0.         0.         0.        ]
 [0.         0.         0.40824829 0.         0.         0.
  0.40824829 0.         0.40824829 0.         0.         0.40824829
  0.         0.         0.40824829 0.40824829 0.        ]]


In [5]:
# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())


Vocabulary:
['and' 'easy' 'for' 'great' 'includes' 'interfaces' 'is' 'more' 'nlp'
 'nltk' 'processing' 'provides' 'pyton' 'stemming' 'tasks' 'text' 'to'
 'tokenization' 'use' 'using']

TF-IDF Matrix:
[[0.         0.         0.38274272 0.         0.         0.
  0.         0.         0.         0.38274272 0.48546061 0.
  0.         0.         0.         0.48546061 0.         0.
  0.         0.48546061]
 [0.         0.42176478 0.         0.         0.         0.42176478
  0.         0.         0.         0.3325242  0.         0.42176478
  0.         0.         0.         0.         0.42176478 0.
  0.42176478 0.        ]
 [0.         0.         0.34431452 0.43671931 0.         0.
  0.43671931 0.         0.34431452 0.         0.         0.
  0.43671931 0.         0.43671931 0.         0.         0.
  0.         0.        ]
 [0.42176478 0.         0.         0.         0.42176478 0.
  0.         0.42176478 0.3325242  0.         0.         0.
  0.         0.42176478 0.         0.         0. 

In [None]:
# Tokenize Sentences for Word2Vec
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt_tab')
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
print(tokenized_docs)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


[['nlp', 'is', 'a', 'fascinating', 'field', 'of', 'ai'], ['ai', 'and', 'nlp', 'are', 'transforming', 'the', 'world'], ['machine', 'learning', 'is', 'part', 'of', 'ai'], ['we', 'are', 'learning', 'nlp', 'using', 'python']]


In [None]:
# Train Word2Vec Model
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

In [None]:
# Word Embeddings (Vector Representation)
print("Vector for word 'nlp':")
print(w2v_model.wv['nlp'])

print("\nVector size:", w2v_model.wv.vector_size)

Vector for word 'nlp':
[-8.6207762e-03  3.6693334e-03  5.1942971e-03  5.7473937e-03
  7.4650599e-03 -6.1747055e-03  1.1099416e-03  6.0544759e-03
 -2.8448051e-03 -6.1771143e-03 -4.0740482e-04 -8.3730584e-03
 -5.6021605e-03  7.1088444e-03  3.3524618e-03  7.2231065e-03
  6.8022693e-03  7.5322362e-03 -3.7926226e-03 -5.6901621e-04
  2.3529716e-03 -4.5188968e-03  8.3920360e-03 -9.8620411e-03
  6.7665880e-03  2.9115782e-03 -4.9358848e-03  4.4020070e-03
 -1.7429485e-03  6.7098825e-03  9.9619338e-03 -4.3645748e-03
 -5.9599307e-04 -5.6999368e-03  3.8509031e-03  2.7887921e-03
  6.8953354e-03  6.1001154e-03  9.5395697e-03  9.2723612e-03
  7.8964084e-03 -6.9908407e-03 -9.1608996e-03 -3.5524677e-04
 -3.1000818e-03  7.8951921e-03  5.9356242e-03 -1.5428711e-03
  1.5138414e-03  1.7952634e-03  7.8164274e-03 -9.5088966e-03
 -2.0412030e-04  3.4708700e-03 -9.3429489e-04  8.3816377e-03
  9.0165604e-03  6.5354626e-03 -7.1288715e-04  7.7175130e-03
 -8.5373772e-03  3.2064954e-03 -4.6427255e-03 -5.0939051e-03
 

In [None]:
#Similar Words using Word2Vec
similar_words = w2v_model.wv.most_similar('ai')
print("Words similar to 'ai':")
print(similar_words)


Words similar to 'ai':
[('fascinating', 0.21886461973190308), ('part', 0.21618622541427612), ('the', 0.09310683608055115), ('using', 0.09290151298046112), ('world', 0.07948420941829681), ('machine', 0.06284788995981216), ('field', 0.05455850437283516), ('we', 0.027049632743000984), ('python', 0.016147596761584282), ('nlp', -0.010376579128205776)]
