In [2]:
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
!pip install gensim
from gensim.models import Word2Vec
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m98.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [3]:
documents = [
    "I love natural language processing",
    "Natural language processing is amazing",
    "I love machine learning",
    "Machine learning and NLP are related"
]


In [4]:
count_vectorizer = CountVectorizer()
bow_count = count_vectorizer.fit_transform(documents)

print("Vocabulary:")
print(count_vectorizer.get_feature_names_out())

print("\nCount Occurrence Matrix:")
print(bow_count.toarray())

Vocabulary:
['amazing' 'and' 'are' 'is' 'language' 'learning' 'love' 'machine'
 'natural' 'nlp' 'processing' 'related']

Count Occurrence Matrix:
[[0 0 0 0 1 0 1 0 1 0 1 0]
 [1 0 0 1 1 0 0 0 1 0 1 0]
 [0 0 0 0 0 1 1 1 0 0 0 0]
 [0 1 1 0 0 1 0 1 0 1 0 1]]


In [6]:
tfidf_vectorizer = TfidfVectorizer(norm='l2')
bow_normalized = tfidf_vectorizer.fit_transform(documents)

print("\nNormalized Count Matrix (TF-IDF):")
print(bow_normalized.toarray())


Normalized Count Matrix (TF-IDF):
[[0.         0.         0.         0.         0.5        0.
  0.5        0.         0.5        0.         0.5        0.        ]
 [0.50867187 0.         0.         0.50867187 0.40104275 0.
  0.         0.         0.40104275 0.         0.40104275 0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.57735027 0.57735027 0.         0.         0.         0.        ]
 [0.         0.43671931 0.43671931 0.         0.         0.34431452
  0.         0.34431452 0.         0.43671931 0.         0.43671931]]


In [7]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

print("TF-IDF Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())


TF-IDF Vocabulary:
['amazing' 'and' 'are' 'is' 'language' 'learning' 'love' 'machine'
 'natural' 'nlp' 'processing' 'related']

TF-IDF Matrix:
[[0.         0.         0.         0.         0.5        0.
  0.5        0.         0.5        0.         0.5        0.        ]
 [0.50867187 0.         0.         0.50867187 0.40104275 0.
  0.         0.         0.40104275 0.         0.40104275 0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.57735027 0.57735027 0.         0.         0.         0.        ]
 [0.         0.43671931 0.43671931 0.         0.         0.34431452
  0.         0.34431452 0.         0.43671931 0.         0.43671931]]


In [9]:
import nltk
nltk.download('punkt_tab') # Download the missing resource
tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
print(tokenized_docs)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...


[['i', 'love', 'natural', 'language', 'processing'], ['natural', 'language', 'processing', 'is', 'amazing'], ['i', 'love', 'machine', 'learning'], ['machine', 'learning', 'and', 'nlp', 'are', 'related']]


[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [10]:
w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)


In [11]:
vector_nlp = w2v_model.wv['nlp']
print("Embedding for word 'nlp':")
print(vector_nlp)
print("Vector size:", len(vector_nlp))


Embedding for word 'nlp':
[-0.00516529 -0.00666611 -0.00777445  0.00832348 -0.0019855  -0.00686369
 -0.00414805  0.00514986 -0.00288222 -0.00375776  0.00163125 -0.00278467
 -0.00157543  0.00106845 -0.0029711   0.00852681  0.00391159 -0.00997239
  0.00625557 -0.00677056  0.00076876  0.00441551 -0.00509217 -0.00211932
  0.00809817 -0.0042443  -0.00764569  0.00925809 -0.0021554  -0.0047261
  0.00858085  0.00428055  0.00433615  0.00929944 -0.00845911  0.00526827
  0.00204263  0.0041959   0.00169704  0.00445951  0.0044902   0.00611132
 -0.00320902 -0.00457812 -0.00042772  0.0025304  -0.00326847  0.00606732
  0.00415291  0.0077703   0.00256765  0.00812078 -0.00138923  0.00808172
  0.00372094 -0.00805493 -0.00393644 -0.00247479  0.00490311 -0.00087734
 -0.00283884  0.00783519  0.00934181 -0.00161813 -0.00517465 -0.00470158
 -0.00485904 -0.00960407  0.00136252 -0.00423184  0.00252485  0.00562758
 -0.00406335 -0.00959374  0.00154251 -0.00670499  0.00249388 -0.00379233
  0.00707454  0.00063953  

In [12]:
print("Words similar to 'language':")
print(w2v_model.wv.most_similar('language'))


Words similar to 'language':
[('love', 0.13158591091632843), ('are', 0.07523085176944733), ('machine', 0.06794975697994232), ('i', 0.04181476682424545), ('amazing', 0.04126745089888573), ('and', 0.013051219284534454), ('is', -0.009268556721508503), ('processing', -0.013450139202177525), ('related', -0.013759840279817581), ('natural', -0.044627055525779724)]
