In [None]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import gensim
from gensim.models import Word2Vec
import nltk

# Fetch the NLTK data packages 'punkt_tab' and 'stopwords' when this cell runs.
# NOTE(review): presumably these back word_tokenize and stopwords.words used
# in preprocess_text — confirm against the NLTK version in use.
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Toy corpus: five short English sentences, one document per list entry.
corpus = [
    "Natural Language Processing is fun and exciting.",
    "Machine learning and deep learning are subsets of AI.",
    "The field of NLP involves computer science, linguistics, and statistics.",
    "Word embeddings like Word2Vec capture semantic relationships between words.",
    "Data preprocessing is a crucial step in machine learning workflows."
]

def preprocess_text(text, stop_words=None):
    """Lower-case, strip punctuation, tokenize and drop stopwords.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Tokens to discard after tokenization. When omitted, NLTK's English
        stopword list is used.

    Returns
    -------
    tuple of (str, list of str)
        The surviving tokens joined by single spaces, and the same tokens
        as a list.
    """
    if stop_words is None:
        # Build the set once per call: a set gives O(1) membership tests and
        # avoids re-creating the stopword list for every single token.
        stop_words = set(stopwords.words('english'))

    text = text.lower()
    # Delete every character listed in string.punctuation. A translation
    # table is used instead of a hand-assembled regex character class
    # because the unescaped sequence '\]' inside such a class is parsed as an
    # escaped ']', which leaves backslashes in the text.
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(tokens), tokens


# Run each document through preprocess_text exactly once, then split the
# resulting (joined_text, tokens) pairs into two parallel lists.
preprocessed_pairs = [preprocess_text(raw_doc) for raw_doc in corpus]
processed_corpus = [joined_text for joined_text, _ in preprocessed_pairs]
tokenized_corpus = [token_list for _, token_list in preprocessed_pairs]

# Show the cleaned documents, one per line.
print("Processed Corpus:")
for cleaned_doc in processed_corpus:
    print(cleaned_doc)


Processed Corpus:
natural language processing fun exciting
machine learning deep learning subsets ai
field nlp involves computer science linguistics statistics
word embeddings like word2vec capture semantic relationships words
data preprocessing crucial step machine learning workflows


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Bag-of-words: fit a CountVectorizer on the cleaned documents and keep the
# resulting document-term count matrix.
count_vectorizer = CountVectorizer()
bow_counts = count_vectorizer.fit_transform(processed_corpus)

# Wrap the counts in a DataFrame whose columns are the fitted feature names.
bow_feature_names = count_vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_counts.toarray(), columns=bow_feature_names)

print("Bag-of-Words (Raw Counts):", bow_df, sep="\n")


Bag-of-Words (Raw Counts):
   ai  capture  computer  crucial  data  deep  embeddings  exciting  field  \
0   0        0         0        0     0     0           0         1      0   
1   1        0         0        0     0     1           0         0      0   
2   0        0         1        0     0     0           0         0      1   
3   0        1         0        0     0     0           1         0      0   
4   0        0         0        1     1     0           0         0      0   

   fun  ...  relationships  science  semantic  statistics  step  subsets  \
0    1  ...              0        0         0           0     0        0   
1    0  ...              0        0         0           0     0        1   
2    0  ...              0        1         0           1     0        0   
3    0  ...              1        0         1           0     0        0   
4    0  ...              0        0         0           0     1        0   

   word  word2vec  words  workflows  
0     0  

In [None]:
def normalize_counts(count_matrix):
    """Convert per-document term counts into per-document proportions.

    Parameters
    ----------
    count_matrix : sparse matrix or 2-D array-like
        Counts with one row per document. Objects exposing ``.toarray()``
        are densified first; anything else is passed to ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape in which each row is divided by its
        own sum. Rows whose sum is zero are returned as all zeros.
    """
    dense = count_matrix.toarray() if hasattr(count_matrix, "toarray") else count_matrix
    counts = np.asarray(dense, dtype=float)
    row_sums = counts.sum(axis=1, keepdims=True)

    # Divide only where the row sum is non-zero. This keeps the proportions
    # exact (no epsilon added to the denominator) and leaves empty rows at 0
    # instead of producing NaN or a divide-by-zero warning.
    return np.divide(
        counts,
        row_sums,
        out=np.zeros_like(counts),
        where=row_sums != 0,
    )

# Row-normalize the raw counts and label the columns with the vectorizer's
# feature names so the table lines up with the raw-count table.
normalized_bow = normalize_counts(bow_counts)
normalized_feature_names = count_vectorizer.get_feature_names_out()
normalized_bow_df = pd.DataFrame(data=normalized_bow, columns=normalized_feature_names)

print("\nBag-of-Words (Normalized Counts):", normalized_bow_df, sep="\n")



Bag-of-Words (Normalized Counts):
         ai  capture  computer   crucial      data      deep  embeddings  \
0  0.000000    0.000  0.000000  0.000000  0.000000  0.000000       0.000   
1  0.166667    0.000  0.000000  0.000000  0.000000  0.166667       0.000   
2  0.000000    0.000  0.142857  0.000000  0.000000  0.000000       0.000   
3  0.000000    0.125  0.000000  0.000000  0.000000  0.000000       0.125   
4  0.000000    0.000  0.000000  0.142857  0.142857  0.000000       0.000   

   exciting     field  fun  ...  relationships   science  semantic  \
0       0.2  0.000000  0.2  ...          0.000  0.000000     0.000   
1       0.0  0.000000  0.0  ...          0.000  0.000000     0.000   
2       0.0  0.142857  0.0  ...          0.000  0.142857     0.000   
3       0.0  0.000000  0.0  ...          0.125  0.000000     0.125   
4       0.0  0.000000  0.0  ...          0.000  0.000000     0.000   

   statistics      step   subsets   word  word2vec  words  workflows  
0    0.000000  0

In [None]:
# Word2Vec hyperparameters, forwarded to the constructor below.
embedding_size = 100   # passed as vector_size
window_size = 5        # passed as window
min_word_count = 1     # passed as min_count
workers = 4            # passed as workers

# Train on the tokenized documents. sg=1 selects the skip-gram training
# algorithm according to the gensim Word2Vec documentation.
# NOTE(review): gensim documents that runs with more than one worker thread
# are not guaranteed to be bit-for-bit reproducible — confirm whether that
# matters for this notebook.
w2v_model = Word2Vec(sentences=tokenized_corpus,
                     vector_size=embedding_size,
                     window=window_size,
                     min_count=min_word_count,
                     workers=workers,
                     sg=1)

# Single source of truth for the query word; every lookup and message below
# is derived from it so they cannot drift apart.
word = 'nlp'

if word in w2v_model.wv:
    print(f"\nWord2Vec embedding for '{word}':")
    print(w2v_model.wv[word])
else:
    print(f"\nWord '{word}' not found in vocabulary.")

print(f"\nWords most similar to '{word}':")
if word in w2v_model.wv:
    similar_words = w2v_model.wv.most_similar(word, topn=5)
    # Use a distinct loop name so the query `word` is not overwritten.
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.4f}")
else:
    print(f"Word '{word}' not found in vocabulary.")



Word2Vec embedding for 'nlp':
[-1.9442164e-03 -5.2675214e-03  9.4471136e-03 -9.2987325e-03
  4.5039477e-03  5.4041781e-03 -1.4092624e-03  9.0070926e-03
  9.8853596e-03 -5.4750429e-03 -6.0210000e-03 -6.7469729e-03
 -7.8948820e-03 -3.0479168e-03 -5.5940272e-03 -8.3446801e-03
  7.8290224e-04  2.9946566e-03  6.4147436e-03 -2.6289499e-03
 -4.4534765e-03  1.2495709e-03  3.9146186e-04  8.1169987e-03
  1.8280029e-04  7.2315861e-03 -8.2645155e-03  8.4335366e-03
 -1.8889094e-03  8.7011540e-03 -7.6168370e-03  1.7963862e-03
  1.0564864e-03  4.6005251e-05 -5.1032533e-03 -9.2476979e-03
 -7.2642174e-03 -7.9511739e-03  1.9137275e-03  4.7846674e-04
 -1.8131376e-03  7.1201660e-03 -2.4756920e-03 -1.3473093e-03
 -8.9005642e-03 -9.9254129e-03  8.9493981e-03 -5.7539381e-03
 -6.3729975e-03  5.1994072e-03  6.6699935e-03 -6.8316413e-03
  9.5975993e-04 -6.0084737e-03  1.6473436e-03 -4.2892788e-03
 -3.4407973e-03  2.1856665e-03  8.6615775e-03  6.7281104e-03
 -9.6770572e-03 -5.6221043e-03  7.8803329e-03  1.98935