In [6]:
documents = "My most recent trek to the Himalayas was a test of both stamina and spirit. I will never forget the moment the sun finally broke through the heavy morning mist at the summit, turning the jagged, snow-capped peaks into a shimmering wall of gold. Standing there with my friends, gasping for the thin, cold air, I felt a profound sense of peace that made every grueling mile of the ascent worth it."



In [9]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow = cv.fit_transform([documents]) # Wrap documents in a list

print("Vocabulary:", cv.get_feature_names_out())
print("BoW Matrix:\n", bow.toarray())

Vocabulary: ['air' 'and' 'ascent' 'at' 'both' 'broke' 'capped' 'cold' 'every' 'felt'
 'finally' 'for' 'forget' 'friends' 'gasping' 'gold' 'grueling' 'heavy'
 'himalayas' 'into' 'it' 'jagged' 'made' 'mile' 'mist' 'moment' 'morning'
 'most' 'my' 'never' 'of' 'peace' 'peaks' 'profound' 'recent' 'sense'
 'shimmering' 'snow' 'spirit' 'stamina' 'standing' 'summit' 'sun' 'test'
 'that' 'the' 'there' 'thin' 'through' 'to' 'trek' 'turning' 'wall' 'was'
 'will' 'with' 'worth']
BoW Matrix:
 [[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 4 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 8 1 1 1 1 1 1 1 1 1 1 1]]


In [16]:
!pip install gensim



In [11]:
from sklearn.preprocessing import normalize

bow_normalized = normalize(bow, norm='l1')
print("Normalized BoW:\n", bow_normalized.toarray())


Normalized BoW:
 [[0.01470588 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.01470588 0.02941176 0.01470588
  0.05882353 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.11764706 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588 0.01470588 0.01470588 0.01470588
  0.01470588 0.01470588 0.01470588]]


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([documents])

print("TF-IDF Vocabulary:", tfidf.get_feature_names_out())
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())

TF-IDF Vocabulary: ['air' 'and' 'ascent' 'at' 'both' 'broke' 'capped' 'cold' 'every' 'felt'
 'finally' 'for' 'forget' 'friends' 'gasping' 'gold' 'grueling' 'heavy'
 'himalayas' 'into' 'it' 'jagged' 'made' 'mile' 'mist' 'moment' 'morning'
 'most' 'my' 'never' 'of' 'peace' 'peaks' 'profound' 'recent' 'sense'
 'shimmering' 'snow' 'spirit' 'stamina' 'standing' 'summit' 'sun' 'test'
 'that' 'the' 'there' 'thin' 'through' 'to' 'trek' 'turning' 'wall' 'was'
 'will' 'with' 'worth']
TF-IDF Matrix:
 [[0.08512565 0.08512565 0.08512565 0.08512565 0.08512565 0.08512565
  0.08512565 0.08512565 0.08512565 0.08512565 0.08512565 0.08512565
  0.08512565 0.08512565 0.08512565 0.08512565 0.08512565 0.08512565
  0.08512565 0.08512565 0.08512565 0.08512565 0.08512565 0.08512565
  0.08512565 0.08512565 0.08512565 0.08512565 0.17025131 0.08512565
  0.34050261 0.08512565 0.08512565 0.08512565 0.08512565 0.08512565
  0.08512565 0.08512565 0.08512565 0.08512565 0.08512565 0.08512565
  0.08512565 0.08512565 0.085

In [17]:
!pip install gensim



In [20]:
from gensim.models import Word2Vec


tokenized_docs = [doc.split() for doc in documents.split('.') if doc.strip()]


w2v_model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)


print("Embedding for 'Himalayas':\n", w2v_model.wv["trek"])

Embedding for 'Himalayas':
 [-2.3770889e-03  7.5349063e-03  2.0405215e-03  2.8949510e-03
 -1.2489252e-03 -9.9079786e-03  1.4162252e-03  2.9140005e-03
 -9.8047629e-03 -3.9253761e-03 -2.3415049e-03 -5.0717257e-03
  3.9423816e-03  2.1020854e-03  4.3348530e-03 -6.4793611e-03
 -6.0008075e-03  4.7239116e-03  6.7804609e-03  5.5134068e-03
  4.7985516e-03 -7.2399122e-03  8.1925178e-03 -6.9671525e-03
 -7.6708291e-03  8.5110366e-03 -2.4861631e-03 -9.3940496e-03
 -3.0835455e-03  5.9686652e-03 -2.8900944e-03  4.9250685e-03
  1.2426807e-04  4.0465365e-03  1.1801581e-03 -9.3269181e-03
 -3.6604227e-03  8.8276789e-03  1.5334127e-03 -5.9038545e-03
  7.0107463e-03  7.4519757e-03  7.2361277e-03  1.1844197e-05
 -9.1378922e-03 -3.4896722e-03  5.1250504e-03 -6.5128198e-03
  9.7368658e-03 -9.7647719e-03 -8.1211384e-03 -5.3297840e-03
  6.0181124e-03 -7.6776501e-03  8.6389109e-03 -9.8816641e-03
  3.5088807e-03  3.2840671e-03  5.1903226e-05  9.5459260e-03
  5.1000169e-03  2.7690810e-04  4.8308191e-03  6.4074313e