# Workshop: Train Word2Vec Model Using NLTK Corpus and Gensim
Train a Word2Vec model with Gensim on the Brown corpus from NLTK, then explore the learned word vectors and save the trained model.

In [1]:

# Install required packages if not installed
!pip install nltk gensim

import nltk
from nltk.corpus import brown
from gensim.models import Word2Vec

# Download NLTK corpus if not present
nltk.download('brown')

# Load sentences from brown corpus
sentences = brown.sents()
print(f"Number of sentences in Brown corpus: {len(sentences)}")




[nltk_data] Downloading package brown to
[nltk_data]     /Users/veerasakkritsanapraphan/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Number of sentences in Brown corpus: 57340


In [2]:

# Training hyperparameters, collected in one place so they are easy to tweak
w2v_params = dict(
    vector_size=100,  # dimensionality of each word vector
    min_count=2,      # ignore words that occur fewer than 2 times in the corpus
    workers=4,        # number of worker threads used during training
    epochs=10,        # number of passes over the corpus
)

# Fit the Word2Vec model on the loaded sentences
model = Word2Vec(sentences, **w2v_params)

print("Model trained.")

Model trained.


## Explore the model

In [3]:

word = "money"

# Look up the learned embedding for the query word and show it under a header
vector = model.wv[word]
print(f"Vector for '{word}':", vector, sep="\n")

# Retrieve the five nearest neighbours of the query word in the embedding space
similar_words = model.wv.most_similar(positive=[word], topn=5)
print(f"Most similar words to '{word}':")
for neighbor, similarity in similar_words:
    # Each entry is a (word, similarity score) pair; show the score to 4 decimals
    print(f"  {neighbor}: {similarity:.4f}")


Vector for 'money':
[ 0.20204382 -0.3970416  -0.72957945 -0.17900383  0.20552394  0.16122836
 -0.34996548 -0.2703475   0.01354433  0.57275677 -1.0623801   1.0993418
  0.32998484  0.464477   -0.3945429  -0.25700504  0.60897547 -0.3958488
 -0.25278345 -0.67258817  0.04035529  1.0840985   0.5336659   1.1214137
  0.03693031 -0.83697164 -0.05911044 -0.62830645 -0.7343833  -0.7157814
  0.6385317  -0.09562686  1.259468   -0.93024045 -0.63455516 -0.09626816
 -0.31110427  0.10619533 -0.88848543 -0.23655832  0.4638314   0.13898766
 -0.40205792  0.18673924  0.89754313  0.04507443 -0.4271935  -0.22454605
  0.7194409   0.39669716  0.57682514 -1.1033375  -1.2135612   0.4046668
 -0.257732   -0.64722204  0.7756729  -1.0083573  -1.1220272  -0.32181594
 -0.7606081   0.7689562  -0.17075035  0.23238744  0.84357196  1.1139731
  0.23416989 -0.47714314  0.53903866  0.05787619  1.0333409   0.6353914
  0.5684768  -0.49905443  0.5255486  -1.0135815   0.6783544  -0.31450823
  0.30832866 -1.0520549  -0.68848246  

## Save the model

In [4]:

# Write the trained model to a file in the current working directory
model_path = "brown_word2vec.model"
model.save(model_path)
print(f"Model saved as '{model_path}'.")


Model saved as 'brown_word2vec.model'.
