# Step-by-Step Implementation of Word2Vec

In [1]:
!pip install gensim



## Step 1: Data Preparation

In [2]:
# Toy corpus: five short, hand-written sentences, one string per document.
# It is only large enough to demonstrate the pipeline, not to learn
# meaningful embeddings.
corpus = [
    "Word embeddings are essential for NLP tasks.",
    "Word2Vec is a popular word embedding technique.",
    "Text preprocessing is necessary to clean the data.",
    "Training Word2Vec models can be done with Gensim.",
    "Word embeddings capture semantic relationships.",
]

## Step 2: Preprocessing

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download the NLTK resources used below (skipped if already up to date):
#   - 'stopwords': word lists read by stopwords.words('english')
#   - 'punkt' / 'punkt_tab': sentence-tokenizer data used by word_tokenize.
#     Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt'
#     models, so both are requested to work across NLTK versions.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Tokenize and clean the text

# Stop-word sets keyed by language, filled lazily so the NLTK word list is
# read from disk once instead of on every preprocess_text() call.
_STOP_WORDS_BY_LANGUAGE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word list for `language` as a cached frozenset."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Turn one raw sentence into a list of cleaned, lowercase tokens.

    Steps, in order:
      1. Split `text` into tokens with NLTK's `word_tokenize`.
      2. Keep only tokens for which `str.isalpha()` is true.
      3. Lowercase the surviving tokens.
      4. Drop tokens found in NLTK's English stop-word list.

    NOTE: step 2 removes more than punctuation. Any token containing a
    digit or other non-letter character fails `isalpha()`, so a token such
    as "Word2Vec" is discarded entirely.

    Args:
        text: The sentence or document to preprocess, as a string.

    Returns:
        A list of lowercase alphabetic tokens with English stop words
        removed, in their original order (duplicates are kept).

    Raises:
        LookupError: Raised by NLTK if the tokenizer or stop-word data has
            not been downloaded.
    """
    stop_words = _stop_words_for('english')

    tokens = []
    for token in word_tokenize(text):
        # The alphabetic check runs on the original token, before lowercasing,
        # exactly as in the two-pass version of this function.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            tokens.append(lowered)

    return tokens

# Run every document through preprocess_text, giving one token list per sentence
preprocessed_corpus = list(map(preprocess_text, corpus))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Step 3: Training

In [4]:
from gensim.models import Word2Vec

# Train Word2Vec model
#   vector_size=100 : dimensionality of each word vector
#   window=5        : context window size passed to gensim
#   sg=1            : use the skip-gram architecture (0 would select CBOW)
#   min_count=1     : keep every word; the corpus is tiny, so nothing is pruned
#   workers=1       : single training thread, so results do not depend on
#                     OS thread scheduling
#   seed=1          : gensim's default seed, stated explicitly
# For bit-for-bit reproducibility across interpreter launches, gensim also
# requires PYTHONHASHSEED to be fixed before the kernel starts.
model = Word2Vec(
    sentences=preprocessed_corpus,
    vector_size=100,
    window=5,
    sg=1,
    min_count=1,
    workers=1,
    seed=1,
)

## Step 4: Embedding Generation

In [5]:
# Look up the trained vector for the token 'word' in the model's keyed vectors
keyed_vectors = model.wv
word_embedding = keyed_vectors['word']
print("Word Embedding for 'word':", word_embedding)

Word Embedding for 'word': [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-

## Step 5: Utilization

In [6]:
# Example: query the vocabulary for the entries most similar to 'embedding'.
# most_similar returns (word, similarity score) pairs.
keyed_vectors = model.wv
similar_words = keyed_vectors.most_similar('embedding')
print("Words similar to 'embedding':", similar_words)

Words similar to 'embedding': [('preprocessing', 0.31900981068611145), ('done', 0.1747603714466095), ('gensim', 0.11928531527519226), ('necessary', 0.11117951571941376), ('relationships', 0.1088901162147522), ('training', 0.10560833662748337), ('word', 0.09291724115610123), ('models', 0.08058696985244751), ('capture', 0.07913302630186081), ('popular', 0.00484249135479331)]
