In [None]:
import gensim
from gensim.models import Word2Vec
import logging
import multiprocessing
import numpy as np
import json
import pickle

In [None]:
# Configure the root logger once for the whole notebook: each record is shown as
# "<timestamp> : <level> : <message>" and everything at INFO level or above is emitted.
# Libraries that report progress through `logging` will therefore print to cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
def read_corpus(file_path):
    """Load a tokenised corpus from a UTF-8 text file.

    Each line is lower-cased and split on whitespace; every line that yields
    at least one token becomes one sentence. Blank and whitespace-only lines
    are dropped.

    Parameters:
    - file_path: Path of the text file to read.

    Returns:
    - List of sentences, each sentence being a list of lower-cased tokens.
    """
    with open(file_path, 'r', encoding='utf-8') as corpus:
        # split() with no separator already ignores leading/trailing
        # whitespace, so a blank line simply produces an empty list.
        tokenised_lines = (raw_line.lower().split() for raw_line in corpus)
        return [words for words in tokenised_lines if words]

In [None]:
def train_word2vec_model(sentences, output_model_path, negative=0):
    """Train a Word2Vec model using CBOW with hierarchical softmax.

    Parameters:
    - sentences: Restartable iterable of token lists. It is iterated more than
      once (vocabulary pass, then every training epoch), so a list or a
      re-iterable corpus object is required; it does not need to support len().
    - output_model_path: Path the full model is saved to via ``model.save``.
    - negative: Number of negative samples. Defaults to 0 so that training uses
      hierarchical softmax only. gensim's own default is 5 and is independent
      of ``hs``; leaving it unset would silently train a hybrid
      hierarchical-softmax + negative-sampling model. Pass 5 to get that
      hybrid behaviour back.

    Returns:
    - The trained Word2Vec model.
    """
    cores = multiprocessing.cpu_count()

    model = Word2Vec(
        vector_size=300,
        window=5,
        min_count=5,
        sg=0,                 # CBOW model
        hs=1,                 # Use hierarchical softmax
        negative=negative,    # 0 disables negative sampling (pure HS)
        workers=cores,
        alpha=0.025,
        min_alpha=0.0001,
        sample=1e-5
    )

    print("Building vocabulary...")
    model.build_vocab(sentences)

    # Report the count gathered during the vocabulary pass rather than
    # len(sentences), so corpus iterables without __len__ are accepted too.
    print(f"Training model on {model.corpus_count} sentences...")
    model.train(
        sentences,
        total_examples=model.corpus_count,
        epochs=10
    )

    # Save the full model
    model.save(output_model_path)
    print(f"Full model saved to {output_model_path}")

    return model

In [None]:
def save_embeddings_word2vec_format(model, output_path):
    """
    Export the model's word vectors in Word2Vec format, twice.

    The vectors are first written as text to ``output_path`` and then as
    binary to ``output_path + '.bin'``; a confirmation line is printed after
    each export.

    Parameters:
    - model: Object whose ``wv`` attribute provides ``save_word2vec_format``
    - output_path: Destination of the text export; the binary export path is
      derived from it by appending '.bin'
    """
    keyed_vectors = model.wv

    # Human-readable text export.
    keyed_vectors.save_word2vec_format(output_path, binary=False)
    print(f"Embeddings saved in Word2Vec text format to {output_path}")

    # Binary export next to the text file.
    binary_path = output_path + '.bin'
    keyed_vectors.save_word2vec_format(binary_path, binary=True)
    print(f"Embeddings saved in Word2Vec binary format to {binary_path}")

In [None]:
# Path of the plain-text training corpus handed to read_corpus(); relative to the
# notebook's working directory.
corpus_file = "te.txt"

In [None]:
# Destination path passed to train_word2vec_model(), which saves the full model there.
model_file = "word2vec_cbow_hs_model.model"

In [None]:
# NOTE(review): not referenced by any cell visible here; presumably intended as the
# output path for save_embeddings_word2vec_format() — confirm, or remove if unused.
embeddings_prefix = "word_embeddings"

In [None]:
# Read the entire corpus into memory as a list of token lists (one entry per non-empty line).
sentences = read_corpus(corpus_file)

In [None]:
# Sanity check: number of sentences that were loaded.
len(sentences)

In [None]:
# Sanity check: inspect the tokens of the first sentence (raises IndexError if the corpus was empty).
sentences[0]

In [None]:
# Train on the in-memory corpus and save the result to model_file; the returned
# model is kept for the analogy query further down.
model = train_word2vec_model(sentences, model_file)

In [None]:
def perform_word_analogy(model, word1, word2, word3, topn=5):
    """
    Perform word analogy: word1 is to word2 as word3 is to X
    Example: man is to king as woman is to X (queen)

    Parameters:
    - model: Trained Word2Vec model (anything exposing ``wv.key_to_index`` and
      ``wv.most_similar``)
    - word1, word2, word3: Words for the analogy
    - topn: Number of candidate answers requested from ``most_similar``
      (default 5)

    Returns:
    - List of tuples containing the most similar words and their similarity
      scores, or an empty list when any of the three words is not in the
      vocabulary (a warning is printed for every missing word)
    """
    vocabulary = model.wv.key_to_index

    # Collect every out-of-vocabulary word (deduplicated, input order kept) so
    # the caller sees all problems at once instead of only the first one.
    missing = [
        word
        for word in dict.fromkeys((word1, word2, word3))
        if word not in vocabulary
    ]
    if missing:
        for word in missing:
            print(f"Warning: '{word}' not in vocabulary")
        return []

    # All three words are known at this point, so no KeyError handling is
    # needed around the lookup: word2 - word1 + word3.
    return model.wv.most_similar(
        positive=[word2, word3], negative=[word1], topn=topn
    )


In [None]:
# Example query on the trained model using Telugu words; presumably the Telugu
# counterpart of "man : king :: woman : ?" — confirm the glosses. An empty
# list means at least one of the three words is not in the model's vocabulary.
perform_word_analogy(model, "పురుషుడు", "రాజు", "స్త్రీ")

In [None]:
# NOTE(review): bare literal whose only effect is displaying 4 — looks like a
# leftover scratch cell; consider deleting it.
4