# Import Libraries

In [5]:
import json
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

Collecting matplotlib
  Downloading matplotlib-3.10.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.1-cp312-cp312-win_amd64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.55.3-cp312-cp312-win_amd64.whl.metadata (168 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.7-cp312-cp312-win_amd64.whl.metadata (6.4 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.0.0-cp312-cp312-win_amd64.whl.metadata (9.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.10.0-cp312-cp312-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   -


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Configure

In [6]:
# Paths are relative; they resolve against the kernel's working directory.
# JSON file with the preprocessed documents (read by the "Load data" cell).
input_data = '../data/processed_data_lite.json'
# JSON file with the vocabulary (used below as a word -> index mapping).
input_vocab = '../data/vocab_lite.json'
# Prefix handed to SkipGram.save(), which appends "_W1.npy" / "_W2.npy".
output_model_prefix = '../models/model_lite'

embedding_dim = 100  # Size of the embedding vectors

# Utils

In [7]:
# Build the Skip-gram (center, context) training pairs
def skipgram_pairs(corpus, window_size=2):
    """
    Generate (center_word, context_word) pairs for every sentence in `corpus`.

    corpus: Iterable of strings; each one is split on whitespace into tokens.
    window_size: Maximum number of tokens taken on each side of the center word.

    Returns a list of (center, context) tuples. For each center word the
    left-hand neighbours come first, then the right-hand ones.
    """
    pairs = []
    for sentence in corpus:
        tokens = sentence.split()
        for pos in range(len(tokens)):
            # Clamp the left edge at 0 so a negative start never wraps around.
            left = tokens[max(pos - window_size, 0):pos]
            right = tokens[pos + 1:pos + window_size + 1]
            pairs.extend((tokens[pos], neighbour) for neighbour in left + right)
    return pairs

In [8]:
class SkipGram:
    """
    Skip-gram word-embedding model with a full-softmax output layer (NumPy only).

    W1 (|V| x d) holds the center-word embeddings and W2 (d x |V|) the output
    projection. Training minimises the negative log-likelihood of the context
    word given the center word with mini-batch gradient descent; gradients are
    summed (not averaged) over each batch.
    """

    def __init__(self, vocab_size, embedding_dim, learning_rate=0.01):
        """
        vocab_size: Number of words in the vocabulary (|V|).
        embedding_dim: Dimensionality of the embedding vectors (d).
        learning_rate: Step size used for the gradient updates.
        """
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.learning_rate = learning_rate
        self.losses = []  # average loss per epoch, appended by train()

        # Initialise the weights with Xavier (Glorot) uniform initialisation
        limit = np.sqrt(6 / (vocab_size + embedding_dim))
        self.W1 = np.random.uniform(-limit, limit, (vocab_size, embedding_dim))  # |V| x d
        self.W2 = np.random.uniform(-limit, limit, (embedding_dim, vocab_size))  # d x |V|

    def softmax(self, x):
        """
        Softmax over the last axis of `x`.
        """
        # Subtract the row-wise max before exponentiating to avoid overflow.
        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e_x / e_x.sum(axis=-1, keepdims=True)

    def forward(self, center_idx):
        """
        Forward pass to compute probabilities of context words.
        center_idx: Indices of the center words (batch_size).

        Returns (probabilities, center_vectors) with shapes
        (batch_size x |V|) and (batch_size x d).
        """
        center_vectors = self.W1[center_idx]  # batch_size x d
        logits = np.dot(center_vectors, self.W2)  # batch_size x |V|
        probabilities = self.softmax(logits)  # batch_size x |V|
        return probabilities, center_vectors

    def backward(self, probabilities, center_idx, context_idx):
        """
        Backward pass to compute gradients and update weights for a batch.
        probabilities: Softmax probabilities (batch_size x |V|).
        center_idx: Indices of the center words (batch_size).
        context_idx: Indices of the context words (batch_size).

        Updates W1 and W2 in place; returns nothing.
        """
        batch_size = len(center_idx)

        # One-hot encoding for context words
        targets = np.zeros_like(probabilities)  # batch_size x |V|
        targets[np.arange(batch_size), context_idx] = 1

        # Error between predicted probabilities and target
        error = probabilities - targets  # batch_size x |V|

        # Gradients for W2 and W1 (both computed before either matrix is updated)
        grad_W2 = np.dot(self.W1[center_idx].T, error)  # d x |V|
        grad_W1 = np.dot(error, self.W2.T)  # batch_size x d

        # A center word can occur several times in one batch. Fancy-index
        # assignment would keep only one of the updates, so the per-row
        # gradients are summed per unique index first.
        unique_idx, inverse_idx = np.unique(center_idx, return_inverse=True)
        grouped_gradients = np.zeros((len(unique_idx), self.embedding_dim))
        np.add.at(grouped_gradients, inverse_idx, grad_W1)

        # Update W1 (only the rows that appeared in the batch) and W2
        self.W1[unique_idx] -= self.learning_rate * grouped_gradients
        self.W2 -= self.learning_rate * grad_W2

    def train(self, word_pairs, vocab, epochs=10, batch_size=64):
        """
        Train the Skip-gram model.
        word_pairs: List of (center_word, context_word) tuples. It is not
            modified; shuffling is done on an index permutation.
        vocab: Vocabulary mapping word to index.
        epochs: Number of passes over `word_pairs`.
        batch_size: Number of pairs per gradient step.

        Appends the average per-pair loss of every epoch to `self.losses`.

        Raises:
            ValueError: if `word_pairs` is empty or `batch_size` < 1.
            KeyError: if a word in `word_pairs` is missing from `vocab`.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        num_pairs = len(word_pairs)
        if num_pairs == 0:
            raise ValueError("word_pairs is empty; nothing to train on")

        print("Training Skip-gram model...")

        # Resolve words to indices once instead of once per epoch.
        all_center = np.array([vocab[center] for center, _ in word_pairs])
        all_context = np.array([vocab[context] for _, context in word_pairs])

        # Smallest positive normal float: keeps log() finite if a softmax
        # probability underflows to exactly 0.
        tiny = np.finfo(self.W1.dtype).tiny

        for epoch in range(epochs):
            total_loss = 0
            # Fresh random order each epoch, without touching the caller's list.
            order = np.random.permutation(num_pairs)

            for start in tqdm(range(0, num_pairs, batch_size), desc=f"Epoch {epoch+1}/{epochs}"):
                # Extract indices for batch
                batch = order[start:start + batch_size]
                center_idx = all_center[batch]
                context_idx = all_context[batch]

                # Forward pass
                probabilities, _ = self.forward(center_idx)

                # Compute loss (negative log likelihood)
                picked = probabilities[np.arange(len(context_idx)), context_idx]
                total_loss += -np.sum(np.log(np.maximum(picked, tiny)))

                # Backward pass
                self.backward(probabilities, center_idx, context_idx)

            avg_loss = total_loss / num_pairs
            self.losses.append(avg_loss)
            print(f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_loss:.4f}")

    def get_embedding(self, word, vocab):
        """
        Retrieve the embedding vector for a given word.

        Raises KeyError if `word` is not in `vocab`.
        """
        return self.W1[vocab[word]]

    def cosine_similarity(self, word1, word2, vocab):
        """
        Compute cosine similarity between two words.

        Returns 0.0 when either embedding has zero norm, because the cosine
        is undefined there and the plain formula would yield NaN.
        """
        vec1 = self.get_embedding(word1, vocab)
        vec2 = self.get_embedding(word2, vocab)

        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0
        return np.dot(vec1, vec2) / norm_product

    def save(self, path_prefix):
        """
        Write W1 and W2 to `<path_prefix>_W1.npy` and `<path_prefix>_W2.npy`.
        """
        np.save(f"{path_prefix}_W1.npy", self.W1)
        np.save(f"{path_prefix}_W2.npy", self.W2)
        print(f"Model saved to {path_prefix}_W1.npy and {path_prefix}_W2.npy")

    @classmethod
    def load(cls, path_prefix, vocab_size, embedding_dim, learning_rate=0.01):
        """
        Build a model and replace its weights with the arrays stored by save().

        The loaded arrays are not checked against `vocab_size` /
        `embedding_dim`; the caller must pass values matching the saved model.
        """
        model = cls(vocab_size, embedding_dim, learning_rate)
        # The randomly initialised weights from __init__ are overwritten here.
        model.W1 = np.load(f"{path_prefix}_W1.npy")
        model.W2 = np.load(f"{path_prefix}_W2.npy")
        print(f"Model loaded from {path_prefix}_W1.npy and {path_prefix}_W2.npy")
        return model


In [9]:
def cosine_similarity(vec1, vec2):
    """
    Cosine similarity between two vectors.

    vec1, vec2: Array-likes of equal length.

    Returns dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either
    vector has zero norm (the cosine is undefined there and the plain
    formula would produce NaN plus a runtime warning).
    """
    # Dot product of vec1 and vec2
    dot_product = np.dot(vec1, vec2)

    # Product of the two Euclidean lengths
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0

    return dot_product / norm_product

def word_similarity(word1, word2, vocab, embedding_matrix):
    """
    Cosine similarity between the embeddings of two words.

    word1, word2: Words to compare; both must be keys of `vocab`.
    vocab: Mapping from word to row index in `embedding_matrix`.
    embedding_matrix: Indexable collection of embedding vectors, one per row.
    """
    first_vec, second_vec = (embedding_matrix[vocab[w]] for w in (word1, word2))
    return cosine_similarity(first_vec, second_vec)


# Main

In [10]:
# Load the preprocessed documents and the vocabulary.
# The encoding is given explicitly: without it open() uses the platform's
# locale encoding, which on Windows is typically not UTF-8 and can fail on
# or corrupt Vietnamese text. (Assumes the files are UTF-8, the standard
# JSON encoding; confirm against the preprocessing step.)
with open(input_data, 'r', encoding='utf-8') as f:
    data = json.load(f)

with open(input_vocab, 'r', encoding='utf-8') as f:
    vocab = json.load(f)

In [11]:
# Step 1: one list of (center, context) pairs per document, context window of 4.
pairs_per_doc = []
for doc in tqdm(data):
    pairs_per_doc.append(skipgram_pairs(doc['contents'], 4))

# Step 2: flatten the per-document lists into a single training list.
word_pairs = []
for doc_pairs in tqdm(pairs_per_doc):
    word_pairs.extend(doc_pairs)

# Drop the intermediate names so only `word_pairs` is left in the namespace.
del pairs_per_doc, doc_pairs

100%|██████████| 2964/2964 [00:00<00:00, 3496.73it/s]
100%|██████████| 2964/2964 [00:00<00:00, 16084.83it/s]


In [13]:
# Build a fresh model sized to the vocabulary, then train it on the pairs.
# Weights start from unseeded random values, so results differ between runs.
model = SkipGram(len(vocab), embedding_dim, 0.01)
model.train(word_pairs=word_pairs, vocab=vocab, epochs=13, batch_size=1024)

Training Skip-gram model...


Epoch 1/13:   0%|          | 7/5892 [00:04<58:15,  1.68it/s]


KeyboardInterrupt: 

In [10]:
# Spot-check the learned embeddings on a few hand-picked word pairs.
for word1, word2 in [
    ('ông', 'cha'),
    ('ông', 'mẹ'),
    ('giáo_viên', 'buồn_bã'),
    ('hạnh_phúc', 'vua'),
    ('học', 'giáo_dục'),
    ('anh', 'chị'),
]:
    similarity = model.cosine_similarity(word1, word2, vocab)
    print(f'Similarity between "{word1}" and "{word2}":', similarity)

Similarity between "ông" and "cha": 0.4748734326576516
Similarity between "ông" and "mẹ": 0.28641870964357563
Similarity between "giáo_viên" and "buồn_bã": 0.1674076263902389
Similarity between "hạnh_phúc" and "vua": 0.13152055262311524
Similarity between "học" and "giáo_dục": 0.39925688612246435
Similarity between "anh" and "chị": 0.7177625838393557


In [2]:
# Persist the trained weights as <output_model_prefix>_W1.npy / _W2.npy.
# Needs `model` from the training cell to exist in the current kernel; the
# recorded NameError comes from running this cell before `model` was defined.
model.save(output_model_prefix)

NameError: name 'model' is not defined