In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
from tqdm import tqdm

# ==== Load data ====
# Training corpus, read from the working directory; the preprocessing step
# further down reads its 'text' column.
df_train = pd.read_csv("train.csv")  

# ==== Preprocess text ====
def preprocess(text):
    """Normalise one raw text value into a lowercase alphanumeric string.

    The value is coerced with ``str``, lowercased, and every character other
    than ``a-z``, ``0-9`` or whitespace is removed (deleted, not replaced by a
    space). Leading and trailing whitespace is trimmed from the result.
    """
    lowered = str(text).lower()
    kept = re.sub(r'[^a-z0-9\s]', '', lowered)
    return kept.strip()

df_train['text_clean'] = df_train['text'].apply(preprocess)

# ==== Build vocab with min_count ====
# Whitespace-tokenise every cleaned row; `sentences` keeps one token list per row.
sentences = [clean.split() for clean in df_train['text_clean']]

# Flatten the corpus and count token frequencies in a single pass.
words = []
freq = {}
for tokens in sentences:
    for tok in tokens:
        words.append(tok)
        freq[tok] = freq.get(tok, 0) + 1

# Keep only tokens seen at least `min_count` times; sorting makes the
# word -> index assignment deterministic for a given corpus.
min_count = 2
vocab = sorted(tok for tok, count in freq.items() if count >= min_count)
word2idx = {tok: pos for pos, tok in enumerate(vocab)}
idx2word = dict(enumerate(vocab))
print("Vocab size:", len(vocab))

# ==== Skip-gram training pairs ====
# For every in-vocabulary token, emit one (target, context) index pair for each
# neighbour at most `window_size` positions away on either side.
window_size = 2
pairs = []
for sentence in sentences:
    # Out-of-vocabulary tokens are dropped first, so the window is measured
    # over the remaining in-vocabulary tokens only.
    ids = [word2idx[tok] for tok in sentence if tok in word2idx]
    n_ids = len(ids)
    for center, target_idx in enumerate(ids):
        lo = max(0, center - window_size)
        hi = min(n_ids, center + window_size + 1)
        pairs.extend((target_idx, ids[k]) for k in range(lo, hi) if k != center)

pairs = np.array(pairs)
print("Training pairs:", pairs.shape)

# ==== Model params ====
vocab_size = len(vocab)
embed_dim = 50      # width of each word vector
batch_size = 512
epochs = 25
num_sampled = 100   # negative classes drawn per batch by the NCE loss

# Dataset
# Column 0 of `pairs` holds target ids, column 1 the matching context ids.
target_words = tf.constant(pairs[:, 0], dtype=tf.int32)
context_words = tf.constant(pairs[:, 1], dtype=tf.int32)
dataset = tf.data.Dataset.from_tensor_slices((target_words, context_words))
# Order matters: cache() replays exactly the elements it saw on its first pass,
# so it must come BEFORE shuffle(). With shuffle -> batch -> cache, the order
# of the first epoch was frozen and every later epoch saw identical batches.
# NOTE(review): a 10000-element buffer only shuffles locally, and the pairs are
# generated sentence by sentence; consider raising it towards len(pairs) if
# memory allows.
dataset = (
    dataset
    .cache()
    .shuffle(10000, reshuffle_each_iteration=True)
    .batch(batch_size, drop_remainder=True)
    .prefetch(tf.data.AUTOTUNE)
)

# Variables: the input embedding table plus the NCE output layer
# (one weight row and one bias per vocabulary entry).
table_shape = [vocab_size, embed_dim]
embeddings = tf.Variable(tf.random.normal(table_shape, stddev=0.1))
nce_weights = tf.Variable(tf.random.normal(table_shape, stddev=0.1))
nce_biases = tf.Variable(tf.zeros([vocab_size]))

# Optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

# Checkpoint: track all three variables and the optimizer so that training
# can be resumed from the most recent save.
ckpt = tf.train.Checkpoint(
    embeddings=embeddings,
    nce_weights=nce_weights,
    nce_biases=nce_biases,
    optimizer=optimizer,
)
ckpt_manager = tf.train.CheckpointManager(ckpt, "./checkpoints", max_to_keep=8)

# Resume from the latest checkpoint in the directory when one exists.
latest_ckpt = ckpt_manager.latest_checkpoint
if not latest_ckpt:
    print("⚡ Training from scratch")
else:
    ckpt.restore(latest_ckpt)
    print(f"✅ Restored from {ckpt_manager.latest_checkpoint}")

# ==== Training step ---->
@tf.function
def train_step(x, y):
    """Run one NCE optimisation step on a batch of (target, context) ids.

    Args:
        x: batch of target-word indices, looked up in ``embeddings``.
        y: batch of context-word indices; reshaped to a single column and
            used as the NCE labels.

    Returns:
        The NCE loss averaged over the batch.
    """
    trainable = [embeddings, nce_weights, nce_biases]
    with tf.GradientTape() as tape:
        target_vecs = tf.nn.embedding_lookup(embeddings, x)
        labels = tf.reshape(y, (-1, 1))
        per_example = tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=labels,
            inputs=target_vecs,
            num_sampled=num_sampled,
            num_classes=vocab_size,
        )
        loss = tf.reduce_mean(per_example)
    # Differentiate with respect to all three variables and apply one update.
    grads = tape.gradient(loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))
    return loss

# ==== Training loop ====
# One pass over `dataset` per epoch; a checkpoint is written after every epoch.
for epoch in range(epochs):
    total_loss = 0.0
    num_steps = 0
    for x, y in tqdm(dataset, desc=f"Epoch {epoch+1}"):
        total_loss += train_step(x, y).numpy()
        num_steps += 1
    # Average over the steps that actually ran rather than len(dataset):
    # the dataset is batched with drop_remainder=True, so fewer than
    # `batch_size` pairs yields zero batches and the old division raised a
    # bare ZeroDivisionError (len() also fails on unknown cardinality).
    if num_steps == 0:
        raise RuntimeError(
            "No training batches were produced; "
            "at least `batch_size` training pairs are required."
        )
    print(f"Epoch {epoch+1}, Loss: {total_loss/num_steps:.4f}")
    ckpt_manager.save()
    print("💾 Checkpoint saved!")

print("✅ Training completed!")

# ==== Save embeddings ---->
# Store the trained matrix together with the vocabulary list; row i of
# `embeddings` belongs to vocab[i].
embedding_matrix = embeddings.numpy()
np.savez_compressed(
    "word_embeddings.npz",
    embeddings=embedding_matrix,
    vocab=np.array(vocab),
)
print("💾 Saved embeddings to 'word_embeddings.npz'")

# ==== Test embedding ---->
# Sanity check: print the first 10 components for one word, if it is in the vocab.
word = "pizza"
word_row = word2idx.get(word)
if word_row is not None:
    vec = embedding_matrix[word_row]
    print(f"Embedding cho '{word}':", vec[:10], "...")


In [None]:
import os
import pickle
import re

import numpy as np
import pandas as pd
import tensorflow as tf

# Checkpoint prefix handed to ckpt.restore() further down in this cell.
# NOTE(review): the training cell saves under "./checkpoints", whereas this
# points at "./checkpoints1" — confirm this is the intended run.
ckpt_path = './checkpoints1/ckpt-26'
# Name of the pickle file holding the vocabulary list (this cell writes 'vocab.pkl' below).
vocab_path = 'vocab.pkl' 

def preprocess(text):
    """Normalise one raw text value into a lowercase alphanumeric string.

    The value is coerced with ``str``, lowercased, and every character other
    than ``a-z``, ``0-9`` or whitespace is removed (deleted, not replaced by a
    space). Leading and trailing whitespace is trimmed from the result.
    """
    lowered = str(text).lower()
    kept = re.sub(r'[^a-z0-9\s]', '', lowered)
    return kept.strip()

# ---- Rebuild the vocabulary with the same recipe as the training cell ----
# The checkpoint holds only the embedding matrix, indexed by position in the
# sorted vocabulary, so the vocab must be rebuilt from the same train.csv with
# the same preprocessing and min_count for row i to line up with vocab[i].
df_train = pd.read_csv('train.csv')
df_train['text_clean'] = df_train['text'].apply(preprocess)
sentences = [s.split() for s in df_train['text_clean']]
words = [w for s in sentences for w in s]
min_count = 2
freq = {}
for w in words:
    freq[w] = freq.get(w, 0) + 1
vocab = sorted([w for w in set(words) if freq[w] >= min_count])

# Persist the vocabulary list; uses the configured path instead of repeating
# the file name as a literal.
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)

# Load checkpoint
embedding_dim = 50  # must equal the embedding width used at training time
ckpt = tf.train.Checkpoint(embeddings=tf.Variable(tf.random.normal([len(vocab), embedding_dim], stddev=0.1)))
status = ckpt.restore(ckpt_path)
# Fail loudly if `embeddings` was not found in the checkpoint: otherwise the
# randomly initialised matrix above would be exported as if it were trained.
status.assert_existing_objects_matched()
# Only the embeddings are wanted here; silence warnings about the checkpoint
# values that are deliberately left unrestored.
status.expect_partial()
np.savez_compressed('word_embeddings.npz', embeddings=ckpt.embeddings.numpy(), vocab=np.array(vocab))
print('✅ Đã xuất word_embeddings.npz!')
