In [1]:
!pip install torch



In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [3]:
# Read the whole training corpus into memory as one string.
# The encoding is pinned so the text decodes the same way on every platform
# instead of following the locale's default codec.
# NOTE(review): assumes corpus.txt is UTF-8 encoded - confirm.
with open('corpus.txt', 'r', encoding='utf-8') as file:
    data = file.read()

In [4]:
# Tokenise on whitespace and build the word <-> index lookup tables.
words = data.split()
# Sort the unique words so every run assigns the same index to the same word.
# Iterating a bare set of strings can change order between interpreter
# sessions, which would make the indices baked into a saved checkpoint
# point at different words after a kernel restart.
word_to_ix = {word: i for i, word in enumerate(sorted(set(words)))}
ix_to_word = {i: word for word, i in word_to_ix.items()}
vocab_size = len(word_to_ix)

In [5]:
# Encode the corpus as a list of vocabulary indices, one entry per token.
data_idx = list(map(word_to_ix.__getitem__, words))

In [6]:
# Slide a window over the encoded corpus: each entry holds `seq_length`
# context indices followed by the index of the word that comes next.
seq_length = 10
sequences = [
    data_idx[start:start + seq_length + 1]
    for start in range(len(data_idx) - seq_length)
]

In [7]:
# Stack the windows into an (n_windows, seq_length + 1) matrix.
# The dtype is pinned to int64 because the default integer width of
# np.array is platform-dependent and the loss expects int64 class targets.
# The reshape keeps the matrix 2-D even when there are no windows at all,
# so the column slicing below still works on a too-short corpus.
sequences = np.array(sequences, dtype=np.int64).reshape(-1, seq_length + 1)
# All columns but the last are the context; the last column is the target.
X = torch.from_numpy(sequences[:, :-1])
y = torch.from_numpy(sequences[:, -1])

In [8]:
class TextDataset(Dataset):
    """Map-style dataset pairing each input with its aligned target.

    Args:
        X: Indexable collection of inputs.
        y: Indexable collection of targets, aligned with ``X``.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        sample, target = self.X[idx], self.y[idx]
        return sample, target

In [9]:
# Wrap the tensors in a dataset and serve them in shuffled mini-batches of 64.
dataset = TextDataset(X=X, y=y)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [20]:
import torch.nn.functional as F

class LSTMModel(nn.Module):
    """Next-token predictor: embedding -> stacked LSTM -> dropout -> linear.

    Args:
        vocab_size: Number of distinct tokens; sizes the embedding table and
            the output layer.
        embedding_dim: Width of each token embedding.
        hidden_dim: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Probability handed both to ``nn.LSTM`` and to the
            ``nn.Dropout`` applied to the LSTM output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return next-token logits for a batch of index sequences.

        Args:
            x: Integer tensor indexed as (batch, position), since the LSTM is
                built with ``batch_first=True``.

        Returns:
            Tensor of logits with one row per batch element and
            ``vocab_size`` columns, computed from the last position only.
        """
        embeds = self.embedding(x)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self.dropout(lstm_out)
        # Only the final time step is projected onto the vocabulary.
        out = self.fc(lstm_out[:, -1, :])
        return out

    def generate_text(self, start_token, length, temperature=1.0):
        """Sample ``length`` tokens, each conditioned on the previous token only.

        Sampling runs with dropout disabled and without gradient tracking; the
        module's train/eval mode is restored afterwards.

        Args:
            start_token: Vocabulary index that seeds the generation.
            length: Number of tokens to sample after ``start_token``.
            temperature: Positive divisor applied to the logits before the
                softmax. Values below 1 sharpen the distribution, values
                above 1 flatten it.

        Returns:
            List of ``length + 1`` indices, beginning with ``start_token``.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        # Build inputs on whatever device the weights live on.
        device = next(self.parameters()).device
        generated_text = [start_token]
        current_token = start_token

        was_training = self.training
        self.eval()  # dropout would otherwise add noise to every prediction
        try:
            with torch.no_grad():
                for _ in range(length):
                    input_token = torch.tensor([[current_token]], device=device)
                    logits = self(input_token).squeeze(0)
                    probs = F.softmax(logits / temperature, dim=-1)
                    next_token = int(torch.multinomial(probs, 1).item())
                    generated_text.append(next_token)
                    current_token = next_token
        finally:
            self.train(was_training)

        return generated_text


In [21]:
# Model hyper-parameters, followed by the network built from them.
embedding_dim, hidden_dim, num_layers = 100, 150, 2
model = LSTMModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
)

In [22]:
# Cross-entropy over the predicted logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-2)
def generate_text(seed_text, next_words, model, word_to_ix, ix_to_word,
                  temperature=1.0, context_length=None):
    """Extend ``seed_text`` by sampling ``next_words`` words from ``model``.

    A rolling window of the most recent ``context_length`` words is fed to
    the model at every step. Windows shorter than that are left-padded with
    ``'<pad>'``. Padding and any other word missing from ``word_to_ix`` are
    encoded as index 0.

    Sampling runs with the model in eval mode and without gradient tracking;
    the model's previous train/eval mode is restored before returning.

    Args:
        seed_text: Whitespace-separated prompt.
        next_words: Number of words to append.
        model: ``nn.Module`` mapping a (1, context_length) index tensor to a
            (1, vocab) tensor of logits.
        word_to_ix: Mapping from word to vocabulary index.
        ix_to_word: Mapping from vocabulary index to word; indices missing
            from it are rendered as ``'<unk>'``.
        temperature: Positive divisor applied to the logits before softmax.
        context_length: Number of words fed to the model per step. Defaults
            to the module-level ``seq_length``.

    Returns:
        ``seed_text`` followed by the sampled words, separated by spaces.

    Raises:
        ValueError: If ``temperature`` is not positive or ``context_length``
            is smaller than 1.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if context_length is None:
        context_length = seq_length
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    # Build inputs on the device holding the weights (if the model has any).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    generated_text = seed_text
    # Keep only the newest words so the input length matches the window size.
    window = seed_text.split()[-context_length:]

    was_training = model.training
    model.eval()  # dropout would otherwise perturb every prediction
    try:
        for _ in range(next_words):
            pad_length = context_length - len(window)
            seed_tokens = ['<pad>'] * pad_length + window
            seed_idx = torch.tensor(
                [[word_to_ix.get(word, 0) for word in seed_tokens]],
                device=device,
            )
            with torch.no_grad():
                output = model(seed_idx)

            # Softmax instead of a bare exp(): same distribution, no overflow.
            output_dist = torch.softmax(output.squeeze(0) / temperature, dim=-1)
            word_idx = torch.multinomial(output_dist, 1).item()

            predicted_word = ix_to_word.get(word_idx, '<unk>')
            generated_text += " " + predicted_word
            # Append first, then trim: the oldest word is only dropped once
            # the window is actually full.
            window = (window + [predicted_word])[-context_length:]
    finally:
        model.train(was_training)
    return generated_text

In [None]:
# Train for a fixed number of epochs, printing a generated sample and the
# mean training loss after each one.
num_epochs = 100
for epoch in range(num_epochs):
    # The sampling step below switches to eval mode, so dropout has to be
    # re-enabled at the start of every epoch.
    model.train()
    running_loss, num_batches = 0.0, 0
    for inputs, labels in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1

    # Draw the progress sample with dropout disabled.
    model.eval()
    seed_text = "Người dân Quỳnh Đôi không đồng tình ghép tên với xã"
    generated_text = generate_text(seed_text, 20, model, word_to_ix, ix_to_word, temperature=1)
    print(generated_text)
    # Report the epoch mean rather than the last mini-batch, which is noisy
    # under shuffling and undefined when the loader yields no batches.
    epoch_loss = running_loss / max(num_batches, 1)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

In [None]:
def generate_text(seed_text, next_words, model, word_to_ix, ix_to_word,
                  temperature=1.0, context_length=None):
    """Extend ``seed_text`` by sampling ``next_words`` words from ``model``.

    A rolling window of the most recent ``context_length`` words is fed to
    the model at every step. Windows shorter than that are left-padded with
    ``'<pad>'``. Padding and any other word missing from ``word_to_ix`` are
    encoded as index 0.

    Sampling runs with the model in eval mode and without gradient tracking;
    the model's previous train/eval mode is restored before returning.

    Args:
        seed_text: Whitespace-separated prompt.
        next_words: Number of words to append.
        model: ``nn.Module`` mapping a (1, context_length) index tensor to a
            (1, vocab) tensor of logits.
        word_to_ix: Mapping from word to vocabulary index.
        ix_to_word: Mapping from vocabulary index to word; indices missing
            from it are rendered as ``'<unk>'``.
        temperature: Positive divisor applied to the logits before softmax.
        context_length: Number of words fed to the model per step. Defaults
            to the module-level ``seq_length``.

    Returns:
        ``seed_text`` followed by the sampled words, separated by spaces.

    Raises:
        ValueError: If ``temperature`` is not positive or ``context_length``
            is smaller than 1.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if context_length is None:
        context_length = seq_length
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    # Build inputs on the device holding the weights (if the model has any).
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    generated_text = seed_text
    # Keep only the newest words so the input length matches the window size.
    window = seed_text.split()[-context_length:]

    was_training = model.training
    model.eval()  # dropout would otherwise perturb every prediction
    try:
        for _ in range(next_words):
            pad_length = context_length - len(window)
            seed_tokens = ['<pad>'] * pad_length + window
            seed_idx = torch.tensor(
                [[word_to_ix.get(word, 0) for word in seed_tokens]],
                device=device,
            )
            with torch.no_grad():
                output = model(seed_idx)

            # Softmax instead of a bare exp(): same distribution, no overflow.
            output_dist = torch.softmax(output.squeeze(0) / temperature, dim=-1)
            word_idx = torch.multinomial(output_dist, 1).item()

            predicted_word = ix_to_word.get(word_idx, '<unk>')
            generated_text += " " + predicted_word
            # Append first, then trim: the oldest word is only dropped once
            # the window is actually full.
            window = (window + [predicted_word])[-context_length:]
    finally:
        model.train(was_training)
    return generated_text

In [16]:
# Continue a one-word prompt by 20 sampled words and show the result.
seed_text = "AI"
generated_text = generate_text(
    seed_text,
    next_words=20,
    model=model,
    word_to_ix=word_to_ix,
    ix_to_word=ix_to_word,
    temperature=1,
)
print(generated_text)

AI Chưa bên 1 triệu đồng đối với nhiệm vụ án đầu Quảng Ninh Bình Dương, cuộc họp bình nhất, hành


In [17]:
# Persist only the model's state dict (not the module object) to disk.
model_path = "lstm_model.pth"
torch.save(obj=model.state_dict(), f=model_path)

In [None]:
# Load the model: rebuild the architecture with the same hyper-parameters,
# then restore the saved weights into it.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, num_layers)
# map_location pins the tensors to the CPU, so loading does not depend on
# the device the checkpoint was saved from.
# NOTE(review): torch.load unpickles the file - only load checkpoints you
# trust; consider weights_only=True if the installed PyTorch supports it.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
# Switch to inference mode (disables dropout).
model.eval()
print("Model loaded successfully.")
