In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Create a list of special characters and numbers
special_characters_numbers = ['0', '1', '2', '3', '4','5', '6', '7', '8', '9',
                              '!', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '_', '+', '=', '{', '}', '[', ']',
                              '|', '\\', ':', ';', '"', '\'', '<', '>', ',', '.', '/', '?']

# Create a list of Vietnamese characters
vietnamese_characters = ['a', 'á', 'à', 'ả', 'ã', 'ạ', 'ă', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'â', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ',
                         'b', 'c', 'd', 'đ', 'e', 'é', 'è', 'ẻ', 'ẽ', 'ẹ', 'ê', 'ế', 'ề', 'ể', 'ễ', 'ệ', 'g', 'h',
                         'i', 'í', 'ì', 'ỉ', 'ĩ', 'ị', 'k', 'l', 'm', 'n', 'o', 'ó', 'ò', 'ỏ', 'õ', 'ọ', 'ô', 'ố', 'ồ',
                         'ổ', 'ỗ', 'ộ', 'ơ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'p', 'q', 'r', 's', 't', 'u', 'ú', 'ù', 'ủ', 'ũ',
                         'ụ', 'ư', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'v', 'x', 'y', 'ý', 'ỳ', 'ỷ', 'ỹ', 'ỵ']

def remove_non_vietnamese_characters(text):
    # Remove characters not in the valid Vietnamese characters list
    cleaned_text = ''.join(char for char in text if char in vietnamese_characters)
    return cleaned_text

def remove_non_vietnamese_characters_return_specials(text):
    # Remove characters not in the valid Vietnamese characters list
    cleaned_text = ''.join(char for char in text if char in vietnamese_characters + ['-'])
    special_chars = ''.join(char for char in text if char not in vietnamese_characters + ['-'])
    return [cleaned_text, special_chars]

In [5]:
with open('/content/truyen_kieu.txt') as f:
  content = f.readlines()
content = [line.strip('0123456789. \n').lower() for line in content]
content = ' '.join(content)
content = content.split(' ')

words = [remove_non_vietnamese_characters(word) for word in content]

In [6]:
def read_truyen_ngan(file_path):
  with open('/content/tat_den.txt') as f:
    content = f.readlines()

  content = ' '.join(content).lower()
  content = content.split(' ')

  # Dùng map để áp dụng remove_non_vietnamese_characters_return_specials lên tất cả các từ
  words2 = list(map(remove_non_vietnamese_characters_return_specials, content))

  # Convert the list of lists to a flat list
  words2 = [word for sublist in words2 for word in sublist]

  # Dùng list comprehension để loại bỏ những từ rỗng
  words2 = [word for word in words2 if word != '']

  return words2

In [7]:
truyen_ngan_paths = ['/content/tho_ho_xuan_huong.txt',
                     '/content/top_bai_tho.txt',
                     '/content/lao_hac.txt',
                     '/content/tat_den.txt']

In [8]:
for truyen_ngan in truyen_ngan_paths:
  words2 = read_truyen_ngan(truyen_ngan)
  words = words + words2

In [9]:
words = words + vietnamese_characters
words = words + special_characters_numbers
words = words + [' ', '\n']

words = [word for word in words if word != '']

In [10]:
len(words)

192985

In [11]:
global words_unique
words_unique = sorted(list(set(words))) # Unique words in the dataset
word_to_int = dict((c, i) for i, c in enumerate(words_unique)) # A dictionary to map unique words to integers. When we call enumerate function, we will get something like this [(0, 'A'), (1, 'B'), (2, 'C'), (3, 'D')]
int_to_word = dict((i, c) for c, i in word_to_int.items()) # A dictionary to transform integers back to characters. This is just reversing of char_to_int.

In [12]:
n_words = len(words)
n_vocab = len(words_unique)
dataX = []
dataY = []
for i in range(0, n_words - 30, 1):
    seq_in = words[i:i + 30]
    seq_out = words[i + 30]
    dataX.append([word_to_int[word] for word in seq_in])
    dataY.append(word_to_int[seq_out])

In [13]:
n_sequence = len(dataX)
X = torch.tensor(dataX, dtype=torch.float32).reshape(n_sequence, 30, 1)
X = X / float(n_vocab)
y = torch.tensor(dataY)

In [14]:
import pickle

# Save list and dictionary to file
with open('/content/myTokenizer.pkl', 'wb') as file:
    pickle.dump(words_unique, file)
    pickle.dump(word_to_int, file)
    pickle.dump(int_to_word, file)
    pickle.dump(n_vocab, file)

In [15]:
class LanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(input_size=1, hidden_size=256, num_layers=1, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.linear = nn.Linear(256, n_vocab)
    def forward(self, x):
        x, _ = self.lstm(x)
        x = x[:, -1, :] # X is the output of LSTM, it will have this shape [batch_size, seq_len, hidden_size]. We only need to get the last value of the output sequence so we use -1.
        x = self.dropout(x)
        x = self.linear(x) # Predicting one of the 46 characters in the vocab
        return x

In [16]:
model = LanguageModel().to(device)
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss()
loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=128)
model.train()
for epoch in range(1000): # This will take a while
    print("Running Epoch %d ..." % epoch)
    for X_batch, y_batch in loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        loss = loss_fn(y_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
print("Finished training")

Running Epoch 0 ...
Running Epoch 1 ...
Running Epoch 2 ...
Running Epoch 3 ...
Running Epoch 4 ...
Running Epoch 5 ...
Running Epoch 6 ...
Running Epoch 7 ...
Running Epoch 8 ...
Running Epoch 9 ...
Running Epoch 10 ...
Running Epoch 11 ...
Running Epoch 12 ...
Running Epoch 13 ...
Running Epoch 14 ...
Running Epoch 15 ...
Running Epoch 16 ...
Running Epoch 17 ...
Running Epoch 18 ...
Running Epoch 19 ...
Running Epoch 20 ...
Running Epoch 21 ...
Running Epoch 22 ...
Running Epoch 23 ...
Running Epoch 24 ...
Running Epoch 25 ...
Running Epoch 26 ...
Running Epoch 27 ...
Running Epoch 28 ...
Running Epoch 29 ...
Running Epoch 30 ...
Running Epoch 31 ...
Running Epoch 32 ...
Running Epoch 33 ...
Running Epoch 34 ...
Running Epoch 35 ...
Running Epoch 36 ...
Running Epoch 37 ...
Running Epoch 38 ...
Running Epoch 39 ...
Running Epoch 40 ...
Running Epoch 41 ...
Running Epoch 42 ...
Running Epoch 43 ...
Running Epoch 44 ...
Running Epoch 45 ...
Running Epoch 46 ...
Running Epoch 47 ...
Ru

In [17]:
import torch
import torch.nn as nn

# Assuming you have already prepared your evaluation data in the correct format

def evaluate(model, data_loader, loss_function):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    total_correct = 0

    with torch.no_grad():
        for inputs, targets in data_loader:
            # Move inputs and targets to the device (CPU or GPU)
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(inputs)

            # Calculate the loss
            loss = loss_function(outputs, targets)
            total_loss += loss.item()

            # Calculate the number of correct predictions (assuming you are doing a classification task)
            _, predicted_indices = torch.max(outputs, 1)
            total_correct += (predicted_indices == targets).sum().item()

    # Calculate the average loss and accuracy
    average_loss = total_loss / len(data_loader.dataset)
    accuracy = total_correct / len(data_loader.dataset)

    return average_loss, accuracy

# Assuming you have already created your LanguageModel instance and loaded model parameters

# Set your loss function (e.g., CrossEntropyLoss for classification tasks)
loss_function = nn.CrossEntropyLoss()

# Assuming you have your evaluation data in a DataLoader

eval_data_loader = data.DataLoader(data.TensorDataset(X, y), shuffle=True, batch_size=128)  # Replace ... with your evaluation data loader


In [18]:
# Evaluate the model
average_loss, accuracy = evaluate(model, eval_data_loader, loss_function)

print(f"1. Average Loss: {average_loss:.4f}, Accuracy: {accuracy:.4f}")

1. Average Loss: 0.0037, Accuracy: 0.9082


In [19]:
def encode_promt(prompt):
  prompt = prompt.split(' ')
  prompt_processed = []
  for word in prompt:
    if word in words_unique:
      prompt_processed.append(word)
    else:
      for c in word:
        if c in words_unique:
          prompt_processed.append(c)
    prompt_processed.append(' ')
  prompt_processed = [word for word in prompt_processed if word != '']
  return prompt_processed

In [20]:
def predict(prompt):
    prompt = encode_promt(prompt)
    sequence = [word_to_int[word] for word in prompt]
    answer = ''
    with torch.no_grad():
        for i in range(len(sequence)*3):
            x = np.reshape(sequence, (1, len(sequence), 1)) / float(n_vocab) # Reshape and normalize
            x = torch.tensor(x, dtype=torch.float32).to(device)
            prediction = model(x)
            index = int(prediction.argmax()) # Predict an array of n_vocab integers
            answer += ' ' + int_to_word[index]
            sequence.append(index) # Append the predicted integer into the current sequence
            sequence = sequence[1:] # Remove the first integer from the sequence
    return ' '.join([int_to_word[i] for i in sequence]), answer # Convert all the integers into characters

In [21]:
predict("""Cảo thơm lần giở trước đèn. Phong tình có lục còn truyền sử xanh.""")[1]

' thế đã bà mày mày có với , - bà vẫn con vẫn với nhà . ông ð con con con nó con chưa giờ mới khóc ấy được làm . , không sen sộ cho - không nói câm sao cho đấy , ấy đây hãy ông , rồi nghị , lại được tha làm vào vào cho một , chị dậu tôi trộm bước cái ông giữ con lệ , bị làm lấy cái vào ý chồng lắm buồng - vâng làm ông cụ nhà đấy , rồi con chó ra cho chồng .\n rồi nó lo chị dậu tiền , quan ở để cùng lí nhà anh xong , ra thêm ra sao hay'

In [22]:
# Using this, you do not need to define the model class again:

model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save('/content/myModel.pt') # Save
# Load
#model = torch.jit.load('model_scripted.pt')
#model.eval()

In [23]:
from google.colab import files
files.download('/content/myModel.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
!pip install gradio
import gradio as gr

model.eval() # Change to evaluation mode because we don't want Dropout Layer to automatically drop Neural Network nodes when we are making prediction
def text_generation(prompt):
    return predict(prompt)[1]
gr.Interface(fn=text_generation, inputs=["text"], outputs=["text"]).launch(share=True)

Collecting gradio
  Downloading gradio-3.39.0-py3-none-any.whl (19.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.9/19.9 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.1.0-py3-none-any.whl (14 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.100.1-py3-none-any.whl (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.8/65.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client>=0.3.0 (from gradio)
  Downloading gradio_client-0.3.0-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.2/294.2 kB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



In [26]:
from google.colab import drive
drive.mount('/content/drive')


import shutil

# File paths
colab_file_path = '/content/myModel.pt'
drive_file_path = '/content/drive/MyDrive/AI FPT/model_LSTM_for_5_tac_pham_Truyen_Kieu_Tat_den_HXH_Tho_Lao_Hac.pt'

# Copy the file
shutil.copyfile(colab_file_path, drive_file_path)


MessageError: ignored