##### **Download library**

In [2]:
!pip install pyvi

Collecting pyvi
  Downloading pyvi-0.1.1-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting sklearn-crfsuite (from pyvi)
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite->pyvi)
  Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading pyvi-0.1.1-py2.py3-none-any.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m51.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite, pyvi
Successfully installed python-crfsuite-0.9.11 pyvi-0.1.1 sklearn-crfsuite-0.5.0


##### **Import Library**

In [64]:
import nltk
from collections import Counter
import random
from pyvi import ViTokenizer
import requests
from bs4 import BeautifulSoup
import re
import math

nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

##### **Crawl Data**

In [39]:
def crawl_tuoitre_rss(rss_url="https://tuoitre.vn/rss/thoi-su.rss", \
                      output_file="tuoitre.txt"):
    headers = {"User-Agent": "Mozilla/5.0"}
    all_texts = []

    # Load RSS feed (XML)
    r = requests.get(rss_url, headers=headers)
    soup = BeautifulSoup(r.text, "xml")

    # Get link from tag <item><link>
    links = [item.find("link").text for item in soup.find_all("item")]
    print("Found", len(links), "paper from RSS")

    # Get content in each link
    for link in links:
        try:
            art = requests.get(link, headers=headers)
            soup_art = BeautifulSoup(art.text, "html.parser")

            # Get content
            paragraphs = soup_art.select("div.detail-content p")
            # print(paragraphs)
            content = " ".join(p.get_text() for p in paragraphs)
            # print(content)

            if content.strip():
                all_texts.append(content)
        except Exception as e:
            print("Error:", e)

    # Save into .txt
    with open(output_file, "w", encoding="utf-8") as f:
        for line in all_texts:
            f.write(line + "\n")

    print(f"Crawl {len(all_texts)}, saved at {output_file}")

In [36]:
crawl_tuoitre_rss("https://tuoitre.vn/rss/thoi-su.rss", "tuoitre.txt")

Found 50 paper from RSS
Crawl 50, saved at tuoitre.txt


##### **Data Preprocessing**

In [40]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)
    text = re.sub(r"[^0-9a-zA-ZÀ-Ỹà-ỹ\s\.,!?]", " ", text)
    return text.strip()

In [42]:
with open("tuoitre.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = clean_text(raw_text)

with open("tuoitre_clean.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

##### **Model**

###### Defination

In [74]:
class NgramModel:
    def __init__(self, n, alpha=0.01):
        self.n = n
        self.ngram_counts = Counter()
        self.context_counts = Counter()
        self.vocab = set()
        self.alpha = alpha


    '''
    Tokenize text function
    '''
    def tokenize_text(self, text):
        # separate sentences
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        # tokenize
        tokens = [ViTokenizer.tokenize(sent).split() for sent in sentences]
        # Add start-of-sentence (<s>) and end-of-sentence (</s>) tokens
        # to each sentence for proper n-gram training
        tokens = [["<s>"] + t + ["</s>"] for t in tokens]
        return tokens


    '''
    Tokenize sentence function
    '''
    def tokenize_sentence(self, sentence):
        return ["<s>"] + ViTokenizer.tokenize(sentence.lower()).split() + ["</s>"]


    '''
    Model training function
    '''
    def train(self, file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read().lower()

        tokens = self.tokenize_text(text)

        for sent in tokens:
            self.vocab.update(sent)
            for i in range(len(sent)-self.n+1):
                ngram = tuple(sent[i:i+self.n])
                context = ngram[:-1]
                self.ngram_counts[ngram] += 1
                self.context_counts[context] += 1


    '''
    Probability n-gram
    '''
    def prob(self, ngram):
        context = ngram[:-1]
        count_ngram = self.ngram_counts[ngram]
        count_context = self.context_counts[context]
        V = len(self.vocab)
        return (count_ngram + self.alpha) / (count_context + V * self.alpha)



    '''
    Predict the next word
    '''
    def predict_next(self, context, top_k=5):
        if isinstance(context, str):
            context = ViTokenizer.tokenize(context.lower()).split()
        context = tuple(context[-(self.n-1):])
        candidates = {w: self.prob(context + (w,)) for w in self.vocab}
        sorted_candidates = sorted(candidates.items(),
                                   key=lambda x: x[1], reverse=True)
        return sorted_candidates[:top_k]


    '''
    Probability of a sentence
    '''
    def sentence_prob(self, sentence):
        tokens = self.tokenize_sentence(sentence)
        log_prob = 0.0
        for i in range(len(tokens)-self.n+1):
            ngram = tuple(tokens[i:i+self.n])
            log_prob += math.log(self.prob(ngram))
        return math.exp(log_prob)



###### Training

In [75]:
# Train bigram model
lm2 = NgramModel(n=2)
lm2.train("tuoitre_clean.txt")

# Train trigram model
lm3 = NgramModel(n=3)
lm3.train("tuoitre_clean.txt")

# Test probability of one sentence
sentence = "nhiều khu trung tâm thương mại"
print("Prob (bigram model):", lm2.sentence_prob(sentence))
print("Prob (trigram model):", lm3.sentence_prob(sentence))

# Predict the next word
print("\nPrediction (bigram model):", lm2.predict_next(sentence))
print("Prediction (trigram model):", lm3.predict_next(sentence))

Prob (bigram model): 3.763653436918376e-12
Prob (trigram model): 6.332875283257362e-09

Prediction (bigram model): [(',', 0.05068078668683811), ('xây_dựng', 0.025466464952092788), ('điện_tử', 0.025466464952092788), ('sản_phẩm', 0.025466464952092788), ('vincom', 0.025466464952092788)]
Prediction (trigram model): [('vincom', 0.03000594177064765), ('quyết_định', 0.00029708853238265), ('chủ_đề', 0.00029708853238265), ('dỡ', 0.00029708853238265), ('cai_trị', 0.00029708853238265)]
