In [22]:
import numpy as np
import math

sentences = [
    "Mèo là loài động vật có vú.",
    "Chó là bạn tốt của con người.",
    "Mèo và chó đều là thú cưng phổ biến.",
    "Trời hôm nay rất đẹp.",
    "Hôm nay trời mưa.",
    "Tôi thích ăn cơm với cá.",
    "Mèo thường ăn cá.",
    "Chó thích chạy nhảy ngoài trời.",
    "Trời mưa làm ướt mọi thứ.",
    "Cơm là món ăn truyền thống của người Việt."
]


In [23]:
# Bước 1: Tokenize các câu và xây dựng tập từ vựng
def tokenize(sentence):
    return sentence.lower().split()

vocab = set()
tokenized_sentences = []
for sentence in sentences:
    tokens = tokenize(sentence)
    tokenized_sentences.append(tokens)
    vocab.update(tokens)

vocab = sorted(vocab)
vocab_index = {word: idx for idx, word in enumerate(vocab)}


In [24]:
# Bước 2: tính toán TF (Term frequency)
def compute_tf(tokens, vocab_index):
    tf = np.zeros(len(vocab_index))
    for word in tokens:
        if word in vocab_index:
            tf[vocab_index[word]] += 1
    tf = tf / len(tokens)
    return tf

tf_matrix = np.array([compute_tf(tokens, vocab_index) for tokens in tokenized_sentences])

In [25]:
# Bước 3: Tính toán IDF (Inverse Document Frequency)
def compute_idf(tokenized_sentences, vocab_index):
    idf = np.zeros(len(vocab_index))
    total_docs = len(tokenized_sentences)

    for tokens in tokenized_sentences:
        for word in set(tokens): # Đảm bảo mỗi từ chỉ được tính một lần trong mỗi tài liệu
            if word in vocab_index:
                idf[vocab_index[word]] += 1

    idf = np.log((total_docs + 1) / (idf + 1)) + 1 # Thêm 1 để tránh chia cho 0
    return idf

idf_vector = compute_idf(tokenized_sentences, vocab_index)    


In [26]:
# Bước 4: Tính toán TF-IDF
tfidf_matrix = tf_matrix * idf_vector

# Bước 5: Tính toán cosine similarity
def cosine_similarity(v1, v2):
    dot_product = np.dot(v1, v2)
    norm_v1 = np.linalg.norm(v1)
    norm_v2 = np.linalg.norm(v2)
    return dot_product / (norm_v1 * norm_v2)

# Tính ma trận tương đồng
similarity_matrix = np.zeros((len(sentences), len(sentences)))


In [27]:
for i in range(len(sentences)):
    for j in range(len(sentences)):
        similarity_matrix[i][j] = cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])

# Hiển thị ma trận tương đồng
import pandas as pd
df = pd.DataFrame(similarity_matrix, index=sentences, columns=sentences)

In [28]:
df

Unnamed: 0,Mèo là loài động vật có vú.,Chó là bạn tốt của con người.,Mèo và chó đều là thú cưng phổ biến.,Trời hôm nay rất đẹp.,Hôm nay trời mưa.,Tôi thích ăn cơm với cá.,Mèo thường ăn cá.,Chó thích chạy nhảy ngoài trời.,Trời mưa làm ướt mọi thứ.,Cơm là món ăn truyền thống của người Việt.
Mèo là loài động vật có vú.,1.0,0.074739,0.147326,0.0,0.0,0.0,0.134367,0.0,0.0,0.065511
Chó là bạn tốt của con người.,0.074739,1.0,0.15086,0.0,0.0,0.0,0.0,0.100752,0.0,0.177959
Mèo và chó đều là thú cưng phổ biến.,0.147326,0.15086,1.0,0.0,0.0,0.0,0.119738,0.08768,0.0,0.058379
Trời hôm nay rất đẹp.,0.0,0.0,0.0,1.0,0.577163,0.0,0.0,0.0,0.117386,0.0
Hôm nay trời mưa.,0.0,0.0,0.0,0.577163,1.0,0.0,0.0,0.0,0.135554,0.0
Tôi thích ăn cơm với cá.,0.0,0.0,0.0,0.0,0.0,1.0,0.349097,0.144799,0.0,0.215326
Mèo thường ăn cá.,0.134367,0.0,0.119738,0.0,0.0,0.349097,1.0,0.0,0.0,0.120603
Chó thích chạy nhảy ngoài trời.,0.0,0.100752,0.08768,0.0,0.0,0.144799,0.0,1.0,0.0,0.0
Trời mưa làm ướt mọi thứ.,0.0,0.0,0.0,0.117386,0.135554,0.0,0.0,0.0,1.0,0.0
Cơm là món ăn truyền thống của người Việt.,0.065511,0.177959,0.058379,0.0,0.0,0.215326,0.120603,0.0,0.0,1.0
