In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import datetime

# 讀取資料
train_questions = pd.read_csv('train_question.csv')
test_questions = pd.read_csv('test_question.csv')
documents_data = pd.read_csv('documents_data.csv')

# 定義預處理函數
def clean_html(document_html, is_html=False):
    if is_html:
        soup = BeautifulSoup(document_html, 'html.parser')
        text = soup.get_text()
    else:
        text = document_html
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return text

# 預處理文檔和訓練集
documents_data['cleaned_text'] = documents_data['Document_HTML'].apply(lambda x: clean_html(x, is_html=True))
train_questions['cleaned_question'] = train_questions['Question'].apply(clean_html)

# 預處理測試集
test_questions['cleaned_question'] = test_questions['Question'].apply(clean_html)

# 建立詞彙表（僅根據訓練集和文檔中的詞彙）
docs_tokens = documents_data['cleaned_text'].apply(lambda x: x.split())
train_questions_tokens = train_questions['cleaned_question'].apply(lambda x: x.split())
all_tokens = [token for tokens in docs_tokens for token in tokens] + [token for tokens in train_questions_tokens for token in tokens]
vocabulary = list(set(all_tokens))
vocab_size = len(vocabulary)

# 建立詞彙到索引的映射
word_to_index = {word: i for i, word in enumerate(vocabulary)}

# 建立文檔-詞矩陣 (Document-Term Matrix)
num_docs = len(documents_data)
DTM = np.zeros((num_docs, vocab_size))

for i, tokens in enumerate(docs_tokens):
    token_indices = [word_to_index[token] for token in tokens if token in word_to_index]
    counts = np.bincount(token_indices, minlength=vocab_size)
    DTM[i] = counts

# 建立訓練查詢-詞矩陣 (Query-Term Matrix) for train
num_train_queries = len(train_questions)
QTM_train = np.zeros((num_train_queries, vocab_size))

for i, tokens in enumerate(train_questions_tokens):
    token_indices = [word_to_index[token] for token in tokens if token in word_to_index]
    counts = np.bincount(token_indices, minlength=vocab_size)
    QTM_train[i] = counts

# 計算IDF（使用訓練集和文檔）
DF = np.sum(DTM > 0, axis=0)  # 包含每個詞的文檔數
N = num_docs
IDF = np.log(N / (1 + DF))  # 防止除以0

# 計算TF
doc_lengths = np.sum(DTM, axis=1, keepdims=True)  # 文檔中詞的總數
doc_lengths[doc_lengths == 0] = 1  # 防止除以0
TF_docs = DTM / doc_lengths

train_query_lengths = np.sum(QTM_train, axis=1, keepdims=True)
train_query_lengths[train_query_lengths == 0] = 1
TF_train_queries = QTM_train / train_query_lengths

# 計算TF-IDF for documents and train queries
TFIDF_docs = TF_docs * IDF
TFIDF_train_queries = TF_train_queries * IDF

# 正規化TF-IDF向量
norms_docs = np.linalg.norm(TFIDF_docs, axis=1, keepdims=True)
norms_docs[norms_docs == 0] = 1
normalized_TFIDF_docs = TFIDF_docs / norms_docs

norms_train_queries = np.linalg.norm(TFIDF_train_queries, axis=1, keepdims=True)
norms_train_queries[norms_train_queries == 0] = 1
normalized_TFIDF_train_queries = TFIDF_train_queries / norms_train_queries

# 計算餘弦相似度矩陣 for train queries
similarity_matrix_train = np.dot(normalized_TFIDF_train_queries, normalized_TFIDF_docs.T)

# 對每個訓練查詢找到最相似的前三個文檔
top_k = 3
top_k_indices_train = np.argsort(similarity_matrix_train, axis=1)[:, -top_k:][:, ::-1]
predicted_docs_train = documents_data['Document ID'].values

# 提取訓練查詢的預測文檔ID
results_train = [predicted_docs_train[indices] for indices in top_k_indices_train]

# 現在對測試查詢進行預測
# 建立測試查詢-詞矩陣 (Query-Term Matrix) for test
test_questions_tokens = test_questions['cleaned_question'].apply(lambda x: x.split())
num_test_queries = len(test_questions)
QTM_test = np.zeros((num_test_queries, vocab_size))

for i, tokens in enumerate(test_questions_tokens):
    token_indices = [word_to_index[token] for token in tokens if token in word_to_index]
    counts = np.bincount(token_indices, minlength=vocab_size)
    QTM_test[i] = counts

# 計算TF for test queries
test_query_lengths = np.sum(QTM_test, axis=1, keepdims=True)
test_query_lengths[test_query_lengths == 0] = 1
TF_test_queries = QTM_test / test_query_lengths

# 計算TF-IDF for test queries
TFIDF_test_queries = TF_test_queries * IDF

# 正規化TF-IDF向量 for test queries
norms_test_queries = np.linalg.norm(TFIDF_test_queries, axis=1, keepdims=True)
norms_test_queries[norms_test_queries == 0] = 1
normalized_TFIDF_test_queries = TFIDF_test_queries / norms_test_queries

# 計算餘弦相似度矩陣 for test queries
similarity_matrix_test = np.dot(normalized_TFIDF_test_queries, normalized_TFIDF_docs.T)

# 對每個測試查詢找到最相似的前三個文檔
top_k_indices_test = np.argsort(similarity_matrix_test, axis=1)[:, -top_k:][:, ::-1]
predicted_docs_test = documents_data['Document ID'].values

# 提取測試查詢的預測文檔ID
results_test = [predicted_docs_test[indices] for indices in top_k_indices_test]

# 保存結果 for test queries
now = datetime.datetime.now()
formatted_time = now.strftime("%m%d_%H%M")
csv_filename = f'vector_{formatted_time}_test.csv'
result_df_test = pd.DataFrame({
    "index": list(range(1, len(results_test) + 1)),
    "answer": [' '.join(map(str, doc_ids)) for doc_ids in results_test]
})

result_df_test.to_csv(csv_filename, index=False)

# 輸出測試查詢結果
for i, doc_ids in enumerate(results_test):
    answer = ' '.join(map(str, doc_ids))
    print(f"Test Query {i+1}: {answer}")

print(f"Results for test queries saved to {csv_filename}")


Test Query 1: 966 849 730
Test Query 2: 145 328 663
Test Query 3: 551 932 982
Test Query 4: 359 874 111
Test Query 5: 973 204 357
Test Query 6: 975 688 348
Test Query 7: 807 847 794
Test Query 8: 299 441 756
Test Query 9: 546 885 394
Test Query 10: 780 269 691
Test Query 11: 443 263 409
Test Query 12: 683 543 809
Test Query 13: 79 905 20
Test Query 14: 881 511 803
Test Query 15: 596 684 923
Test Query 16: 411 563 726
Test Query 17: 360 181 957
Test Query 18: 389 340 221
Test Query 19: 618 867 734
Test Query 20: 394 82 649
Test Query 21: 420 835 473
Test Query 22: 620 530 300
Test Query 23: 735 286 575
Test Query 24: 285 804 200
Test Query 25: 827 965 894
Test Query 26: 244 539 574
Test Query 27: 369 631 316
Test Query 28: 954 788 969
Test Query 29: 353 350 871
Test Query 30: 282 570 994
Test Query 31: 811 736 885
Test Query 32: 208 771 470
Test Query 33: 927 210 884
Test Query 34: 971 629 699
Test Query 35: 885 552 898
Test Query 36: 875 262 523
Test Query 37: 738 604 961
Test Query 38

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import datetime

# 讀取資料
train_questions = pd.read_csv('train_question.csv')
test_questions = pd.read_csv('test_question.csv')
documents_data = pd.read_csv('documents_data.csv')

# 停用詞列表（可根據需要擴充）
stop_words = set(['the', 'is', 'in', 'at', 'of', 'and', 'a', 'to'])

# 定義文本預處理函數
def clean_text(text, is_html=False):
    if is_html:
        soup = BeautifulSoup(text, 'html.parser')
        text = soup.get_text()
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    return text

# 預處理文檔和查詢
documents_data['cleaned_text'] = documents_data['Document_HTML'].apply(lambda x: clean_text(x, is_html=True))
train_questions['cleaned_question'] = train_questions['Question'].apply(clean_text)
test_questions['cleaned_question'] = test_questions['Question'].apply(clean_text)

# 建立詞彙表（基於訓練集和文檔）
docs_tokens = documents_data['cleaned_text'].apply(lambda x: x.split())
train_tokens = train_questions['cleaned_question'].apply(lambda x: x.split())

# 計算所有詞的出現次數
all_tokens = [token for tokens in docs_tokens for token in tokens] + [token for tokens in train_tokens for token in tokens]
all_word_counts = pd.Series(all_tokens).value_counts()

# 建立詞彙表
vocabulary = all_word_counts.index.tolist()
vocab_size = len(vocabulary)

# 詞彙到索引的映射
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# 構建文檔-詞矩陣 (DTM)
num_docs = len(documents_data)
DTM = np.zeros((num_docs, vocab_size))
doc_lengths = np.zeros(num_docs)

for i, tokens in enumerate(docs_tokens):
    token_indices = [word_to_index[token] for token in tokens if token in word_to_index]
    counts = np.bincount(token_indices, minlength=vocab_size)
    DTM[i] = counts
    doc_lengths[i] = np.sum(counts)

# 計算平均文檔長度
avg_doc_length = np.mean(doc_lengths)

# 計算IDF（逆文檔頻率）
df = np.sum(DTM > 0, axis=0)
N = num_docs
IDF = np.log((N - df + 0.5) / (df + 0.5) + 1)

# 設定 BM25 的參數
k1 = 1.5
b = 0.75

# 計算 BM25 文檔矩陣
BM25_docs = np.zeros((num_docs, vocab_size))

for i in range(num_docs):
    for j in range(vocab_size):
        tf = DTM[i, j]
        if tf > 0:
            numerator = tf * (k1 + 1)
            denominator = tf + k1 * (1 - b + b * (doc_lengths[i] / avg_doc_length))
            BM25_docs[i, j] = IDF[j] * (numerator / denominator)

# 處理測試查詢
test_tokens = test_questions['cleaned_question'].apply(lambda x: x.split())
num_test_queries = len(test_questions)
BM25_test_queries = np.zeros((num_test_queries, vocab_size))

for i, tokens in enumerate(test_tokens):
    token_indices = [word_to_index[token] for token in tokens if token in word_to_index]
    query_term_freq = np.bincount(token_indices, minlength=vocab_size)
    BM25_test_queries[i] = query_term_freq * IDF

# 計算查詢與文檔的 BM25 相似度
similarity_matrix = np.dot(BM25_test_queries, BM25_docs.T)

# 對每個查詢找到最相關的前三個文檔
top_k = 3
top_k_indices = np.argsort(similarity_matrix, axis=1)[:, -top_k:][:, ::-1]
predicted_docs = documents_data['Document ID'].values

# 提取預測結果
results = [predicted_docs[indices] for indices in top_k_indices]

# 保存結果
now = datetime.datetime.now().strftime("%m%d_%H%M")
csv_filename = f'vector_{now}_test.csv'
result_df = pd.DataFrame({
    "index": range(1, len(results) + 1),
    "answer": [' '.join(map(str, doc_ids)) for doc_ids in results]
})
result_df.to_csv(csv_filename, index=False)

# 輸出結果
for i, doc_ids in enumerate(results):
    print(f"Test Query {i+1}: {' '.join(map(str, doc_ids))}")

print(f"Results saved to {csv_filename}")


FileNotFoundError: [Errno 2] No such file or directory: 'train_question.csv'