In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/CS419/Cranfield.zip -d Cranfield

Archive:  /content/drive/MyDrive/CS419/Cranfield.zip
   creating: Cranfield/Cranfield/
  inflating: Cranfield/Cranfield/1.txt  
  inflating: Cranfield/Cranfield/10.txt  
  inflating: Cranfield/Cranfield/100.txt  
  inflating: Cranfield/Cranfield/1000.txt  
  inflating: Cranfield/Cranfield/1001.txt  
  inflating: Cranfield/Cranfield/1002.txt  
  inflating: Cranfield/Cranfield/1003.txt  
  inflating: Cranfield/Cranfield/1004.txt  
  inflating: Cranfield/Cranfield/1005.txt  
  inflating: Cranfield/Cranfield/1006.txt  
  inflating: Cranfield/Cranfield/1007.txt  
  inflating: Cranfield/Cranfield/1008.txt  
  inflating: Cranfield/Cranfield/1009.txt  
  inflating: Cranfield/Cranfield/101.txt  
  inflating: Cranfield/Cranfield/1010.txt  
  inflating: Cranfield/Cranfield/1011.txt  
  inflating: Cranfield/Cranfield/1012.txt  
  inflating: Cranfield/Cranfield/1013.txt  
  inflating: Cranfield/Cranfield/1014.txt  
  inflating: Cranfield/Cranfield/1015.txt  
  inflating: Cranfield/Cranfield/1016.tx

### Library

In [None]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from numpy.linalg import norm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter,defaultdict

from tqdm import tqdm
import json
import re
import os
import math
import numpy as np
import pandas as pd
import operator
from matplotlib import pyplot as plt

from collections import OrderedDict

In [None]:
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Load data

In [None]:
path = "/content/Cranfield/Cranfield"
path_query = "/content/Cranfield/TEST/query.txt"
path_res = "/content/Cranfield/TEST/RES/{}.txt"

In [None]:
documents = []

for filename in sorted(os.listdir(path), key=lambda x: int(x.split('.')[0])):
    if filename.endswith('.txt'):
        file_path = os.path.join(path, filename)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
            doc_id = filename.replace('.txt', '')
            documents.append((doc_id, text))

In [None]:
documents[0]

('1',
 'experimental investigation of the aerodynamics of a wing in a slipstream . an experimental study of a wing in a propeller slipstream was made in order to determine the spanwise distribution of the lift increase due to slipstream at different angles of attack of the wing and at different free stream to slipstream velocity ratios .  the results were intended in part as an evaluation basis for different theoretical treatments of this problem . the comparative span loading curves, together with supporting evidence, showed that a substantial part of the lift increment produced by the slipstream was due to a /destalling/ or boundary layer control effect .  the integrated remaining lift increment, after subtracting this destalling lift, was found to agree well with a potential flow theory . an empirical evaluation of the destalling effects was made for the specific configuration of the experiment . ')

In [None]:
query = []
with open(path_query, 'r', encoding='UTF-8') as f:
  for i in range(1, 226):
    for line in f:
      parts = line.strip().split("\t", 1)
      if len(parts) == 2:
        query_id, query_text = parts
        query.append((query_id, query_text))

In [None]:
query[0]

('1',
 'what similarity laws must be obeyed when constructing aeroelastic models of heated high speed aircraft .')

In [None]:
RES = []

for index in range(1, 226):  # từ 1 đến 225
    with open(path_res.format(index), 'r', encoding='UTF-8') as f:
        data = f.read().split()
        ids = [int(data[i]) for i in range(1, len(data), 3)]
        RES.append(ids)

In [None]:
print(RES[0])
print("\n",len(RES))

[184, 29, 31, 12, 51, 102, 13, 14, 15, 57, 378, 859, 185, 30, 37, 52, 142, 195, 875, 56, 66, 95, 462, 497, 858, 876, 879, 880, 486]

 225


### Preprocessing data

In [None]:
def text_process(text):
  # Tokenization
  tokens = word_tokenize(text)

  # Lowercase
  tokens_lower = [word.lower() for word in tokens]

  # Loại bỏ Punctuations và các ký tự đặc biệt
  cleaned_tokens = [re.sub(r'[^A-Za-z]+', '', word) for word in tokens_lower]
  cleaned_tokens = [word for word in cleaned_tokens if word]

  # Stopword
  stop_words = set(stopwords.words('english'))
  tokens_no_stopwords = [word for word in cleaned_tokens if word not in stop_words]

  # stemming
  porter_stemmer = PorterStemmer()
  stemmed_tokens = [porter_stemmer.stem(word) for word in tokens_no_stopwords]

  return stemmed_tokens

In [None]:
doc_ids = []
processed_docs = []
for doc_id, text in documents:
    processed_text = text_process(text)
    processed_docs.append((doc_id, processed_text))

In [None]:
processed_docs[0]

('1',
 ['experiment',
  'investig',
  'aerodynam',
  'wing',
  'slipstream',
  'experiment',
  'studi',
  'wing',
  'propel',
  'slipstream',
  'made',
  'order',
  'determin',
  'spanwis',
  'distribut',
  'lift',
  'increas',
  'due',
  'slipstream',
  'differ',
  'angl',
  'attack',
  'wing',
  'differ',
  'free',
  'stream',
  'slipstream',
  'veloc',
  'ratio',
  'result',
  'intend',
  'part',
  'evalu',
  'basi',
  'differ',
  'theoret',
  'treatment',
  'problem',
  'compar',
  'span',
  'load',
  'curv',
  'togeth',
  'support',
  'evid',
  'show',
  'substanti',
  'part',
  'lift',
  'increment',
  'produc',
  'slipstream',
  'due',
  'destal',
  'boundari',
  'layer',
  'control',
  'effect',
  'integr',
  'remain',
  'lift',
  'increment',
  'subtract',
  'destal',
  'lift',
  'found',
  'agre',
  'well',
  'potenti',
  'flow',
  'theori',
  'empir',
  'evalu',
  'destal',
  'effect',
  'made',
  'specif',
  'configur',
  'experi'])

### TF-IDF calculation and indexing

In [None]:
# Trích ra văn bản đã tiền xử lý
texts = [text for _, text in processed_docs]
doc_ids = [doc_id for doc_id, _ in processed_docs]

# Tính TF-IDF
vectorizer = TfidfVectorizer(
    sublinear_tf=True, #Sử dụng công thức chuẩn log tf
    smooth_idf=False, #Không cộng thêm 1 vào công thức idf
    norm='l2', #Sử dụng chuẩn cosine
    tokenizer=lambda x: x, #Không sử dụng tokenizer mặc định
    preprocessor=lambda x: x, #Không sử dụng preprocessor mặc định
    token_pattern=None #Tắt token pattern mặc định để không bị override
    )

tfidf_matrix = vectorizer.fit_transform(texts)  # Ma trận (số tài liệu x số term)

In [None]:
terms = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=terms)

print(df_tfidf)

       ab  abbrevi  abil  abl  ablat  abrupt  abruptli  absenc  absent  \
0     0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
1     0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
2     0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
3     0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
4     0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
...   ...      ...   ...  ...    ...     ...       ...     ...     ...   
1395  0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
1396  0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
1397  0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
1398  0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   
1399  0.0      0.0   0.0  0.0    0.0     0.0       0.0     0.0     0.0   

      absolut  ...  zamm  zbrozek  zehnder      zero  zeroth  zhukhovitskii  \
0         0.0  ...   0.0      0.

In [None]:
# Tạo danh sách chỉ mục đảo
index = {}
for term_id, term in enumerate(terms):
    for doc_id in tfidf_matrix[:, term_id].nonzero()[0]:
        score = float(tfidf_matrix[doc_id, term_id])
        index.setdefault(term, []).append((doc_ids[doc_id], round(score, 4)))

In [None]:
# Ví dụ một phần chỉ mục
term = 'engin'
print(f"Term: '{term}'")
for doc_id, weight in index.get(term, []):
    print(f" - Document ID: {doc_id}, Weight: {weight}")

Term: 'engin'
 - Document ID: 12, Weight: 0.1189
 - Document ID: 33, Weight: 0.136
 - Document ID: 42, Weight: 0.2139
 - Document ID: 57, Weight: 0.1174
 - Document ID: 78, Weight: 0.1535
 - Document ID: 100, Weight: 0.2065
 - Document ID: 144, Weight: 0.1062
 - Document ID: 193, Weight: 0.0806
 - Document ID: 232, Weight: 0.0994
 - Document ID: 253, Weight: 0.1063
 - Document ID: 270, Weight: 0.1524
 - Document ID: 378, Weight: 0.1903
 - Document ID: 409, Weight: 0.1586
 - Document ID: 554, Weight: 0.1179
 - Document ID: 589, Weight: 0.1343
 - Document ID: 695, Weight: 0.1571
 - Document ID: 721, Weight: 0.1608
 - Document ID: 722, Weight: 0.0951
 - Document ID: 724, Weight: 0.0992
 - Document ID: 725, Weight: 0.1241
 - Document ID: 729, Weight: 0.1031
 - Document ID: 866, Weight: 0.2156
 - Document ID: 906, Weight: 0.1783
 - Document ID: 908, Weight: 0.1492
 - Document ID: 911, Weight: 0.1233
 - Document ID: 968, Weight: 0.1469
 - Document ID: 1025, Weight: 0.0965
 - Document ID: 109

### Evaluate

In [None]:
def evaluate_model(retrieved_results, true_relevant_docs, k_values=[5, 10, 20]):
    metrics = {}
    num_relevant_retrieved = 0
    precision_at_k = {}
    sum_precisions = 0.0

    true_relevant_set = set(true_relevant_docs)
    num_true_relevant = len(true_relevant_set)

    recalls = []
    precisions = []

    for i, doc_id in enumerate(retrieved_results):
        if doc_id in true_relevant_set:
            num_relevant_retrieved += 1
            current_precision = num_relevant_retrieved / (i + 1)
            sum_precisions += current_precision

        # Precision@K
        for k in k_values:
            if (i + 1) == k:
                precision_at_k[f'P@{k}'] = num_relevant_retrieved / k

        if num_true_relevant > 0:
            recalls.append(num_relevant_retrieved / num_true_relevant)
        else:
            recalls.append(0.0)
        precisions.append(num_relevant_retrieved / (i + 1))


    total_retrieved = len(retrieved_results)
    if total_retrieved > 0:
        metrics['Precision'] = num_relevant_retrieved / total_retrieved
    else:
        metrics['Precision'] = 0.0

    if num_true_relevant > 0:
        metrics['Recall'] = num_relevant_retrieved / num_true_relevant
    else:
        metrics['Recall'] = 0.0

    if (metrics['Precision'] + metrics['Recall']) > 0:
        metrics['F1-score'] = 2 * (metrics['Precision'] * metrics['Recall']) / (metrics['Precision'] + metrics['Recall'])
    else:
        metrics['F1-score'] = 0.0

    # Average Precision (AP)
    if num_true_relevant > 0:
        metrics['AP'] = sum_precisions / num_true_relevant
    else:
        metrics['AP'] = 0.0

    metrics.update(precision_at_k)
    metrics['recalls_list'] = recalls
    metrics['precisions_list'] = precisions
    return metrics

In [None]:
def Recall_and_Precision_r(list_re_pre):
    rs = list()
    trec = np.arange(0, 1.1, 0.1) # Standard recall levels

    for q_id in list_re_pre.keys():
        recall_list = list_re_pre[q_id]['recalls_list'] # Corrected key
        precision_list = list_re_pre[q_id]['precisions_list'] # Corrected key

        tempt = []
        for rec_level in trec:
            max_prec_at_level = 0.0
            found_at_level = False
            for i in range(len(recall_list)):
                if recall_list[i] >= rec_level:
                    max_prec_at_level = max(max_prec_at_level, max(precision_list[i:]))
                    found_at_level = True
                    break
            if found_at_level:
                tempt.append(max_prec_at_level)
            else:
                tempt.append(0.0)

        # Ensure 11 points (for 0.0 to 1.0 recall levels)
        while len(tempt) < 11:
            tempt.append(0.0)
        rs.append(tempt)
    return rs

def MAPr(list_pre_APr):
    APr = list()
    for i in list_pre_APr:
        APr.append(np.mean(i))
    MAPr_value = np.mean(APr)
    return MAPr_value

In [None]:
# Tiền xử lý cho query
processed_query = []
for doc_id, text in query:
    processed_text = text_process(text)
    processed_query.append(processed_text)

In [None]:
processed_query[0]

['similar',
 'law',
 'must',
 'obey',
 'construct',
 'aeroelast',
 'model',
 'heat',
 'high',
 'speed',
 'aircraft']

In [None]:
all_query_metrics = []
full_recall_precision_data = {}

for idx, q in enumerate(processed_query):
    # Xây dưng vector trọng số TF-IDF cho query
    query_vector = vectorizer.transform([q])

    # Tính cosine similarity giữa query và toàn bộ documents
    similarities = cosine_similarity(query_vector, tfidf_matrix)[0]

    # Sắp xếp các document theo độ tương đồng giảm dần
    ranked_indices = np.argsort(similarities)[::-1]
    ranked_doc_ids = [doc_ids[i] for i in ranked_indices]

    # Lấy kết quả thực tế từ RES
    relevant_docs = [str(doc_id) for doc_id in RES[idx]]  # Convert sang str để so sánh đúng
    eval_metrics = evaluate_model(ranked_doc_ids, relevant_docs)
    all_query_metrics.append(eval_metrics)

    # Lưu danh sách recall và precision để tính MAPr
    full_recall_precision_data[idx + 1] = {
        "recalls_list": eval_metrics["recalls_list"],
        "precisions_list": eval_metrics["precisions_list"]
    }

# --- Tổng hợp kết quả ---
total_precision = 0.0
total_recall = 0.0
total_f1 = 0.0
total_ap = 0.0
total_p_at_5 = 0.0
total_p_at_10 = 0.0
total_p_at_20 = 0.0

num_queries_evaluated = len(all_query_metrics)

for metrics in all_query_metrics:
    total_precision += metrics.get('Precision', 0)
    total_recall += metrics.get('Recall', 0)
    total_f1 += metrics.get('F1-score', 0)
    total_ap += metrics.get('AP', 0)
    total_p_at_5 += metrics.get('P@5', 0)
    total_p_at_10 += metrics.get('P@10', 0)
    total_p_at_20 += metrics.get('P@20', 0)

print("\n--- Kết quả Đánh giá Mô hình Vector Space ---")
if num_queries_evaluated > 0:
    print(f"Precision Trung bình: {total_precision / num_queries_evaluated:.4f}")
    print(f"Recall Trung bình: {total_recall / num_queries_evaluated:.4f}")
    print(f"F1-score Trung bình: {total_f1 / num_queries_evaluated:.4f}")
    print(f"Mean Average Precision (MAP): {total_ap / num_queries_evaluated:.4f}")
    print(f"Precision@5 Trung bình: {total_p_at_5 / num_queries_evaluated:.4f}")
    print(f"Precision@10 Trung bình: {total_p_at_10 / num_queries_evaluated:.4f}") # Changed total_queries_evaluated to num_queries_evaluated
    print(f"Precision@20 Trung bình: {total_p_at_20 / num_queries_evaluated:.4f}")
else:
    print("Không có truy vấn nào được đánh giá.")

# --- Tính toán MAP nội suy (MAPr) ---
if num_queries_evaluated > 0:
    avg_recall_precision_r_data = Recall_and_Precision_r(full_recall_precision_data)
    map_r_value = MAPr(avg_recall_precision_r_data)
    print(f"Mean Average Precision (Interpolated) (MAPr): {map_r_value:.4f}")
else:
    print("Không thể tính MAPr do không có truy vấn nào được đánh giá.")


--- Kết quả Đánh giá Mô hình Vector Space ---
Precision Trung bình: 0.0058
Recall Trung bình: 0.9995
F1-score Trung bình: 0.0115
Mean Average Precision (MAP): 0.3982
Precision@5 Trung bình: 0.4089
Precision@10 Trung bình: 0.2893
Precision@20 Trung bình: 0.1938
Mean Average Precision (Interpolated) (MAPr): 0.4177
