In [None]:
!pip install numpy pandas scikit-learn nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK data used later in the notebook: the English stop-word list
# and the Punkt tokenizer data that `word_tokenize` loads. NLTK is installed
# unpinned above, and recent releases look the tokenizer data up under the
# name 'punkt_tab' instead of 'punkt', so request both names.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
!unzip '/content/FUM_IR_1402-02_HW#1_Resources.zip'
dataset_path='/content/FUM_IR_1402-02_HW#1_Resources'

In [4]:
import os
def _read_numbered_lines(file_path):
    """Return [(1, first_line), (2, second_line), ...] with each line stripped."""
    with open(file_path, 'r') as file:
        return [(number, line.strip()) for number, line in enumerate(file, start=1)]


def load_dataset(path):
    """Read the document collection and both query files found under `path`.

    `path` must contain a `cranfieldDocs` directory (one file per document)
    plus the files `queries.txt` and `my-query.txt` (one query per line).

    Args:
        path: Root folder of the extracted resources.

    Returns:
        dict with three keys:
          'docs'     -- list of (file_name, raw_content) tuples, sorted by
                        file name,
          'queries'  -- list of (query_id, text) tuples from queries.txt,
          'my_query' -- list of (query_id, text) tuples from my-query.txt.
        Query ids are 1-based line numbers; query text is stripped of
        surrounding whitespace.

    Raises:
        OSError: if the documents directory or either query file is missing.
    """
    docs_path = os.path.join(path, 'cranfieldDocs')
    query_path = os.path.join(path, 'queries.txt')
    my_query_path = os.path.join(path, 'my-query.txt')

    docs = []
    # os.listdir returns entries in arbitrary order; sort so that document
    # order (and therefore every downstream row index) is reproducible.
    for file_name in sorted(os.listdir(docs_path)):
        with open(os.path.join(docs_path, file_name), 'r') as file:
            docs.append((file_name, file.read()))

    # The query files are read exactly once, outside the document loop, so the
    # result is well defined even when the documents directory is empty.
    return {
        'docs': docs,
        'queries': _read_numbered_lines(query_path),
        'my_query': _read_numbered_lines(my_query_path),
    }

In [5]:
def load_relevace(folder_path):
    """Load the judgments stored in `relevance.txt` inside `folder_path`.

    Every non-blank line must consist of exactly two whitespace-separated
    integers. The first is used as the dictionary key and the second is
    appended to that key's list (the rest of the notebook treats them as a
    query id and a relevant document id). Blank lines are skipped.

    Args:
        folder_path: Directory that contains `relevance.txt`.

    Returns:
        dict mapping int -> list of int, values kept in file order.

    Raises:
        ValueError: if a non-blank line does not hold exactly two integers.
    """
    id_dict = {}
    filename = os.path.join(folder_path, 'relevance.txt')
    with open(filename, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Tolerate empty lines, e.g. a trailing newline at end of file.
                continue
            query_id, doc_id = fields
            query_id = int(query_id)
            doc_id = int(doc_id)
            id_dict.setdefault(query_id, []).append(doc_id)
    return id_dict

In [7]:
import re

# Load the documents/queries and the relevance judgments from the extracted
# resources folder; both names are used as module-level state by later cells.
dataset = load_dataset(dataset_path)
relevance = load_relevace(dataset_path)
def preprocess_document(doc):
    """Turn one raw SGML document into a space-separated string of stems.

    Pipeline: lowercase the input, keep only the contents of the first
    <title> and <text> elements, strip any remaining tags, replace every
    character that is not a-z or whitespace with a space, tokenize, drop
    English stop words, Porter-stem the rest and discard stems of two
    characters or fewer.

    Args:
        doc: Raw document text.

    Returns:
        The surviving stems joined by single spaces; an empty string when the
        document has neither a <title> nor a <text> element.
    """
    lowered = doc.lower()

    # Collect the two sections we care about; everything else is ignored.
    title_match = re.search(r'<title>(.*?)</title>', lowered, re.S)
    text_match = re.search(r'<text>(.*?)</text>', lowered, re.S)
    sections = []
    if title_match:
        sections.append(title_match.group(1) + " ")
    if text_match:
        sections.append(text_match.group(1))
    body = "".join(sections)

    # Drop leftover markup, then blank out digits and punctuation.
    body = re.sub(r'<.*?>', '', body)
    body = re.sub(r'[^a-z\s]', ' ', body)

    words = word_tokenize(body)
    english_stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stems = []
    for word in words:
        if word in english_stop_words:
            continue
        stem = stemmer.stem(word)
        # The length filter is applied to the stem, not the original word.
        if len(stem) > 2:
            stems.append(stem)

    return ' '.join(stems)

In [8]:
# Preprocess every loaded document once. The two lists are parallel:
# documents[i] is the cleaned text of the file whose name is doc_ids[i].
doc_ids = [doc_id for doc_id, _ in dataset['docs']]
documents = [preprocess_document(text) for _, text in dataset['docs']]

In [9]:
# Fit a TF-IDF model (scikit-learn defaults) on the preprocessed documents.
# Row i of `tfidf_matrix` is the vector for documents[i]; the fitted
# `vectorizer` is reused later to project queries into the same space.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

def preprocess_query(query):
    """Normalize a query string the same way documents are normalized.

    Pipeline: lowercase, replace every character that is not a-z or
    whitespace with a space, tokenize, drop English stop words, Porter-stem
    the rest and discard stems of two characters or fewer.

    Args:
        query: Raw query text.

    Returns:
        The surviving stems joined by single spaces.
    """
    # Lowercase before filtering: the character class below keeps only a-z, so
    # an uppercase letter would otherwise be replaced by a space and break the
    # word apart (e.g. "Boundary" -> " oundary"). Stop-word matching is also
    # done against lowercase entries.
    doc = re.sub(r'[^a-z\s]', ' ', query.lower())

    tokens = word_tokenize(doc)

    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()

    tokens = [porter.stem(token) for token in tokens if token not in stop_words]

    # Very short stems carry little meaning; the filter applies to the stem.
    tokens = [token for token in tokens if len(token) > 2]

    return ' '.join(tokens)
def related_docs(n, queries=None, verbose=True):
    """Rank the indexed documents against each query and keep the best `n`.

    Uses the module-level `vectorizer`, `tfidf_matrix` and `doc_ids` built in
    the earlier cells: each query is preprocessed, projected with the fitted
    vectorizer and compared to every document row by cosine similarity.

    Args:
        n: Number of documents to keep per query. If `n` exceeds the number of
            documents, all of them are returned; `n <= 0` yields empty lists.
        queries: Iterable of (query_id, text) pairs. When omitted,
            dataset['queries'] is looked up at call time.
        verbose: When True (the default) print one line per retrieved
            document and a blank separator after each query.

    Returns:
        dict mapping query_id -> list of document ids, best match first.
        Each id is the corresponding `doc_ids` entry converted with int().
    """
    if queries is None:
        # Resolved here rather than in the signature so that a reloaded
        # `dataset` is picked up without redefining this function.
        queries = dataset['queries']

    top = {}
    for query_id, text in queries:
        query_vector = vectorizer.transform([preprocess_query(text)])

        # One similarity score per document row.
        similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

        if n > 0:
            # argsort is ascending: take the last n and reverse for best-first.
            top_indices = similarities.argsort()[-n:][::-1]
        else:
            # A slice of [-0:] would select every document, so handle it here.
            top_indices = []

        top[query_id] = []
        for idx in top_indices:
            doc_id = doc_ids[idx]
            score = similarities[idx]
            if verbose:
                print(f'(query_id:{query_id},doc_id:{doc_id}, Score: {score})')
            top[query_id].append(int(doc_id))
        if verbose:
            print("\n")
    return top



In [None]:
# Retrieve the 10 best-matching documents for each query in dataset['queries'].
top= related_docs(10)

In [11]:
def calculate_metrics(top_lists, relevance_judgments=None):
    """Compute precision and recall for every query, plus their means.

    Args:
        top_lists: dict mapping query_id -> list of retrieved document ids.
        relevance_judgments: dict mapping query_id -> collection of relevant
            document ids. When omitted, the module-level `relevance` is used.

    Returns:
        A tuple (metrics, avg_pre, avg_recall) where `metrics` maps
        query_id -> (precision, recall) and the two averages are means over
        the queries present in `top_lists` (0.0 when there are none).
        A query with an empty result list gets precision 0.0, and a query
        with an empty judgment list gets recall 0.0.

    Raises:
        KeyError: if a query id has no entry in the relevance judgments.
    """
    if relevance_judgments is None:
        relevance_judgments = relevance

    metrics = {}
    precision_sum = 0
    recall_sum = 0
    for query_id, retrieved in top_lists.items():
        judged = relevance_judgments[query_id]
        # Set membership keeps the hit count linear in the result size.
        relevant = set(judged)
        tp = sum(1 for doc_id in retrieved if doc_id in relevant)

        precision = tp / len(retrieved) if len(retrieved) else 0.0
        recall = tp / len(judged) if len(judged) else 0.0

        precision_sum += precision
        recall_sum += recall
        metrics[query_id] = (precision, recall)

    # Average over the queries actually evaluated rather than a fixed count.
    query_count = len(metrics)
    if query_count == 0:
        return metrics, 0.0, 0.0
    avg_pre = precision_sum / query_count
    avg_recall = recall_sum / query_count
    return metrics, avg_pre, avg_recall

In [None]:
# Display the current `top` mapping (query_id -> ranked list of document ids).
top

In [13]:
# Per-query (precision, recall) plus the two averages for the ranking in `top`.
metrics ,avg_pre ,avg_recall =calculate_metrics(top)

In [None]:
# Sweep the cut-off N: for each value, retrieve the top-N documents per query
# and store the averaged and per-query precision/recall under that N.
list_metrics = {}
for n in (10, 20, 50, 100, 500):
    print(f'n={n}')
    top = related_docs(n)
    metrics, avg_pre, avg_recall = calculate_metrics(top)
    list_metrics[n] = {'precision_average': avg_pre, 'recall_average': avg_recall, 'queries': metrics}

In [None]:
# Display the collected metrics, keyed by cut-off N.
list_metrics

In [None]:
import matplotlib.pyplot as plt

# Plot average precision and average recall against the cut-off N, using the
# values collected in `list_metrics`.
# For reference, an earlier run gave precision 0.23 / recall ~0.20 at N=10 and
# precision ~0.024 / recall ~0.94 at N=500.

# The dictionary keys are the evaluated cut-offs; they become the x values.
ids = list(list_metrics)
precision_averages = [list_metrics[cutoff]['precision_average'] for cutoff in ids]
recall_averages = [list_metrics[cutoff]['recall_average'] for cutoff in ids]

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(ids, precision_averages, marker='o', label='Average Precision')
ax.plot(ids, recall_averages, marker='o', label='Average Recall')

# Put one labelled tick at every evaluated N.
ax.set_xticks(ids)
ax.set_xticklabels(ids)
ax.set_xlabel('N')
ax.set_ylabel('Average Value')
ax.set_title('Average Precision and Recall')
ax.legend()
ax.grid(True)

plt.show()


In [None]:


# Run the same cut-off sweep for the custom queries in dataset['my_query'].
# Only the printed rankings are of interest here; no metrics are computed.
for n in [10,20,50,100,500]:
  print(f'n={n}')
  top=related_docs(n,queries=dataset['my_query'])

In [None]:
# Show just the three best matches for each custom query.
print(f'n={3}')
top=related_docs(3,queries=dataset['my_query'])