# Academic Search Engine

## Noninteractive part

This section contains all the code that the rest of the notebook needs, so that it can be executed without running the interactive part. To do that, select this section in the *Table of contents* and choose the option *Run cells in section*.

### Dependencies

In [10]:
!pip install datasets==2.16.1 python-terrier==0.10.0



### Google Drive Supporter

In [11]:
def cd_drive_directory(dir = ''):
  """Change directory to specified path in Google Drive."""
  mount_google_drive()
  %mkdir /content/drive/{dir}
  %cd /content/drive/{dir}

def mount_google_drive():
  """Mount Google Drive at `/content/drive`.

  The `google.colab` package is imported inside the function, so defining the
  function does not require the package; only calling it does.
  """
  # Imported here rather than at cell level: needed only when actually mounting.
  from google.colab import drive
  drive.mount('/content/drive')

### Dataset getter

In [12]:
"""This module downloads dataset from huggingface.co"""

import datasets
import pandas as pd

def get_dataset(
    name: str, split: str, subset: str | None = None
  ) -> pd.DataFrame:
  """Download a dataset and return one of its splits as a DataFrame.

  Args:
    name: name of the dataset handed to the downloader
    split: key of the split to pick from the downloaded dataset
    subset: optional subset of the dataset

  Returns:
    The selected split, converted with its `to_pandas()` method.
  """
  return _download_dataset(name, subset)[split].to_pandas()

def _download_dataset(
    name: str, subset: str | None = None
  ) -> datasets.DatasetDict:
  """Download dataset from huggingface.co.

  Thin wrapper around `datasets.load_dataset`.

  Args:
    name: name of the dataset, passed as the first positional argument
    subset: passed as the second positional argument; `None` by default
  """
  # No split is requested here; `get_dataset` indexes the result by split name.
  return datasets.load_dataset(name, subset)

### BeIR dataset preparer

In [13]:
"""This module prepare BeIR dataset"""

def get_beir_quora_dataset(
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
  """Return the converted corpus, queries and qrels of the BeIR Quora dataset.

  The corpus and the queries are downloaded from `BeIR/quora`, the qrels from
  the `validation` split of `BeIR/quora-qrels`. Each frame is passed through
  its matching `_convert_quora_*` helper right after it has been downloaded.

  Returns:
    Tuple `(corpus, queries, qrels)`.
  """
  corpus = _convert_quora_corpus(
    get_dataset('BeIR/quora', 'corpus', 'corpus'))
  queries = _convert_quora_queries(
    get_dataset('BeIR/quora', 'queries', 'queries'))
  qrels = _convert_quora_qrels(
    get_dataset('BeIR/quora-qrels', 'validation'))
  return corpus, queries, qrels

def _convert_quora_corpus(dataset: pd.DataFrame) -> pd.DataFrame:
  """Convert corpus to expected format.

  Downloaded corpus contain:
  - title of the document (which is always empty for `BeIR/quora`),
  - text of the document,
  - unique id of the document.
  It converts it to format expected by `pyterrier.DFIndexer` which is
  DataFrame with columns `docno` and `text`.
  """
  return pd.DataFrame({
    'docno': dataset['_id'],
    'text': dataset['text'],
  })

def _convert_quora_queries(dataset: pd.DataFrame) -> pd.DataFrame:
  """Convert queries to expected format.

  Downloaded queries contain:
  - text of the query
  - unique id of the query
  It converts it to format expected by `pyterrier.BatchRetrieve.transform`
  function which is DataFrame with columns `qid` and `query`.
  """
  return pd.DataFrame({
    'qid': dataset['_id'],
    'query': dataset['text'].str.replace(r'[/?()\\#\'{}^!\":*]', ''),
  })

def _convert_quora_qrels(dataset: pd.DataFrame) -> pd.DataFrame:
  """Convert qrels to expected format.

  Downloaded qrels contain:
  - unique id of the query
  - unique id of the document
  - score equal 1
  It converts it to format expected by `pyterrier.Experiment` function which is
  DataFrame with columns `qid`, `docno`, `label`.
  """
  return pd.DataFrame({
    'qid': dataset['query-id'].apply(str),
    'docno': dataset['corpus-id'].apply(str),
    'label': dataset['score'],
  })

### Text Preprocessor

In [14]:
# """This module preprocess text"""

# import pandas as pd
# import re
# import nltk
# import warnings
# from pathlib import Path
# from typing import Tuple

# ReusableObjects = Tuple[
#   nltk.tokenize.WordPunctTokenizer,
#   nltk.corpus.WordListCorpusReader,
#   nltk.stem.porter.PorterStemmer,
#   re.Pattern,
# ]

# """                                                                        """
# def preprocess_text_in_dataset(
#     dataset: pd.DataFrame, cache_key: str
#   ) -> pd.DataFrame:
#   """Preprocess text in dataset using cache if available.

#   This function check if text was preprocessed earlier. If preprocessed
#   dataset is available then it returns this preprocessed dataset. Otherwise
#   it calls function to preprocess the dataset.
#   """
#   reusable_objects = _prepare_reusable_objects()
#   cache_filename = f'preprocessed_dataset_{cache_key}.csv'
#   is_cached = Path(cache_filename).is_file()
#   if is_cached:
#     return _return_preprocessed_dataset_from_cache(cache_filename)
#   else:
#     return _preprocess_text_in_dataset(
#       dataset,
#       cache_filename,
#       reusable_objects
#     )

# def _preprocess_text_in_dataset(
#     dataset: pd.DataFrame,
#     cache_filename: str,
#     reusable_objects: ReusableObjects) -> pd.DataFrame:
#   """Preprocess text in dataset.

#   This functions apply function to preprocess text for every row in column
#   `text` in dataset. Function modifies input argument, so ensure to pass copy
#   of dataset.
#   """
#   modified_dataset = dataset.copy(deep=True)
#   modified_dataset['text'] = modified_dataset['text'].apply(
#     lambda t: preprocess_text(t, *reusable_objects))
#   modified_dataset.to_csv(cache_filename, index=False)
#   return modified_dataset

# def _return_preprocessed_dataset_from_cache(
#     cache_filename: str) -> pd.DataFrame:
#   """Return preprocessed dataset from cache file.

#   This functions warns about using cache, to protect before unintended using
#   of cache for example in case of needed to make new computations. It read
#   data from csv file and save them to DataFrame.
#   """
#   warnings.warn('Using cache for preprocessed dataset')
#   return pd.read_csv(cache_filename)

# def _prepare_reusable_objects(
#     tokenizer: nltk.tokenize.WordPunctTokenizer | None = None,
#     stopwords: nltk.corpus.WordListCorpusReader | None = None,
#     stemmer: nltk.stem.porter.PorterStemmer | None = None,
#     non_words_regex: re.Pattern | None = None
#   ) -> ReusableObjects:
#   """Returns tokenizer, set of stopwords and steemer.

#   This function returns all objects needed to use in this module. Creation of
#   this objects is separated in this function to help avoid unnecessary
#   creation of objects that would slow down calculations.
#   """
#   tokenizer = tokenizer if tokenizer else nltk.tokenize.WordPunctTokenizer()
#   if not stopwords:
#     nltk.download('stopwords')
#   stopwords = (stopwords if stopwords
#                else nltk.corpus.stopwords.words('english'))
#   stemmer = stemmer if stemmer else nltk.stem.porter.PorterStemmer()
#   non_words_regex = re.compile(r'[^\w]\s')
#   return tokenizer, stopwords, stemmer, non_words_regex

# def preprocess_text(
#     text: str,
#     tokenizer: nltk.tokenize.WordPunctTokenizer | None = None,
#     stopwords: nltk.corpus.WordListCorpusReader | None = None,
#     stemmer: nltk.stem.porter.PorterStemmer | None = None,
#     non_words_regex: re.Pattern | None = None
#   ) -> str:
#   """Return preprocessed text.

#   It convert text to lowercase, remove non words, remove stop words and
#   perform stemming.
#   """
#   tokenizer, stopwords, stemmer = _prepare_reusable_objects(
#     tokenizer, stopwords, stemmer, non_words_regex)
#   text = _convert_to_lowercase(text)
#   text = _remove_non_words(text, non_words_regex)
#   tokens = _change_text_to_tokens(text, tokenizer)
#   tokens = _remove_stopwords(tokens, stopwords)
#   tokens = _stem(tokens, stemmer)
#   return _change_tokens_to_text(tokens)

# def _convert_to_lowercase(text: str) -> str:
#   return text.lower()

# def _remove_non_words(text: str, non_words_regex: re.Pattern) -> str:
#   """Remove non words from text.

#   This function removes non words (in the sense of regular expressions)
#   followed by a white mark.
#   """
#   return re.sub('', text)

# def _change_text_to_tokens(
#     text: str, tokenizer: nltk.tokenize.WordPunctTokenizer | None = None,
#   ) -> list[str]:
#   """Split text to tokens.

#   It divides given text into substrings by splitting on whitespace and
#   punctation.
#   """
#   return tokenizer.tokenize(text)

# def _remove_stopwords(
#     tokens: list[str], stopwords: nltk.corpus.WordListCorpusReader,
#   ) -> list[str]:
#   """Remove stopwords from list of tokens."""
#   tokens_witkout_stopwords = [t for t in tokens if t not in stopwords]
#   return tokens_witkout_stopwords

# def _stem(
#     tokens: list[str], stemmer: nltk.stem.porter.PorterStemmer
#   ) -> list[str]:
#   """Perform stemming on every word (token) in a lit of tokens.

#   Stemming is a process of reducing words to their base or root form.
#   """
#   return [stemmer.stem(t) for t in tokens]

# def _change_tokens_to_text(tokens: list[str]) -> str:
#   """Change list of tokens to one text."""
#   return ' '.join(tokens)

### Index Creator

In [15]:
"""This module create index."""

import pyterrier
import warnings
from pathlib import Path

def create_index(dataset: pd.DataFrame, cache_key: str) -> pyterrier.IndexRef:
  """Return an index for `dataset`, reusing one saved on disk when present.

  PyTerrier is initialised first. The index lives in the directory
  `./index_dataset_<cache_key>`, relative to the current working directory.
  When that directory already exists the index is loaded from it; otherwise
  a new index is built there from `dataset`.
  """
  _initialize_pyterrier()
  index_dir = f'./index_dataset_{cache_key}'
  if not Path(index_dir).is_dir():
    return _create_index(dataset, index_dir)
  return _return_index_from_cache(index_dir)

def _create_index(dataset: pd.DataFrame, cache_path: str) -> pyterrier.IndexRef:
  """Index the documents of `dataset` and store the index at `cache_path`.

  A `pyterrier.DFIndexer` (created with `verbose=True`) receives the `text`
  column as the document texts and the `docno` column as the document ids.
  An index is a data structure that allows efficient search in the dataset;
  building it is what is called indexing.
  """
  df_indexer = pyterrier.DFIndexer(cache_path, verbose=True)
  texts, docnos = dataset['text'], dataset['docno']
  return df_indexer.index(texts, docnos)

def _return_index_from_cache(cache_path: str) -> pyterrier.IndexRef:
  """Load a previously built index from `cache_path`.

  A warning is emitted first so that reuse of cached data is never silent,
  for example when a freshly computed index was actually wanted.

  NOTE(review): the value comes from `pyterrier.IndexFactory.of`, which
  presumably yields a loaded index object rather than an `IndexRef` - confirm
  that the return annotation is accurate.
  """
  warnings.warn('Using cache for index')
  return pyterrier.IndexFactory.of(cache_path)

def _initialize_pyterrier() -> None:
  """Start PyTerrier unless it has been started already.

  PyTerrier relies on the Java-based Terrier library, so it has to be
  initialised before use. The call is skipped when `pyterrier.started()`
  reports that this has already happened, which makes the function safe to
  call repeatedly.
  """
  if pyterrier.started():
    return
  pyterrier.init()


### Search Performer

In [16]:
"""This module perform search."""

import pandas as pd
import pyterrier
from typing import Literal

def search_interactive_using_bm25(
    index_ref: pyterrier.IndexRef, dataset: pd.DataFrame,
  ) -> None:
  """Perform search based on user input using BM25 weighting model.

  The query is read with `input()` and handed to `_search` together with
  `dataset`. The value produced by `_search` is not passed on, so this
  function itself returns `None`.
  """
  query = input('Input your query: ')
  _search(index_ref, "BM25", dataset, query)

def search_interactive_using_tfidf(
    index_ref: pyterrier.IndexRef, dataset: pd.DataFrame,
  ) -> None:
  """Perform search based on user input using TF_IDF weighting model.

  The query is read with `input()` and handed to `_search` together with
  `dataset`. The value produced by `_search` is not passed on, so this
  function itself returns `None`.
  """
  query = input('Input your query: ')
  _search(index_ref, "TF_IDF", dataset, query)

def _search(
    index_ref: pyterrier.IndexRef,
    weighting_model: Literal["BM25", "TF_IDF"],
    dataset: pd.DataFrame,
    query: str,
  ):
  """Run one query against the index and present the hits.

  The query text is used exactly as given; no preprocessing is applied to it.
  A retriever for `weighting_model` is obtained from `get_retriever` and its
  `search` method is called with the query.

  Returns:
    Whatever `_prepare_readable_results` returns for the hits and `dataset`.
  """
  hits = get_retriever(index_ref, weighting_model).search(query)
  return _prepare_readable_results(hits, dataset)

def get_retriever(
    index_ref: pyterrier.IndexRef, weighting_model: Literal["BM25", "TF_IDF"],
  ) -> pyterrier.BatchRetrieve:
  """Return a retriever that performs search in the given index.

  Args:
    index_ref: the index to search in
    weighting_model: passed to the retriever as `wmodel`

  Returns:
    A `pyterrier.BatchRetrieve` created with `verbose=True`.
  """
  return pyterrier.BatchRetrieve(
    index_ref, wmodel=weighting_model, verbose=True)

def _prepare_readable_results(
    results: pd.DataFrame, dataset: pd.DataFrame,
  ) -> pd.DataFrame:
  """Display readable results.

  It merges Dataframe with first 50 results with dataset Dataframe. Merge is
  made by `docno` column from `results` and `_id` from `dataset`. It merge by
  left join which means that all rows from left Dataframe are included in the
  merged result. It displays result of merging.
  """
  readable_results = pd.merge(results[:10], dataset, left_on='docno',
                              right_on='docno', how='left')
  display(readable_results)

### Search Engine Evaluator

In [17]:
"""This module evaluate search engine."""

import pyterrier
import statistics
from typing import List

def evaluate_search_engine(
    index_ref: pyterrier.IndexRef,
    weighting_model: Literal["BM25", "TF_IDF"],
    queries_dataset: pd.DataFrame,
    qrels_dataset: pd.DataFrame,
):
  """Evaluate one weighting model on the given queries and qrels.

  Precision and recall are computed by `_calculate_precision_and_recall`;
  the result is then passed to `_calculate_f1_score`, whose return value is
  returned unchanged.
  """
  return _calculate_f1_score(
    _calculate_precision_and_recall(
      index_ref, weighting_model, queries_dataset, qrels_dataset))

def _calculate_precision_and_recall(
    index_ref: pyterrier.IndexRef,
    weighting_model: Literal["BM25", "TF_IDF"],
    queries_dataset: pd.DataFrame,
    qrels_dataset: pd.DataFrame
):
  """Run a PyTerrier experiment measuring precision and recall.

  A single retriever for `weighting_model` is evaluated on `queries_dataset`
  against `qrels_dataset` with the metrics `'P'` and `'recall'`. The value
  returned by `pyterrier.Experiment` is passed back unchanged.
  """
  systems = [get_retriever(index_ref, weighting_model)]
  metrics = ['P', 'recall']
  return pyterrier.Experiment(
      systems, queries_dataset, qrels_dataset,
      eval_metrics=metrics, verbose=True)

def _calculate_f1_score(precision_and_recall_results: pd.DataFrame):
  calculate_metric_at = [5, 10, 15, 20, 30, 100, 200, 500, 1000]
  for k in calculate_metric_at:
    precision_and_recall_results[f'F1@{k}'] = (
      precision_and_recall_results.apply(
        lambda r: _calculate_f1_score_for_row(r, k), axis=1))
  return precision_and_recall_results

def _calculate_f1_score_for_row(r: pd.Series, k: int):
  return statistics.harmonic_mean([r[f'P@{k}'], r[f'R@{k}']])

### Main

In [18]:
cd_drive_directory('MyDrive/Udostępnione/20240120.223123_KF_IR.SearchEngine3/SearchEngine/')

Mounted at /content/drive
/content/drive/MyDrive/Udostępnione/20240120.223123_KF_IR.SearchEngine3/SearchEngine


In [19]:
quora_corpus, quora_queries, quora_qrels = get_beir_quora_dataset()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/25.3M [00:00<?, ?B/s]

Generating corpus split:   0%|          | 0/522931 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/664k [00:00<?, ?B/s]

Generating queries split:   0%|          | 0/15000 [00:00<?, ? examples/s]

  'query': dataset['text'].str.replace(r'[/?()\\#\'{}^!\":*]', ''),


Downloading readme:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/256k [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [20]:
index_ref = create_index(quora_corpus, 'quora_corpus')

terrier-assemblies 5.8 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



  0%|          | 0/522931 [00:00<?, ?documents/s]

22:28:45.956 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 218 empty documents


## Interactive part

In [None]:
search_interactive_using_bm25(index_ref, quora_corpus)

Input your query: Which question should I ask on Quora


BR(BM25):   0%|          | 0/1 [00:00<?, ?q/s]

Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1,182493,188177,0,25.307848,Which question should I ask on Quora,How do I ask a question on Quora and what shou...
1,1,370273,381367,1,25.130616,Which question should I ask on Quora,How-to Question: How do I ask a question on Qu...
2,1,44,45,2,24.754331,Which question should I ask on Quora,What are the questions should not ask on Quora?
3,1,3589,3703,3,24.754331,Which question should I ask on Quora,How many questions have you asked on Quora?
4,1,3590,3704,4,24.754331,Which question should I ask on Quora,How many questions have been asked on Quora?
5,1,30622,31586,5,24.754331,Which question should I ask on Quora,How do I ask questions on Quora?
6,1,40291,41568,6,24.754331,Which question should I ask on Quora,How do I ask someone a question on Quora?
7,1,60251,62138,7,24.754331,Which question should I ask on Quora,"Is there any question, no one asked in Quora?"
8,1,61784,63729,8,24.754331,Which question should I ask on Quora,How so I ask questions on Quora?
9,1,68209,70356,9,24.754331,Which question should I ask on Quora,Where do you ask questions in Quora?


In [None]:
search_interactive_using_tfidf(index_ref, quora_corpus)

Input your query: Which question should I ask on Quora


BR(TF_IDF):   0%|          | 0/1 [00:00<?, ?q/s]

Unnamed: 0,qid,docid,docno,rank,score,query,text
0,1,182493,188177,0,13.861069,Which question should I ask on Quora,How do I ask a question on Quora and what shou...
1,1,370273,381367,1,13.765924,Which question should I ask on Quora,How-to Question: How do I ask a question on Qu...
2,1,44,45,2,13.560058,Which question should I ask on Quora,What are the questions should not ask on Quora?
3,1,3589,3703,3,13.560058,Which question should I ask on Quora,How many questions have you asked on Quora?
4,1,3590,3704,4,13.560058,Which question should I ask on Quora,How many questions have been asked on Quora?
5,1,30622,31586,5,13.560058,Which question should I ask on Quora,How do I ask questions on Quora?
6,1,40291,41568,6,13.560058,Which question should I ask on Quora,How do I ask someone a question on Quora?
7,1,60251,62138,7,13.560058,Which question should I ask on Quora,"Is there any question, no one asked in Quora?"
8,1,61784,63729,8,13.560058,Which question should I ask on Quora,How so I ask questions on Quora?
9,1,68209,70356,9,13.560058,Which question should I ask on Quora,Where do you ask questions in Quora?


## Part with long calculations

In [21]:
bm25_evaluation_results = evaluate_search_engine(
  index_ref, "BM25", quora_queries, quora_qrels)
display(bm25_evaluation_results.transpose())

pt.Experiment:   0%|          | 0/1 [00:00<?, ?system/s]

BR(BM25):   0%|          | 0/15000 [00:00<?, ?q/s]

Unnamed: 0,0
name,BR(BM25)
P@5,0.20316
P@10,0.1137
P@15,0.079493
P@20,0.06154
P@30,0.04222
P@100,0.013614
P@200,0.007024
P@500,0.002896
P@1000,0.001467


In [22]:
tfidf_evaluation_results = evaluate_search_engine(
  index_ref, "TF_IDF", quora_queries, quora_qrels)
display(tfidf_evaluation_results.transpose())

pt.Experiment:   0%|          | 0/1 [00:00<?, ?system/s]

BR(TF_IDF):   0%|          | 0/15000 [00:00<?, ?q/s]

Unnamed: 0,0
name,BR(TF_IDF)
P@5,0.20324
P@10,0.11364
P@15,0.079467
P@20,0.06148
P@30,0.042207
P@100,0.013598
P@200,0.00702
P@500,0.002896
P@1000,0.001467


## Word2Vec

In [23]:
!pip install gensim



In [25]:
# Map every query id to the list of document ids that the qrels list for it.
query_corpus_dict = quora_qrels.groupby('qid')['docno'].apply(lambda ids: [str(id) for id in ids]).to_dict()

# The first 10 (query id, document ids) entries of the mapping above.
subset_query_corpus_dict = dict(list(query_corpus_dict.items())[:10])


In [26]:
import gensim.downloader as api
from gensim.parsing.preprocessing import preprocess_string
import pandas as pd

model = api.load('word2vec-google-news-300')



In [27]:
def find_similar_documents(query_id, quora_queries, quora_corpus, model, threshold=0.5):
    """Return ids of corpus documents that are similar enough to a query.

    The query and every document are tokenised with `preprocess_string`, and
    tokens unknown to `model` (not in `model.key_to_index`) are dropped.
    Documents are scored with `model.n_similarity`. The scores are then
    divided by the best score, and documents whose relative score reaches
    `threshold` are returned in corpus order.

    Args:
        query_id: value looked up in the `qid` column of `quora_queries`
        quora_queries: frame with the columns `qid` and `query`
        quora_corpus: frame with the columns `docno` and `text`
        model: object providing `key_to_index` and `n_similarity`
        threshold: minimum score relative to the best-scoring document

    Returns:
        List of `docno` values. It is empty when the query has no known
        token, when no document could be scored, or when the best score is
        not positive.

    Raises:
        IndexError: if `query_id` does not occur in `quora_queries`.
    """
    query_text = quora_queries[quora_queries['qid'] == query_id]['query'].iloc[0]
    query_tokens = [token for token in preprocess_string(query_text)
                    if token in model.key_to_index]
    # Checked once up front: without a known query token no document can be
    # scored, so there is no point in tokenising the whole corpus.
    if not query_tokens:
        return []

    scored_documents = []
    for doc_id, doc_text in zip(quora_corpus['docno'], quora_corpus['text']):
        doc_tokens = [token for token in preprocess_string(doc_text)
                      if token in model.key_to_index]
        if doc_tokens:
            scored_documents.append(
                (doc_id, model.n_similarity(query_tokens, doc_tokens)))

    if not scored_documents:
        return []

    max_sim = max(sim for _, sim in scored_documents)
    # A zero maximum would divide by zero and a negative one would flip the
    # comparison below (the least similar documents would pass), so in both
    # cases nothing counts as similar.
    if max_sim <= 0:
        return []

    return [doc_id for doc_id, sim in scored_documents
            if sim / max_sim >= threshold]

similar_documents = find_similar_documents('46', quora_queries, quora_corpus, model, threshold=0.5)
print(f"Documents with similarity >= 0.5 to query ID 46: {similar_documents}")


Documents with similarity >= 0.5 to query ID 46: ['37', '38', '45', '76', '241', '242', '319', '320', '549', '886', '952', '953', '966', '967', '1122', '1123', '1219', '1256', '1257', '1357', '1358', '1359', '1360', '1401', '1402', '1438', '1439', '1513', '1602', '1717', '1718', '1752', '1781', '1782', '1927', '1928', '2209', '2210', '2366', '2367', '2374', '2375', '2602', '2603', '2782', '2860', '2861', '2929', '2930', '3029', '3097', '3210', '3211', '3228', '3229', '3279', '3307', '3308', '3332', '3373', '3374', '3430', '3468', '3515', '3575', '3576', '3613', '3614', '3703', '3704', '3784', '3785', '3804', '3805', '3857', '3858', '4041', '4115', '4212', '4213', '4329', '4330', '4407', '4408', '4411', '4412', '4476', '4583', '4625', '4626', '4702', '4703', '4762', '4901', '4902', '4950', '4951', '4971', '4972', '4979', '4980', '5041', '5042', '5108', '5109', '5152', '5153', '5264', '5265', '5284', '5285', '5287', '5348', '5632', '5633', '5656', '5787', '5788', '5827', '5828', '5882', 

In [None]:
# Run the Word2Vec retrieval for every query id and collect the returned
# document ids per query.
# NOTE(review): every call to `find_similar_documents` iterates over the whole
# corpus, so this loop presumably takes very long on the full query set -
# consider restricting the queries before running it.
search_results = {}

for query_id in quora_queries['qid'].unique():
    retrieved_doc_ids = find_similar_documents(query_id, quora_queries, quora_corpus, model, threshold=0.5)

    search_results[query_id] = retrieved_doc_ids


# Report, per query, either the retrieved ids or that nothing was found.
for query_id, doc_ids in search_results.items():
    if doc_ids:
        print(f"Document IDs retrieved for query ID {query_id} with similarity >= 0.5: {doc_ids}")
    else:
        print(f"No documents found with similarity >= 0.5 for query ID {query_id}")


In [28]:
import pickle
import os

# Cache of `search_results`: written when the file is missing, read otherwise.
pickle_filename = 'data.pkl'
if not os.path.exists(pickle_filename):
    # Serialise before opening the file: if `search_results` is undefined or
    # cannot be pickled, the error is raised before anything is created on
    # disk, so a failed run never leaves an empty or truncated cache file that
    # would break the load branch on the next run.
    serialized_results = pickle.dumps(search_results)
    with open(pickle_filename, 'wb') as file:
        file.write(serialized_results)
else:
    # NOTE: unpickling can execute arbitrary code - only load a `data.pkl`
    # that was produced by this notebook.
    with open(pickle_filename, 'rb') as file:
        search_results = pickle.load(file)

In [29]:
# Micro-averaged evaluation: the counts are summed over all queries first and
# the metrics are computed once from the totals.
true_positives = false_positives = false_negatives = 0

for query_id, retrieved_doc_ids in search_results.items():
    # A query without an entry in `query_corpus_dict` has no expected
    # documents, so everything retrieved for it counts as a false positive.
    expected_doc_ids = set(query_corpus_dict.get(query_id, []))
    retrieved_doc_ids_set = set(retrieved_doc_ids)

    true_positives += len(retrieved_doc_ids_set & expected_doc_ids)
    false_positives += len(retrieved_doc_ids_set - expected_doc_ids)
    false_negatives += len(expected_doc_ids - retrieved_doc_ids_set)

retrieved_total = true_positives + false_positives
expected_total = true_positives + false_negatives

# Each metric falls back to 0 when its denominator is 0.
precision = true_positives / retrieved_total if retrieved_total > 0 else 0
recall = true_positives / expected_total if expected_total > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1_score:.4f}")


Precision: 0.0002
Recall: 0.8857
F1 Score: 0.0004
