# **DSAI 201 Project:** Basic search engine

## Include any needed libraries and intiating

### Libraries import

In [None]:
!pip install python-terrier

In [131]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pyterrier as pt
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np

### Intiating

In [None]:
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

In [None]:
nltk.download("stopwords")
nltk.download("punkt")

In [None]:
# import os
# os.environ["JAVA_HOME"] = r"D:\Program Files\Java\jdk-22"
if not pt.started():
    pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

## Data Collection

### Documents transforming

In [6]:
with open("./raw data/CISI.ALL") as file_handler:
    file_lines = file_handler.readlines()
data = {"DocID" : [], "Title" : [], "Content" : []}
write_flag = False
text = ""
for i in range(len(file_lines)):
    words = file_lines[i].split()
    if words and words[0] == ".I":
        data["DocID"].append(words[1])
        continue
    elif words and words[0] == ".T":
        write_flag = True
        continue
    elif words and words[0] == ".W":
        write_flag = True
        continue
    elif words and words[0] == ".A" and write_flag:
        data["Title"].append(text.strip())
        text = ""
        write_flag = False
    elif words and words[0] == ".X":
        data["Content"].append(text.strip())
        text = ""
        write_flag = False
    if write_flag:
        text += file_lines[i].strip() + ' '

In [7]:
docs_df = pd.DataFrame(data)
docs_df.to_csv("./data/documents.csv", index = False)
docs_df.head()

Unnamed: 0,DocID,Title,Content
0,1,18 Editions of the Dewey Decimal Classifications,The present study is a history of the DEWEY De...
1,2,Use Made of Technical Libraries,This report is an analysis of 6300 acts of use...
2,3,Two Kinds of Power An Essay on Bibliographic C...,The relationships between the organization and...
3,4,Systems Analysis of a University Library; fina...,The establishment of nine new universities in ...
4,5,A Library Management Game: a report on a resea...,Although the use of games in professional educ...


### Topics transformation

In [8]:
with open("./raw data/CISI.QRY") as file_handler:
    file_lines = file_handler.readlines()
data = {"QueryID" : [], "Text" : []}
text = ""
for i in range(len(file_lines)):
    words = file_lines[i].split()
    if words and words[0] == ".I":
        data["QueryID"].append(words[1])
        if text:
            data["Text"].append(text)
        continue
    elif words and words[0] == ".W":
        text = ""
        continue
    text += file_lines[i].strip() + ' '
data["Text"].append(text)

In [9]:
querys_df = pd.DataFrame(data)
querys_df.to_csv("./data/topics.csv", index = False)
querys_df.head()

Unnamed: 0,QueryID,Text
0,1,What problems and concerns are there in making...
1,2,"How can actually pertinent data, as opposed to..."
2,3,What is information science? Give definitions...
3,4,Image recognition and any other methods of aut...
4,5,What special training will ordinary researcher...


### Qrels transformation

In [10]:
qrels_df = pd.read_csv("./raw data/CISI.REL", sep = "\t", header=None)
qrels_df = qrels_df.iloc[:, [0]]
qrels_df["QueryID"] = qrels_df[0].apply(lambda x : x.split()[0])
qrels_df["DocID"] = qrels_df[0].apply(lambda x : x.split()[1])
qrels_df = qrels_df.drop(columns=[0])
qrels_df.to_csv("./data/qrels.csv", index = False)
qrels_df.head()

Unnamed: 0,QueryID,DocID
0,1,28
1,1,35
2,1,38
3,1,42
4,1,43


## Preprocessing

### Define preprocessing function

In [11]:
s_words = stopwords.words("english")
stemmer = PorterStemmer()
def preprocess(document: str):
    temp = document.lower()
    temp = word_tokenize(temp)
    temp = [stemmer.stem(token) for token in temp if not token in s_words]
    return temp

### Preprocess documents

In [12]:
docs_df["preprocessed_text"] = docs_df["Content"].apply(preprocess)
docs_df.head()

Unnamed: 0,DocID,Title,Content,preprocessed_text
0,1,18 Editions of the Dewey Decimal Classifications,The present study is a history of the DEWEY De...,"[present, studi, histori, dewey, decim, classi..."
1,2,Use Made of Technical Libraries,This report is an analysis of 6300 acts of use...,"[report, analysi, 6300, act, use, 104, technic..."
2,3,Two Kinds of Power An Essay on Bibliographic C...,The relationships between the organization and...,"[relationship, organ, control, write, organ, c..."
3,4,Systems Analysis of a University Library; fina...,The establishment of nine new universities in ...,"[establish, nine, new, univers, 1960, 's, prov..."
4,5,A Library Management Game: a report on a resea...,Although the use of games in professional educ...,"[although, use, game, profession, educ, becom,..."


## Indexing

### Build inverted index

In [13]:
docs_df["docno"] = docs_df["DocID"]
indexer = pt.DFIndexer(r"./index", overwrite=True)
index_ref = indexer.index(docs_df["Content"], docs_df["docno"])
index = pt.IndexFactory.of(index_ref)

### Construct the data structure that map unique words to the documents they ocurred in

In [14]:
words_docs = dict()
for row in docs_df[["DocID", "preprocessed_text"]].values:
    for term in row[1]:
        if not term in words_docs:
            words_docs[term] = set()
        words_docs[term].add(row[0])
print(words_docs)

{'present': {'209', '835', '839', '394', '1048', '69', '1051', '367', '999', '612', '1187', '1267', '282', '413', '475', '204', '733', '1429', '611', '352', '1450', '557', '1423', '1162', '884', '1360', '1460', '26', '1245', '735', '712', '403', '143', '111', '508', '1202', '1', '398', '416', '1061', '1274', '384', '725', '429', '657', '900', '913', '98', '1438', '566', '1175', '1218', '1365', '1308', '1421', '734', '100', '471', '819', '470', '1397', '316', '1350', '909', '312', '1106', '1439', '627', '206', '1215', '1385', '963', '1130', '630', '1374', '552', '1356', '118', '440', '850', '915', '1024', '459', '932', '193', '625', '776', '842', '1276', '1194', '1381', '457', '81', '335', '446', '1031', '1323', '1331', '1075', '1221', '1457', '240', '1254', '739', '1241', '1410', '1363', '771', '1082', '534', '593', '1240', '1000', '89', '396', '961', '1053', '476', '397', '191', '640', '873', '1025', '1156', '368', '730', '1046', '1427', '1064', '248', '321', '717', '1173', '496', '81

In [15]:
print("Documents that contain 'call':", words_docs["call"])

Documents that contain 'call': {'1309', '350', '786', '92', '173', '660', '965', '571', '1126', '197', '91', '734', '172', '1173', '632', '803', '1119', '1266', '96', '263', '964', '128', '1221', '1073', '61', '548', '674', '322', '758', '1148', '22', '169', '1224', '400', '778', '226', '25', '1197', '478', '256', '180', '645', '6', '104', '1201', '737', '926', '160', '1244', '1233', '1265', '933', '417', '361', '445', '934', '610', '547', '894', '656', '121', '1385'}


### Maintaining a list for each term of document IDs where the term appears along with the frequency of occurrence.

In [16]:
words_docs_freq = dict()
for row in docs_df[["DocID", "preprocessed_text"]].values:
    for term in row[1]:
        if not term in words_docs_freq:
            words_docs_freq[term] = dict()
        if not row[0] in words_docs_freq[term]:
            words_docs_freq[term][row[0]] = 0
        words_docs_freq[term][row[0]] += 1
print(words_docs_freq)

{'present': {'1': 1, '7': 2, '14': 1, '17': 3, '18': 1, '26': 1, '47': 1, '69': 1, '74': 1, '75': 1, '81': 1, '89': 1, '96': 1, '97': 1, '98': 1, '100': 1, '111': 1, '112': 1, '114': 1, '118': 1, '122': 1, '131': 1, '134': 1, '142': 1, '143': 2, '146': 1, '151': 1, '152': 1, '153': 2, '156': 1, '161': 1, '164': 1, '185': 1, '186': 1, '191': 2, '193': 1, '202': 1, '204': 1, '206': 1, '209': 1, '217': 2, '228': 1, '229': 1, '239': 2, '240': 1, '246': 1, '248': 1, '250': 1, '264': 1, '282': 1, '306': 1, '309': 1, '311': 1, '312': 1, '316': 1, '318': 1, '320': 1, '321': 1, '327': 1, '331': 1, '335': 2, '339': 3, '342': 1, '348': 1, '349': 2, '351': 1, '352': 3, '353': 1, '359': 1, '367': 2, '368': 1, '382': 2, '384': 1, '390': 1, '393': 2, '394': 1, '396': 1, '397': 1, '398': 1, '399': 4, '403': 1, '404': 1, '413': 3, '414': 1, '416': 1, '417': 1, '418': 1, '419': 1, '423': 1, '425': 1, '426': 1, '428': 1, '429': 1, '440': 1, '445': 1, '446': 1, '457': 2, '459': 1, '461': 2, '470': 1, '471

In [17]:
print("Frequency of 'run' in each document ocurred in:", words_docs_freq["run"])

Frequency of 'run' in each document ocurred in: {'164': 1, '197': 2, '491': 1, '506': 1, '512': 1, '581': 4, '609': 1, '661': 1, '696': 1, '1073': 1, '1091': 1, '1230': 1, '1402': 2}


## Query processing

### Get query from the user

In [18]:
query = input("Enter your query: ")
print("The raw query:", query)

Enter your query: relationship between the quantity and quality
The raw query: relationship between the quantity and quality


### Preprocess query

In [19]:
preprocessed_query = preprocess(query)
print("Preprocessed query:", preprocessed_query)

Preprocessed query: ['relationship', 'quantiti', 'qualiti']


### Results retrieval using TFIDF

In [20]:
tfidf_retriever = pt.BatchRetrieve(index, wmodel = "TF_IDF", num_results = 5)
tfidf_results = tfidf_retriever.search(query)
tfidf_results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,101,102,0,11.918816,relationship between the quantity and quality
1,1,62,63,1,8.193977,relationship between the quantity and quality
2,1,717,718,2,8.193977,relationship between the quantity and quality
3,1,81,82,3,7.861952,relationship between the quantity and quality
4,1,112,113,4,7.603606,relationship between the quantity and quality


In [21]:
results_documents = docs_df[docs_df["docno"].isin(tfidf_results["docno"])]
for i in range(len(results_documents)):
    print(f"[{i + 1}] {results_documents.iloc[i, 1]}")
    print(results_documents.iloc[i, 2])
    print()

[1] The Publication Inflation
The much-vaunted information explosion seems to be the manifestation of a rather generalized publication inflation.. Multipublication and pretentious writing have become obstacles to "communication between human minds". To counteract their harmful effects it will be necessary to reduce the quantity and improve the quality of the printed records by filtering the material before it is stored for retrieval..

[2] Is Interindexer Consistency A Hobgoblin?
It is often assumed that the amount of interindexer consistency experienced under a given method of indexing is somehow indicative of the quality of the indexing.. To explore this assumption, two hypotheses are stated concerning the possible connection between interindexer consistency and indexing quality.. A specific counter-example is then exhibited which shows both hypotheses to be invalid.. Although a mathematical analysis of the counterexample yields certain insights, the general relationship between inte

## Query expansion

### RM3 query expansion

In [45]:
rm3_expander = pt.rewrite.RM3(index)
qe_tfidf = tfidf_retriever >> rm3_expander
expanded_q = qe_tfidf.search(query).iloc[0, 2]
expanded_q

'applypipeline:off quantiti^0.277224213 improv^0.024199286 qualiti^0.277224213 product^0.033807825 produc^0.028825624 physicist^0.038434163 depart^0.028825624 scientist^0.033807825 relationship^0.200000018 output^0.028825624 signific^0.028825624'

In [46]:
expanded_q = expanded_q.split()
expanded_q = " ".join([expanded_q[i].split("^")[0] for i in range(len(expanded_q)) if i])
print("Expanded query:", expanded_q)

Expanded query: quantiti improv qualiti product produc physicist depart scientist relationship output signific


In [47]:
tfidf_results = tfidf_retriever.search(expanded_q)
tfidf_results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,101,102,0,33.568471,quantiti improv qualiti product produc physici...
1,1,103,104,1,18.785839,quantiti improv qualiti product produc physici...
2,1,717,718,2,13.604498,quantiti improv qualiti product produc physici...
3,1,107,108,3,13.177068,quantiti improv qualiti product produc physici...
4,1,112,113,4,12.518889,quantiti improv qualiti product produc physici...


In [48]:
results_documents = docs_df[docs_df["docno"].isin(tfidf_results["docno"])]
for i in range(len(results_documents)):
    print(f"[{i + 1}] {results_documents.iloc[i, 1]}")
    print(results_documents.iloc[i, 2])
    print()

[1] Scientific Output and Recognition: A Study in the Operation of the Reward System in Science
The relationship between the quantity and quality of scientific output of 120 university physicists was studied.. Although these two variables are highly correlated, some physicists produce many papers of little significance and other produce a few papers of great significance.. The responses of the community of physicists to these distinct patterns of research publication were investigated.. Quality of output is more significant than quantity in eliciting recognition through the receipt of awards, appointment to prestigious academic departments, and being widely known to one's colleagues.. The reward system operates to encourage creative scientists to be highly productive, to divert the energies of less creative physicists into other channels, and to produce a higher correlation between quantity and quality of output in the top departments than in the weaker departments..

[2] Visibility an

### Enhancing using ELMo

In [50]:
sentences = [query]
for i in range(len(results_documents)):
  sentences.append(results_documents.iloc[i, 2])
print(sentences)

['relationship between the quantity and quality', "The relationship between the quantity and quality of scientific output of 120 university physicists was studied.. Although these two variables are highly correlated, some physicists produce many papers of little significance and other produce a few papers of great significance.. The responses of the community of physicists to these distinct patterns of research publication were investigated.. Quality of output is more significant than quantity in eliciting recognition through the receipt of awards, appointment to prestigious academic departments, and being widely known to one's colleagues.. The reward system operates to encourage creative scientists to be highly productive, to divert the energies of less creative physicists into other channels, and to produce a higher correlation between quantity and quality of output in the top departments than in the weaker departments..", 'The paper contains an analysis of several aspects of the com

In [53]:
elmo = hub.load("https://tfhub.dev/google/elmo/3")
embeddings = elmo.signatures["default"](tf.constant(sentences))["elmo"]

In [55]:
def cosine_similarity(emb1: np.ndarray, emb2: np.ndarray):
  centroid_1 = emb1.sum(axis = 0) / len(emb1)
  centroid_2 = emb2.sum(axis = 0) / len(emb2)
  return np.dot(centroid_1, centroid_2) / np.linalg.norm(centroid_1) / np.linalg.norm(centroid_2)

In [59]:
score_document = []
for i in range(1, len(sentences)):
  score_document.append((cosine_similarity(embeddings[0].numpy(), embeddings[i].numpy()), sentences[i]))
score_document.sort(key = lambda x : x[0], reverse = True)

In [60]:
for i in range(len(score_document)):
  print(f"[{i + 1}] Score: {score_document[i][0]}")
  print(score_document[i][1])
  print()

[1] Score: 0.5443784594535828
Critical reviews are examined in terms of the quality and quantity of their present production and the measure of their value and utility to scientists in meeting information needs..The paper presented the viewpoint of user, sponsor, author, and editor, and discuss present problems and possible future solutions..

[2] Score: 0.22279678285121918
The relationship between the quantity and quality of scientific output of 120 university physicists was studied.. Although these two variables are highly correlated, some physicists produce many papers of little significance and other produce a few papers of great significance.. The responses of the community of physicists to these distinct patterns of research publication were investigated.. Quality of output is more significant than quantity in eliciting recognition through the receipt of awards, appointment to prestigious academic departments, and being widely known to one's colleagues.. The reward system operate

## UI

The UI is seperated in UI folder

## Evaluation

In [121]:
qrels = pd.DataFrame()
qrels["qid"] = qrels_df["QueryID"].iloc[:51]
qrels["docno"] = qrels_df["DocID"].iloc[:51]
qrels["label"] = qrels["qid"].apply(lambda x : 1).iloc[:51]
topics = pd.DataFrame()
topics["qid"] = querys_df["QueryID"].iloc[:51]
topics["query"] = querys_df['Text'].apply(lambda x : x.replace("?", "")).iloc[:51]

In [130]:
res = tfidf_retriever.transform(topics)
print("Accuracy:", pt.Evaluate(res, qrels, metrics = ["P"]))

Accuracy: {'P@5': 0.3, 'P@10': 0.15, 'P@15': 0.1, 'P@20': 0.075, 'P@30': 0.05, 'P@100': 0.015, 'P@200': 0.0075, 'P@500': 0.003, 'P@1000': 0.0015}


In [129]:
%%timeit
res = tfidf_retriever.transform(topics)

104 ms ± 33.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
