# Information Retrieval Engine

In [1]:
import json
import xmltodict as xtd
import numpy as np
import os
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
from joblib import Parallel, delayed
from gensim import corpora
from gensim import models
import pandas as pd
from gensim import similarities
from operator import itemgetter

## Funciones de carga

In [2]:
path_queries = r"B:\document_parser\document_parses\topics-rnd5.xml"
path_texts = "B:\document_parser\document_parses\pdf_json"
path_test = "B:/document_parser/document_parses/test"
path_judgements = "B:/document_parser/document_parses/judgements.csv"

### Judgements

Lo primero de todo, cargamos el archivo de los judgements. Este archivo contiene 4 columnas pero solo nos interesan la columna del id de la query, id del documento y relevancia. 
Esta relevancia aparece con valores discretos 0, 1 y 2 por lo que tenemos que binarizarla. 

In [3]:
def load_judgements(path_judgements):
    judgements = pd.read_csv(path_judgements, delimiter=' ', names = ["query", "document", "score"], usecols=[0,2,3])
    judgements.loc[judgements['score'] < 1, 'binary_score'] = 0
    judgements.loc[judgements['score'] >=1 , 'binary_score'] = 1
    return judgements

In [4]:
judgements = load_judgements(path_judgements)

### Queries

Después cargamos el archivo de las queries,

In [5]:
def load_queries(queries_path):
    """
    Receives the path of the queries files and returns a dictionary containing all the queries.

    Parameters
    ----------
    queries_path : path of the queries file

    Returns
    -------
    dic_judgements : dictionary

    """
    with open(queries_path, "r") as xml_file:
        data_dict = xtd.parse(xml_file.read())
    xml_file.close()

    dic_queries = {}
    for query in data_dict["topics"]["topic"]:
        dic_queries[query["@number"]] = query["query"]

    df = pd.DataFrame.from_dict(dic_queries, orient='index', columns=['query'])

    return df

In [6]:
queries = load_queries(path_queries)
queries.head()

Unnamed: 0,query
1,coronavirus origin
2,coronavirus response to weather changes
3,coronavirus immunity
4,how do people die from the coronavirus
5,animal models of COVID-19


### Diccionario

Ahora creamos el diccionario con todas las palabras de los documentos.

In [7]:
def load_corpus_single(path_corpus):
    directory = os.fsencode(path_corpus)
    stopset = set(stopwords.words("english"))
    dic = corpora.Dictionary()
    titles_dic = {}
    counter = 1        
    for file in os.listdir(directory):
        print(counter)
        counter+=1
        filename = os.fsdecode(file)
        with open("\\".join([path_corpus, filename])) as file:
            file_json = json.load(file)
        pre_doc = preprocess_document(file_json, stopset)
        titles_dic[file_json["paper_id"]] = pre_doc["title"]
        dic.add_documents([pre_doc["stems"]])
        
    df_titles = pd.DataFrame.from_dict(titles_dic, orient = 'index', columns=['title'])
        
    return df_titles, dic

In [8]:
def preprocess_document(doc, stopset):
    """
    Receives a single document and return a dictionary containing the title and a list of all the stems.

    Parameters
    ----------
    doc : a single document in json.
    stopset : stopset for english

    Returns
    -------
    dict : dictionary having title and stems

    """
    title = np.array([doc["metadata"]["title"]], dtype=str)
    abstract = np.array([paragraph["text"] for paragraph in doc["abstract"]], dtype=str)
    text = np.array([paragraph["text"] for paragraph in doc["body_text"]], dtype=str)
    stemmer = PorterStemmer()
    information = np.concatenate((title, abstract, text))
    tokens = np.concatenate(([wordpunct_tokenize(inf) for inf in information]))
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2 and not token.isnumeric()]
    final = [stemmer.stem(word) for word in clean]
    return {"title": title[0], "stems": final}

In [9]:
titles, dictionary = load_corpus_single(path_test)

In [10]:
titles.head()

Unnamed: 0,title
000a0fc8bbef80410199e690191dc3076a290117,"PfSWIB, a potential chromatin regulator for va..."
000affa746a03f1fe4e3b3ef1a62fdfa9b9ac52a,Correlation between antimicrobial consumption ...
000b0174f992cb326a891f756d4ae5531f2845f7,Full Title: A systematic review of MERS-CoV (M...
000b7d1517ceebb34e1e3e817695b6de03e2fa78,Supplementary Information An eco-epidemiologic...
000bc3d4637530496ac4cb798e10d95709eff92d,FDA Perspectives on Diagnostic Device Clinical...


In [45]:
dictionary.save("vsm.dict")

### Bolsa de palabras

Creamos la bolsa de palabras.

In [46]:
def doc2bows_single(path_corpus, dictionary):
    directory = os.fsencode(path_corpus)
    stopset = set(stopwords.words("english"))
    vectors = []
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        with open("\\".join([path_corpus, filename])) as file:
            file_json = json.load(file)
        pre_doc = preprocess_document(file_json, stopset)
        vectors.append(dictionary.doc2bow(pre_doc["stems"]))
        
    corpora.MmCorpus.serialize("corpus_bow.mm", vectors)
    return vectors

In [15]:
bow = doc2bows_single(path_test, dictionary)

## Modelo

In [16]:
def create_TF_IDF_model(bow):
    tfidf = models.TfidfModel(bow)
    return tfidf

In [17]:
tfidf = create_TF_IDF_model(bow)

## Ejecutar query

In [43]:
def launch_query(model, dictionary, bow, query, titles):
    stopset = set(stopwords.words("english"))
    index = similarities.MatrixSimilarity(bow, num_features=len(dictionary))
    print(index.shape)
    pq = preprocess_query(query, stopset)
    vq = dictionary.doc2bow(pq)
    print(vq)
    qtfidf = model[vq]
    print(qtfidf)
    sim = index[qtfidf]
    print(sim)
    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
    #for doc, score in ranking:
    #    print("[ Score = " + "%.3f" % round(score,3) + " ] " + titles['title'][doc])
    return ranking
        
def preprocess_query(query, stopset):
    stemmer = PorterStemmer()
    tokens = wordpunct_tokenize(query)
    clean = [token.lower() for token in tokens if token.lower() not in stopset and len(token) > 2]
    stems = [stemmer.stem(word) for word in clean]
    return stems

In [44]:
ranking = launch_query(tfidf, dictionary, bow, queries.iloc[1][0], titles)

AttributeError: 'MatrixSimilarity' object has no attribute 'shape'

In [20]:
ranking

[(387, 0.15964654),
 (397, 0.090791956),
 (241, 0.07329515),
 (477, 0.046894297),
 (128, 0.046364),
 (334, 0.041256785),
 (321, 0.041148867),
 (211, 0.035458315),
 (374, 0.0347594),
 (438, 0.0343764),
 (263, 0.03397403),
 (342, 0.033784747),
 (238, 0.03316984),
 (65, 0.032920364),
 (10, 0.030513162),
 (47, 0.029655462),
 (479, 0.028951993),
 (103, 0.028667152),
 (540, 0.028008817),
 (462, 0.026778664),
 (212, 0.026004123),
 (502, 0.02579236),
 (236, 0.02487269),
 (216, 0.024505131),
 (392, 0.024449278),
 (19, 0.02404477),
 (499, 0.023986205),
 (135, 0.023916753),
 (86, 0.023860676),
 (81, 0.023850797),
 (219, 0.023562606),
 (482, 0.02332909),
 (185, 0.022768026),
 (217, 0.022598289),
 (44, 0.022448426),
 (240, 0.022163454),
 (182, 0.022089425),
 (141, 0.021936033),
 (213, 0.021778924),
 (428, 0.021629095),
 (134, 0.0215691),
 (267, 0.021506108),
 (11, 0.021430746),
 (336, 0.021251941),
 (469, 0.021148754),
 (149, 0.021115337),
 (244, 0.020513166),
 (314, 0.020287458),
 (510, 0.02020950

In [27]:
rankings = np.array([launch_query(tfidf, dictionary, bow, queries['query'][index], titles) for index in queries.index])

In [28]:
rankings.shape

(50, 556, 2)

In [29]:
ranking[0][0]

387

In [31]:
[titles.iloc[ranked[0]]["title"] for ranked in ranking[:5]]

['Eco-epidemiological assessment of the COVID-19 epidemic in China',
 'Effectively Communicating Climate Science beyond Academia: Harnessing the Heterogeneity of Climate Knowledge',
 'OPINION PIECE Food systems for resilient futures',
 'Turkish Journal of Biology Integration of transcriptomic profile of SARS-CoV-2 infected normal human bronchial epi-thelial cells with metabolic and protein-protein interaction networks',
 'Extreme Engineering: Polarization in Product Development and Manufacturing']

In [26]:
rankings[0]

array([[2.11000000e+02, 1.23008847e-01],
       [3.63000000e+02, 9.95385200e-02],
       [2.63000000e+02, 9.72091034e-02],
       ...,
       [5.48000000e+02, 0.00000000e+00],
       [5.49000000e+02, 0.00000000e+00],
       [5.51000000e+02, 0.00000000e+00]])