In [1]:
# Input locations for the experiment (absolute paths on a local B: drive).
# NOTE(review): these are machine-specific; consider deriving them from one configurable base directory.
path_queries = r"B:\document_parser\document_parses\topics-rnd5.xml"  # topics XML passed to load_queries
path_texts = "B:/document_parser/document_parses/pdf_json"  # directory of per-paper JSON files passed to load_dataset
path_test = "B:/document_parser/document_parses/test"  # not referenced in the visible cells
path_judgements = "B:/document_parser/document_parses/judgements.csv"  # judgements file passed to load_judgements

In [2]:
import pandas as pd
import numpy as np
import json
import xmltodict as xtd

import glob
import dask.bag as db
from dask.distributed import Client, progress
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import PorterStemmer
from gensim import corpora
import swifter
from joblib import Parallel, delayed


In [3]:
# Start a Dask distributed client with 6 workers and 2 threads per worker.
client = Client(n_workers=6, threads_per_worker=2)

# Bare expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://127.0.0.1:54416  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 6  Cores: 12  Memory: 17.13 GB


In [4]:
def load_judgements(path_judgements):
    """
    Read the relevance judgements file and add a binarised relevance column.

    Parameters
    ----------
    path_judgements : path of the space-delimited judgements file

    Returns
    -------
    judgements : pandas DataFrame with the columns "query", "document" and
        "score" (file columns 0, 2 and 3) plus "binary_score", which is 0
        where score < 1 and 1 where score >= 1

    """
    column_names = ["query", "document", "score"]
    judgements = pd.read_csv(
        path_judgements,
        sep=' ',
        usecols=[0, 2, 3],
        names=column_names,
    )

    # Anything scored below 1 counts as not relevant ...
    not_relevant = judgements['score'] < 1
    judgements.loc[not_relevant, 'binary_score'] = 0

    # ... and every score of 1 or more counts as relevant.
    relevant = judgements['score'] >= 1
    judgements.loc[relevant, 'binary_score'] = 1

    return judgements

# Load the relevance judgements and preview the first rows.
judgments = load_judgements(path_judgements)
judgments.head()

Unnamed: 0,query,document,score,binary_score
0,1,005b2j4b,2,1.0
1,1,00fmeepz,1,1.0
2,1,010vptx3,2,1.0
3,1,0194oljo,1,1.0
4,1,021q9884,1,1.0


In [5]:
def load_queries(queries_path):
    """
    Receives the path of the queries file and returns a DataFrame containing all the queries.

    Parameters
    ----------
    queries_path : path of the topics XML file

    Returns
    -------
    df : pandas DataFrame indexed by topic number (the "@number" attribute)
        with a single "query" column

    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding. The `with` block closes the file, so no
    # separate close() call is needed.
    with open(queries_path, "r", encoding="utf-8") as xml_file:
        data_dict = xtd.parse(xml_file.read())

    topics = data_dict["topics"]["topic"]
    # xmltodict yields a single mapping instead of a list when the file holds
    # only one <topic>; normalise so the loop below always sees a list.
    if not isinstance(topics, list):
        topics = [topics]

    dic_queries = {topic["@number"]: topic["query"] for topic in topics}

    df = pd.DataFrame.from_dict(dic_queries, orient='index', columns=['query'])

    return df

# Load the topic queries and preview the first rows.
queries = load_queries(path_queries)
queries.head()

Unnamed: 0,query
1,coronavirus origin
2,coronavirus response to weather changes
3,coronavirus immunity
4,how do people die from the coronavirus
5,animal models of COVID-19


In [6]:
def load_document(doc):
    """
    Flatten one parsed paper into a small dictionary of plain strings.

    Parameters
    ----------
    doc : parsed JSON of a single paper; must provide "paper_id",
        "metadata" -> "title", and the paragraph lists "abstract" and
        "body_text" (each paragraph carrying a "text" entry)

    Returns
    -------
    dict : keys "id", "title", "abstract" and "body"; abstract and body are
        the paragraph texts joined with single spaces

    """
    def _join_paragraphs(paragraphs):
        # Concatenate the "text" of every paragraph, space separated.
        return " ".join(part["text"] for part in paragraphs)

    heading = doc["metadata"]["title"]
    summary = _join_paragraphs(doc["abstract"])
    main_text = _join_paragraphs(doc["body_text"])

    return {
        "id": doc["paper_id"],
        "title": heading,
        "abstract": summary,
        "body": main_text,
    }

def load_json(file_path):
    """
    Read one paper's JSON file and reduce it to the fields produced by load_document.

    Parameters
    ----------
    file_path : path of a single paper JSON file

    Returns
    -------
    dict : the dictionary returned by load_document for the parsed file

    """
    # JSON text is UTF-8; open()'s default encoding depends on the platform
    # locale, so name the encoding explicitly to avoid mis-decoded characters.
    with open(file_path, encoding="utf-8") as file:
        file_json = json.load(file)
    return load_document(file_json)

In [7]:
# Path of one sample paper JSON inside path_texts; not referenced in the other visible cells.
filename = path_texts + "/000a0fc8bbef80410199e690191dc3076a290117.json"

In [25]:
def preprocess_document(doc):
    """
    Receives a single document and returns the list of stems built from its body, abstract and title.

    Tokens are produced with wordpunct_tokenize, lowercased, filtered
    (English stop words, tokens of 2 characters or fewer, and purely numeric
    tokens are dropped) and finally stemmed with the Porter stemmer.

    Parameters
    ----------
    doc : a single document exposing `title`, `abstract` and `body`
        attributes (e.g. a row from `DataFrame.itertuples()` or from
        `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    list : stemmed tokens, in body, abstract, title order

    """
    stopset = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Field order matches the original `information` list.
    information = [doc.body, doc.abstract, doc.title]

    final = []
    for text in information:
        # Skip missing fields (None/NaN) instead of failing inside the tokenizer.
        if not isinstance(text, str):
            continue
        for token in wordpunct_tokenize(text):
            lowered = token.lower()
            if lowered in stopset or len(token) <= 2 or token.isnumeric():
                continue
            final.append(stemmer.stem(lowered))
    return final

In [26]:
def load_dataset(path):
    """
    Load every "*.json" file found directly under `path` into one table.

    Parameters
    ----------
    path : directory containing the paper JSON files

    Returns
    -------
    The computed result of a Dask DataFrame with the columns "id", "title",
    "abstract" and "body", one row per file processed by load_json.

    """
    json_paths = glob.glob(path + "/*.json")

    # Parse the files through a Dask bag, then materialise the table.
    parsed_records = db.from_sequence(json_paths).map(load_json)
    frame = parsed_records.to_dataframe(
        columns=["id", "title", "abstract", "body"]
    )
    return frame.compute()

In [27]:
def create_dictionary(dataset):
    """
    Build a gensim Dictionary from the stems of every document and save it to 'vsm.dict'.

    Parameters
    ----------
    dataset : table of documents; rows are read with `itertuples()` and
        passed one by one to preprocess_document

    Returns
    -------
    dictionary : the populated corpora.Dictionary (also written to 'vsm.dict')

    """
    vocabulary = corpora.Dictionary()
    for row in dataset.itertuples():
        # Feed one document at a time so only a single stem list is held in memory.
        stems = preprocess_document(row)
        vocabulary.add_documents([stems])
    vocabulary.save('vsm.dict')
    return vocabulary

In [30]:
def docs2bows(dataset, dictionary):
    """
    Convert every document to a bag-of-words vector and serialize the corpus to 'corpus.mm'.

    Parameters
    ----------
    dataset : table of documents (rows are read with `itertuples()` when
        available) or any other iterable of documents accepted by
        preprocess_document
    dictionary : object providing `doc2bow` (e.g. the result of create_dictionary)

    Returns
    -------
    vectors : list with one bag-of-words vector per document

    """
    # Iterating a DataFrame directly yields its column labels, not its rows,
    # so go through itertuples() (as create_dictionary does). Plain iterables
    # without that method are still iterated as before.
    rows = dataset.itertuples() if hasattr(dataset, "itertuples") else dataset
    vectors = [dictionary.doc2bow(preprocess_document(doc)) for doc in rows]
    corpora.MmCorpus.serialize('corpus.mm', vectors)
    return vectors

In [13]:
# Load every paper JSON under path_texts into a single table.
dataset = load_dataset(path_texts)

In [14]:
# Re-index the table by paper id, rebinding the same name.
# NOTE(review): presumably not re-runnable as-is — after the first run "id" is the index rather than a column; confirm.
dataset = dataset.set_index("id")

In [15]:
# Display the indexed table.
dataset

Unnamed: 0_level_0,title,abstract,body
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001418189999fea7f7cbe3e82703d71c85a6fe5,Absence of surface expression of feline infect...,Feline infectious peritonitis virus (FIPV) pos...,Feline infectious peritonitis (FIP) is a fatal...
0003793cf9e709bc2b9d0c8111186f78fb73fc04,Title: Rethinking high-risk groups in COVID-19,,How do we protect our 'high-risk' patient popu...
000379d7a7f37a2ccb978862b9f2016bd03259ea,ScienceDirect ScienceDirect Effect of Nanomate...,approach. The NM shape in the conformal circui...,Integration of functional electronic devices o...
00039b94e6cb7609ecbddee1755314bcfeb77faa,Plasma inflammatory cytokines and chemokines i...,Severe acute respiratory syndrome (SARS) is a ...,Severe acute respiratory syndrome (SARS) is a ...
0003ddc51c4291d742855e9ac56076a3bea33ad7,Journal Pre-proofs The Fire This Time: The Str...,,It is said that crisis reveals character. The ...
...,...,...,...
fff8b9e88db122ffcbaf1daf6b697e44eaaffd93,Septic shock caused by Mycobacterium tuberculo...,,Sir: Septic shock due to Mycobacterium tubercu...
fffaed7e9353b7df6c4ca8f66b62e117013cb86d,Dengue Virus Glycosylation: What Do We Know?,In many infectious diseases caused by either v...,Most DENV infections are asymptomatic or remai...
fffb268f02887d8680dc611f6fc0b20c489030cb,Emergence of novel coronavirus and progress to...,"In late December 2019, a group of patients was...",Coronaviruses are a form of positive-strand no...
fffc88be66eb39823fc9b50f0683e06a3038c9fe,A fractional-order compartmental model for pre...,We propose a time-fractional compartmental mod...,Fractional differential equations (FDEs) are u...


In [13]:
# Apply preprocess_document to every row (axis=1) of the dataset.
# NOTE(review): this cell's execution count (13) duplicates an earlier cell's and the saved output is a
# Dask metadata warning, so the output looks stale — confirm by re-running from a fresh kernel.
docs = dataset.apply(preprocess_document, axis = 1)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



In [31]:
# Build the vocabulary over the whole dataset (create_dictionary also saves it to 'vsm.dict') and print it.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the saved run did not finish.
dictionary = create_dictionary(dataset)
print(dictionary)

KeyboardInterrupt: 

In [None]:
# Vectorise every document against the dictionary (docs2bows also serializes the corpus to 'corpus.mm').
bow = docs2bows(dataset, dictionary)

In [None]:
# Print the first bag-of-words vector with token ids mapped back through the dictionary, then stop.
for v in bow:
    tvec = [(dictionary[token_id], count) for token_id, count in v]
    print(tvec)
    break