# Getting Started

In [6]:
# Connect the Colab runtime to Google Drive: the Drive contents become visible
# under the /content/drive mount point used by the later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
%cd drive/MyDrive/Covid\ Q\&A\ System/

total 8.0K
drwx------ 5 root root 4.0K Oct 21 11:31 drive
drwxr-xr-x 1 root root 4.0K Oct 19 16:36 sample_data


### Installing and Importing Libraries

In [None]:
!pip3 install tqdm==4.62.0

Collecting tqdm==4.62.0
  Downloading tqdm-4.62.0-py2.py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m71.7/76.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.66.1
    Uninstalling tqdm-4.66.1:
      Successfully uninstalled tqdm-4.66.1
Successfully installed tqdm-4.62.0


In [None]:
import time
import json
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

### Downloading the CORD-19 (COVID-19) Dataset

In [None]:
# Download the 2021-11-15 historical release of the CORD-19 archive.
# wget is given no output path, so the .tar.gz is saved in the current working directory.
!wget https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_2021-11-15.tar.gz

In [None]:
#unzipping the dataset to google drive
!tar -xzf cord-19_2021-11-15.tar.gz -C /content/drive/MyDrive/Covid\ Q\&A\ System
#unzipping the pdf and pmc files
%cd drive/MyDrive/Covid\ Q\&A\ System/2021-11-15/
!tar xzf document_parses.tar.gz

### Reading Dataset

In [None]:
%cd drive/MyDrive/Covid\ Q\&A\ System/2021-11-15/
#metadata
metadata = pd.read_csv('metadata.csv', dtype={'pubmed_id': str, 'title': str, 'abstract': str})
metadata.head()

In [None]:
# Collect the paths of the parsed research papers from the PDF and PMC JSON folders.
# Neither pattern contains '**', so each search is a flat listing of one directory.
pdf_json, pmc_json = (
    glob.glob(pattern)
    for pattern in ('document_parses/pdf_json/*.json', 'document_parses/pmc_json/*.json')
)

In [None]:
# Report how many parse files were found in each folder
print(f'PDF: {len(pdf_json)} PMC: {len(pmc_json)}')

In [None]:
# FileReader class: extracts the id, abstract and body text from one research-paper JSON file
class FileReader:
  """Parse a single research-paper JSON file and keep its id, abstract and body text.

  Attributes:
    paper_id: value stored under the top-level 'paper_id' key.
    abstract: the 'text' of every entry under 'abstract', joined with '$$'
      (empty string when the key is missing).
    body_text: the 'text' of every entry under 'body_text', joined with '$$'
      (empty string when the key is missing).
  """

  def __init__(self, file_path):
    """Read and parse the JSON document at `file_path`.

    Args:
      file_path: path of the JSON file to load.

    Raises:
      KeyError: if the document has no 'paper_id' key, or a section entry has no 'text'.
    """
    # Decode explicitly as UTF-8: the papers contain non-ASCII characters, and the
    # default encoding of open() depends on the platform locale.
    with open(file_path, encoding='utf-8') as f:
      content = json.load(f)
    # The document is fully parsed at this point, so the file is already closed.
    self.paper_id = content['paper_id']
    self.abstract = '$$'.join(each['text'] for each in content.get('abstract', []))
    self.body_text = '$$'.join(each['text'] for each in content.get('body_text', []))

  def __repr__(self):
    """Return the id, the first 200 characters of the abstract and the full body text, tab-separated."""
    return f'{self.paper_id}\tabstract: {self.abstract[:200]}\tbody_text: {self.body_text}'

In [None]:
# Parse the first file of the pdf json folder as a sample and show its textual summary
pdf_file = FileReader(pdf_json[0])
print(repr(pdf_file))

In [None]:
# Build a column-wise dictionary (id, abstract, body text) of all research papers from pdf json
pdf_dict = {'paper_id': [], 'abstract': [], 'body_text': []}
t1 = time.time()
# Hand tqdm the list itself rather than an enumerate() iterator: tqdm can then read the
# length and display percentage/ETA. The loop index was never used.
for record in tqdm(pdf_json):
  content = FileReader(record)
  pdf_dict['paper_id'].append(content.paper_id)
  pdf_dict['abstract'].append(content.abstract)
  pdf_dict['body_text'].append(content.body_text)
# Elapsed wall-clock time of the whole pass, in seconds
print(time.time() - t1)

247236it [05:05, 807.98it/s] 

306.0527939796448





In [None]:
# Turn the collected pdf json records into a dataframe with a fixed column order
pdf_df = pd.DataFrame(data=pdf_dict, columns=['paper_id', 'abstract', 'body_text'])
pdf_df.head()

Unnamed: 0,paper_id,abstract,body_text
0,206be0740f4d299003d4e09cd6f9a32e6e351130,Heparanase (HPSE) is a multifunctional protein...,Heparanase (HPSE) is an endo-β-d-endoglycosida...
1,32356c8de8fcec7a46bc60793b557964e4e87f37,Objective The COVID-19 pandemic is currently o...,World Health Organization (WHO) declared the o...
2,e5447bc137727b3721de2313755d89b932e1eecc,As the world navigates the COVID-19 health cri...,With the rise in positive COVID-19 cases and t...
3,8f1e56dded7f860a33ad291c06c773653270ee52,The sudden outbreak of coronavirus disease 201...,"In 2020, a new type of coronavirus, named coro..."
4,c84b2484293b3aa59ec8aaecc7eadb93b2294dd7,Spinal cord stimulation may enable recovery of...,Spinal cord injury (SCI) is a life-long condit...


In [None]:
# Build a column-wise dictionary (id, body text) of all research papers from pmc json;
# no abstract is collected for these records.
pmc_dict = {'paper_id': [], 'body_text': []}
t1 = time.time()
# Hand tqdm the list itself rather than an enumerate() iterator: tqdm can then read the
# length and display percentage/ETA. The loop index was never used.
for record in tqdm(pmc_json):
  content = FileReader(record)
  pmc_dict['paper_id'].append(content.paper_id)
  pmc_dict['body_text'].append(content.body_text)
# Elapsed wall-clock time of the whole pass, in seconds
print(time.time() - t1)

189611it [04:54, 644.91it/s]

294.0136697292328





In [None]:
# Turn the collected pmc json records into a dataframe with a fixed column order
pmc_df = pd.DataFrame(data=pmc_dict, columns=['paper_id', 'body_text'])
pmc_df.head()

Unnamed: 0,paper_id,body_text
0,PMC7550677,Previous research suggested that emotional str...
1,PMC7297029,"First, areas with severe outbreaks have genera..."
2,PMC8207685,"Environment‐related illnesses, such as indoor ..."
3,PMC7926729,"Materials, useful matter, are used extensively..."
4,PMC7471855,\n[4]\n$$$No


### Sampling & Saving Course Dataset

In [9]:
# Step up one directory, then into the '1-Accessing Dataset' folder; the pickle files
# written at the end of this section use bare file names, so they land in this folder.
# NOTE(review): both paths are relative, so the destination depends on the working
# directory at the time this cell runs — confirm it is the intended project sub-folder.
%cd ..
%cd 1-Accessing\ Dataset

In [None]:
# Drop metadata rows whose research text does not exist in pdf json.
# The column is named 'pdf_json_files'; the metadata has no 'pdf_json' column, so the
# previous subset raised KeyError. Rebinding (instead of inplace=True) keeps the cell
# safe to re-run and leaves the frame loaded by read_csv untouched.
metadata = metadata.dropna(subset=['pdf_json_files'])

In [None]:
# Preview the first three remaining metadata rows
metadata.head(n=3)

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [None]:
# Take 25000 randomly sampled records from the metadata.
# A fixed random_state makes the draw repeatable, so re-running the notebook
# regenerates the same sample files.
sub_metadata = metadata.sample(n=25000, random_state=42)

In [None]:
# Restrict both the pdf and pmc research-paper tables to the papers in the sampled metadata.
# A metadata row may list several PDF hashes in one 'sha' cell separated by ';'
# (NOTE(review): per the CORD-19 field description — confirm for this release).
# Splitting and exploding the cells lets each individual hash match a paper_id; when a
# cell holds a single hash the result is the same as matching the raw column.
sampled_shas = sub_metadata['sha'].dropna().str.split(';').explode().str.strip()
sub_pdf_df = pdf_df[pdf_df['paper_id'].isin(sampled_shas)]
sub_pmc_df = pmc_df[pmc_df['paper_id'].isin(sub_metadata['pmcid'])]

In [None]:
# Persist the three sampled tables as pickle files in the current working directory
for frame, target in (
    (sub_metadata, 'metadata_sample.pickle'),
    (sub_pdf_df, 'json_pdf_sample.pickle'),
    (sub_pmc_df, 'json_pmc_sample.pickle'),
):
  frame.to_pickle(target)