In [1]:
import pandas as pd
import glob
import json
import pickle
import numpy as np

import scispacy
import spacy
import en_core_sci_lg
from spacy_langdetect import LanguageDetector
from spacy.language import Language

def create_lang_detector(nlp, name):
    return LanguageDetector()
Language.factory('language_detector', func=create_lang_detector)

<function __main__.create_lang_detector(nlp, name)>

## Data Preprocessing
I plan to preprocess the data using the following steps
* Read the metadata.csv, get the whole picture of the publications we have.
* Extract the body text from publications in pdf and pmc respectively.
* Merge the metadata and the corresponding body text.
* Clean the merged data: remove duplicates, remove blank text.
* Creating new features: is_covid19 (binary indicator indicating whether the publication is related to covid19), text_language (remove non-english pubs).

### Read metadata & rough descriptive analysis

In [4]:
# read metadata
metadata = pd.read_csv('./CovidData/metadata.csv', dtype={
    'pubmed_id': str, 
    'Microsoft Academic Paper ID': str, 'doi': str
})
# check the data structure
print(metadata.columns)

Index(['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id',
       'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id',
       'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files',
       'url', 's2_id'],
      dtype='object')


In [5]:
# a rough descriptive analysis
print('The percent of null in each column: \n', metadata.isnull().sum()/len(metadata))

The percent of null in each column: 
 cord_uid            0.000000
sha                 0.686875
source_x            0.000000
title               0.000493
doi                 0.504897
pmcid               0.675089
pubmed_id           0.571102
license             0.000000
abstract            0.261919
publish_time        0.000307
authors             0.023066
journal             0.061969
mag_id              1.000000
who_covidence_id    0.525287
arxiv_id            0.987713
pdf_json_files      0.686875
pmc_json_files      0.743873
url                 0.472467
s2_id               0.076933
dtype: float64


What we care about are `sha` and `pmcid` which are unique for each publication and are the keys to merge the publications with the metadata. The percentages of null in these two columns are both over 50%, I plan to remove the ones with both the null `sha` and null `pmcid`.

In [6]:
# cord_uid: this persistent identifier is a 8-char alphanumeric string unique to each entry
# sha: the hash of the PDFs are in 'sha'
# the number of unique titles
print('the number of unique cord_uid:', metadata.cord_uid.nunique(), '\n',
      'the number of unique sha:', metadata.sha.nunique(), '\n',
      'the number of unique titles:', metadata.title.nunique())

the number of unique cord_uid: 659301 
 the number of unique sha: 231780 
 the number of unique titles: 554017


### Extract the body text from publications in pdf files

In [7]:
# get the file path for pdf.json
all_json_path = glob.glob('./CovidData/document_parses/pdf_json/*.json')
# check the structure of json file
with open(all_json_path[1]) as f:
    first_entry = json.load(f)
    print(json.dumps(first_entry, indent=4))

{
    "paper_id": "c40b7d23a6c005ab11aa812d9d43567140feedeb",
    "metadata": {
        "title": "Certifying Irreducibility in Z[x]",
        "authors": [
            {
                "first": "John",
                "middle": [],
                "last": "Abbott",
                "suffix": "",
                "affiliation": {
                    "laboratory": "",
                    "institution": "Universit\u00e4t Passau",
                    "location": {
                        "settlement": "Passau",
                        "country": "Germany"
                    }
                },
                "email": "abbott@dima.unige.it"
            }
        ]
    },
    "abstract": [
        {
            "text": "We consider the question of certifying that a polynomial in",
            "cite_spans": [],
            "ref_spans": [],
            "section": "Abstract"
        },
        {
            "text": "Knowing that a polynomial is irreducible lets us recognise that a quotient rin

The body text is the combination of all the `text`. Some of them might be blank.

In [8]:
##### The following code is commented because after it takes long to generate dict,
##### so we store dict in pickle file which can be loaded efficiently
# write a class to read information from json files
# class filereader:
#     def __init__(self, file_path):
#         with open(file_path) as f:
#             content = json.load(f)
#             self.paper_id = content['paper_id']
            
#             self.abstract = []
#             self.body_text = []
            
#             # get abstract if the abstract is not none
#             for entry in content['abstract']:
#                 self.abstract.append(entry['text'])
                
            
#             # get body text
#             for entry in content['body_text']:
#                 self.body_text.append(entry['text'])
            
#             # join abstract into one string
#             self.abstract = '\n'.join(self.abstract)
#             self.body_text = '\n'.join(self.body_text)

# load json data into dataframe
# json_dict = {'paper_id': [], 'abstract': [], 'body_text': []}
# for idx, file_path in enumerate(all_json_path):
#     if idx % (len(all_json_path) // 10) == 0:
#         print('Processing index: {} of {}'.format(idx, len(all_json_path)))
#     content = filereader(file_path)
#     json_dict['paper_id'].append(content.paper_id)
#     json_dict['abstract'].append(content.abstract)
#     json_dict['body_text'].append(content.body_text)

# with open('json_dict.pkl', 'wb') as f:
#     pickle.dump(json_dict, f)

In [9]:
# read pmc_json files
all_pmc_path = glob.glob('./CovidData/document_parses/pmc_json/*.json')

In [10]:
# write a class to read information from pmc json files
# It contains no abstracts
# class filereader:
#     def __init__(self, file_path):
#         with open(file_path) as f:
#             content = json.load(f)
#             self.paper_id = content['paper_id']
            
#             self.body_text = []
                  
#             # get body text
#             for entry in content['body_text']:
#                 self.body_text.append(entry['text'])
            
#             # join abstract into one string
#             self.body_text = '\n'.join(self.body_text)

# pmc_dict = {'paper_id': [], 'body_text': []}
# for idx, file_path in enumerate(all_pmc_path):
#     if idx % (len(all_pmc_path) // 10) == 0:
#         print('Processing index: {} of {}'.format(idx, len(all_pmc_path)))
#     content = filereader(file_path)
#     pmc_dict['paper_id'].append(content.paper_id)
#     pmc_dict['body_text'].append(content.body_text)

# with open('pmc_dict.pkl', 'wb') as f:
#     pickle.dump(pmc_dict, f)

In [11]:
# read pickle file of pdf.json
json_dict_pickle = pickle.load(open('json_dict.pkl', 'rb'))
json_df = pd.DataFrame(json_dict_pickle, columns=['paper_id', 'abstract', 'body_text'])

# read pickle file of pmc.json
pmc_dict_pickle = pickle.load(open('pmc_dict.pkl', 'rb'))
pmc_df = pd.DataFrame(pmc_dict_pickle, columns=['paper_id', 'body_text'])
# remove blank body text
pmc_df = pmc_df[pmc_df.body_text != ' ']

### Merge the pdf & pmc data with the metadata

In [12]:
# merge pdf with metadata on sha
data = pd.merge(json_df, metadata, left_on='paper_id', right_on='sha', how='left').drop('sha', axis=1)
# merge pmc with data
data = pd.merge(data, pmc_df, left_on='pmcid', right_on='paper_id', how='left').drop('paper_id_y', axis=1)

### Data cleaning
* Remove rows with duplicated body_text, paper_id
* Keep the abstract (abstract_x) in metadata, impute the null with the abstract (abstract_y) from pdf file
* Keep the body text (body_text_y) from pmc files, impute the null with the body text (body_text_x) from pdf file

In [97]:
# remove rows with duplicated body_text
data.drop_duplicates(['body_text_x', 'body_text_y'], inplace=True)
# remove rows with duplicated paper_id
data.drop_duplicates(['paper_id'], inplace=True)

KeyError: Index(['paper_id'], dtype='object')

In [None]:
# abstract_x is from json files (missing is blank), abstract_y is from metadata (missing is null)
# a large proportion of data having different abstract
print('The proportion of data having different abstracts:', 
      data[data.abstract_x != data.abstract_y].shape[0] / data.shape[0])

Remark: many of the differences are slight.

In [None]:
# data[data.abstract_x != data.abstract_y][['abstract_x', 'abstract_y', 'url']].tail(3)
# data[(data.abstract_x != data.abstract_y) &
#      (data.abstract_y.isnull())][['abstract_x', 'abstract_y', 'url', 'body_text_x', 'body_text_y']]
# For e.g., the abstract in json file is not correct, while the abstract in metadata is correct
# the abstract in metadata seems to be more reliable, so we use these, and fill the missing values in abstract_y 
# with the abstract_x
data.loc[data.abstract_y.isnull() & 
         (data.abstract_x != '') & 
         (data.abstract_x != ' ') & 
         (~data.abstract_x.isnull()), 'abstract_y'] = data[data.abstract_y.isnull() &
                                                           (data.abstract_x != '') & 
                                                           (data.abstract_x != ' ') & 
                                                           (~data.abstract_x.isnull())].abstract_x
data.rename(columns={'abstract_y': 'abstract'}, inplace=True)
data.drop('abstract_x', axis=1, inplace=True)

In [None]:
# body_text_x is from json files, body_text_y is from pmc files
# most of the data have different body text
print('The proportion of data having different body text:',
      data[data.body_text_x != data.body_text_y].shape[0] / data.shape[0]

In [None]:
# similar body_text
# data[data.body_text_x != data.body_text_y][['body_text_x', 'body_text_y', 'url']]
# use the text from pmc file (body_text_y) trusting the statement that it's of higher quality
# print(data.body_text_x.isnull().sum(), data[data.body_text_x == ''].shape, data[data.body_text_x == ' '].shape)
# print(data.body_text_y.isnull().sum(), data[data.body_text_y == ''].shape, data[data.body_text_y == ' '].shape)
data.loc[data.body_text_y.isnull(), 'body_text_y'] = data[data.body_text_y.isnull()].body_text_x
data.rename(columns = {'body_text_y': 'body_text'}, inplace=True)
data.drop('body_text_x', axis=1, inplace=True)
data.rename(columns={'paper_id_x': 'paper_id', 'source_x': 'source'}, inplace=True)

### Create new features
New features are:
* is_covid19: 0 indicates that the publication is not related to covid19, 1 for otherwise
* text_language: the language label of the publication (remove non-english publications)

In [54]:
# create new column for covid19
data['is_covid19'] = data.body_text.str.contains('COVID-19|covid|sar cov 2|SARS-CoV-2|2019-nCov|2019 ncov|SARS Coronavirus 2|2019 Novel Coronavirus|coronavirus 2019| Wuhan coronavirus|wuhan pneumonia|wuhan virus', case=False)
print('The percentage of covid19 related publications:', data.is_covid19.sum()/data.shape[0])

In [68]:
# use nlp to detect language: we want to remove the non-english publications for efficiency.
nlp = en_core_sci_lg.load(disable=['tagger', 'ner', 'lemmatizer'])
nlp.add_pipe('language_detector', last=True)
# use the first 500 words to determine the language it uses
# data['text_language'] = data.body_text.apply(lambda x: nlp(x[:500])._.language['language'])

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x2b5e0c070130>

In [99]:
# data_dict = {'data': data}
# with open('data_dict.pkl', 'wb') as f:
#     pickle.dump(data_dict, f)
data = pickle.load(open('data_dict.pkl', 'rb'))['data']
data = data[data.text_language == 'en']

Save to csv.

In [104]:
data.to_csv('covid19pub.csv', index=False)