In [None]:
from Bio import Entrez
from Bio import Medline

In [None]:
import pickle
import time

import numpy as np
import pandas as pd

In [None]:
# Entrez query pipeline
def query(term):
    """
    Search PubMed for `term` and return the matching PMIDs.

    Parameters
    ----------
    term : str
        Search term passed to Entrez.esearch.

    Returns
    -------
    The "IdList" entry of the parsed esearch result (the list of PMIDs).
    """
    # NOTE(review): NCBI documents a cap on how many UIDs a single ESearch call returns;
    # confirm that retmax=500000 is honored for PubMed.
    handle = Entrez.esearch(db="pubmed", term=term, sort="pub+date", retmax=500000)
    try:
        record = Entrez.read(handle)
    finally:
        # Close the handle even when parsing the response fails
        handle.close()
    return record["IdList"]

def fetch(idlist):
    """
    Fetch PubMed records in MEDLINE text format for the given PMIDs.

    Parameters
    ----------
    idlist : list
        PMIDs to fetch. An empty list returns [] without issuing a request.

    Returns
    -------
    list
        The records produced by Medline.parse, materialized into a list.
    """
    if not idlist:
        return []

    handle = Entrez.efetch(db="pubmed", id=idlist, rettype="medline", retmode="text", sort="pub+date")
    try:
        # Medline.parse is lazy; consume it fully before the handle is closed
        return list(Medline.parse(handle))
    finally:
        handle.close()

In [None]:
def make_records_d(records):
    '''
    Takes in a list of fetched records; iterates through each record, extracts the specified
    fields into a dictionary, and appends that dictionary to a list; returns the final list.

    A field that is absent from a record is stored as "?".
    An empty input returns an empty list.
    '''
    # (output key, record key) pairs, in the order the keys are written to each dictionary
    fields = (
        ("pmid", "PMID"),
        ("title", "TI"),
        ("authors", "AU"),
        ("source", "SO"),
        ("location", "AD"),
        ("pub_date", "DP"),
        ("abstract", "AB"),
    )
    records_l = []
    for record in records:
        record_d = {key: record.get(tag, "?") for key, tag in fields}
        # Append inside the loop so every record is kept, not only the last one
        records_l.append(record_d)
    return records_l

### COVID-19 Query

In [None]:
# Collect the PMIDs matching the COVID-19 search term and show how many came back
covid_idlist = query(term="covid-19")
len(covid_idlist)

In [None]:
# Shallow copy of the PMID list, so covid_idlist itself stays untouched
covid_ids = list(covid_idlist)

In [None]:
# Query using slices of 200 ids at a time (limit)
BATCH_SIZE = 200

covid_records = []
# Iterate over the actual number of ids rather than a hard-coded count,
# so no ids are skipped and no empty slices are requested
for i in range(0, len(covid_ids), BATCH_SIZE):
    one_slice = covid_ids[i:(i + BATCH_SIZE)]
    covid_records.append(fetch(one_slice))
    # Pause between requests
    time.sleep(3)

In [None]:
# Flatten the per-batch lists in covid_records into one list of records
covid_records_l = []
for record_list in covid_records:
    covid_records_l.extend(record_list)
len(covid_records_l)

For each abstract, in addition to the abstract text, I would also like to extract:
- PMID (unique identifier)
- title
- authors
- source (journal)
- location (institution(s))
- published date
- Entrez date (date entered into database)
- country of journal
- language

In [None]:
# Build one dictionary per record with the desired variables.
# Each pair is (output key, key looked up in the fetched record); a missing field becomes "?".
field_tags = (
    ("pmid", "PMID"),
    ("title", "TI"),
    ("authors", "AU"),
    ("source", "SO"),
    ("location", "AD"),
    ("pub_date", "DP"),
    ("entrez_date", "EDAT"),
    ("country", "PL"),
    ("language", "LA"),
    ("abstract", "AB"),
)

covid_records_dicts = []
for record in covid_records_l:
    covid_records_dicts.append({column: record.get(tag, "?") for column, tag in field_tags})

In [None]:
# Sanity check: number of record dictionaries that were built
len(covid_records_dicts)

In [None]:
# Turn the list of dictionaries into a dataframe (one row per record, one column per key)
# and preview the first rows
covid_records_df = pd.DataFrame(covid_records_dicts)
covid_records_df.head()

In [None]:
# Look for missing abstracts: "?" is the fill-in used when a record has no abstract field,
# the other values are placeholder strings
# NOTE(review): the filtering cell that builds covid_abstracts_df also excludes "NA.",
# which is not counted here, so this count may be lower than the number of rows removed.
placeholder_abstracts = ["?", ".", "Not available.", "Not required."]
mask = covid_records_df["abstract"].isin(placeholder_abstracts)
missing_abstracts = covid_records_df[mask]
missing_abstracts.shape

27,424 papers are missing abstracts. I will remove these entries.

In [None]:
# Create dataframe with no missing abstracts: keep only the rows whose abstract
# is not one of the placeholder strings
mask2 = ~covid_records_df["abstract"].isin(
    ["?", ".", "Not available.", "Not required.", "NA."]
)
covid_abstracts_df = covid_records_df[mask2]
covid_abstracts_df.shape

In [None]:
# Write the filtered dataframe to a zipped CSV (row index not written)
# NOTE(review): absolute, user-specific output path
compression_opts = {'method': 'zip', 'archive_name': 'covid_abstracts_df.csv'}
covid_abstracts_df.to_csv(
    r'/Users/sunnajo/Desktop/covid_abstracts.zip',
    index=False,
    compression=compression_opts,
)

We notice that some papers were published prior to 2020. I primarily want to look at papers published after the first COVID-19 case was brought to light, which, per the WHO, was on December 31, 2019 in China.

The 'pub_date' column looks inconsistent and there are some formatting issues. For accuracy's sake, I will use the 'entrez_date' column for filtering.

In [None]:
# Work on a copy so that the columns added below do not modify covid_abstracts_df
df = covid_abstracts_df.copy()

In [None]:
# Convert 'entrez_date' column to datetime format, stored in a new 'entrez_date_dt' column
# NOTE(review): rows whose entrez_date is the "?" fill-in would presumably fail to parse here — confirm none exist
df['entrez_date_dt'] = pd.to_datetime(df['entrez_date'])
df.head()

In [None]:
import datetime as dt

In [None]:
# Extract the year of the Entrez date into a new 'year' column
# (.dt here is the pandas datetime accessor on the column)
df['year'] = df['entrez_date_dt'].dt.year
df.head()

In [None]:
# Extract month for papers with a 2020 Entrez date.
# .copy() makes df_2020 an independent dataframe, so adding the 'month' column
# does not assign into a filtered slice of df (which triggers SettingWithCopyWarning).
df_2020 = df[df['year'] == 2020].copy()
df_2020['month'] = df_2020['entrez_date_dt'].dt.month
df_2020.head()

In [None]:
# Show the 2020 papers whose Entrez month is January or February, to evaluate relevance
df_2020.loc[df_2020['month'] < 3]

It looks like there are some relevant papers even in January and February.

In [None]:
# Pull the 'abstract' column out as a plain Python list, one abstract per item
abstracts_2020_l = df_2020["abstract"].tolist()
len(abstracts_2020_l)

In [None]:
# Preview the first five abstracts in the list
abstracts_2020_l[:5]

In [None]:
# Save list of abstracts text
import pickle

# The context manager closes the file even if pickling fails part-way
with open('abstracts_2020_l.pkl', 'wb') as outfile:
    pickle.dump(abstracts_2020_l, outfile)

## Text Preprocessing

In [None]:
# Import module
# NOTE(review): wildcard import; later cells call pp_pipeline and word_tokenize, which are not
# imported anywhere else in this notebook, so they presumably come from here — confirm.
from preprocessing import *

# Enable the autoreload extension in mode 2 so edits to imported modules are picked up
%load_ext autoreload
%autoreload 2

I need to create a British English to American English dictionary.

In [None]:
# Creating British English to American English dictionary

# Read in text files (the context managers close each file even if reading fails)
with open("/Users/sunnajo/metis/onl20_ds4/british.txt", "r") as text_file:
    british_text = text_file.read()

with open("/Users/sunnajo/metis/onl20_ds4/american.txt", "r") as text_file:
    american_text = text_file.read()

# Convert each text file to a list of words: newlines become spaces, then split on single spaces
british_text = british_text.replace('\n', ' ').split(' ')
american_text = american_text.replace('\n', ' ').split(' ')

# Pair the words by position and create the dictionary (British word -> American word).
# zip stops at the shorter list; a repeated British word keeps its last pairing.
# NOTE(review): assumes the two files list corresponding words in the same order — confirm.
ab_list = list(zip(british_text, american_text))
ab_dict = dict(ab_list)

In [None]:
# Deleting words causing issues
# pop(..., None) removes the entry if present and does nothing otherwise,
# so this cell can be re-run without raising KeyError
for problem_word in ('disc', 'discs'):
    ab_dict.pop(problem_word, None)

Logic for preprocessing:
- Remove content within parentheses: this content is often redundant (e.g. acronyms)
- There are many compound terms joined with '-'. Since each word may carry its own meaning, I will separate these words and try to recapture the compounds by grouping my corpus into bigrams and trigrams.
- I noticed that some words are in British English. I will convert these terms to American English so as to maintain consistency and optimize the quality of my preprocessing.
- There are terms that occur frequently in my corpus that are redundant in light of the scope of this project and that do not add value in interpretation. I will add these to my list of stopwords in addition to the standard English stopwords in NLTK.
- I will look only at nouns, since I have many terms and nouns will likely be the most valuable for topic modeling.
- I will remove words shorter than 4 letters, as these are likely to have little semantic value.

In [None]:
# Preprocess text using pipeline
# NOTE(review): pp_pipeline is not defined in this notebook; presumably it is provided by the
# wildcard import from `preprocessing` — confirm.
processed_text = pp_pipeline(abstracts_2020_l)

In [None]:
# Check length of preprocessed text list
# NOTE(review): later cells index it by abstract position, so it presumably matches len(abstracts_2020_l) — confirm
len(processed_text)

In [None]:
# Save preprocessed text
# The context manager closes the file even if pickling fails part-way
with open('processed_text.pkl', 'wb') as outfile:
    pickle.dump(processed_text, outfile)

In [None]:
# Tokenize every preprocessed abstract, then record the positions of those left with no tokens
tokens = [word_tokenize(text) for text in processed_text]

zero_terms = [idx for idx, text in enumerate(tokens) if len(text) == 0]

In [None]:
# Abstracts with 0 terms after pre-processing - n_gram threshold 300
# NOTE(review): this hard-coded list replaces whatever zero_terms held before this cell ran;
# confirm the indices still match the current processed_text.
zero_terms = [
    10024, 12295, 12375, 13667, 24863, 31641,
    31658, 31659, 33122, 33173, 34006, 36422,
    36427, 39538, 39544, 39741, 41600,
]

In [None]:
# Delete the zero-term abstracts from processed_text.
# Indices are visited from highest to lowest so that each deletion leaves the positions of the
# remaining (smaller) indices unchanged. sort(reverse=True) is used rather than reverse() so that
# re-running the cell cannot flip the list back to ascending order.
# zero_terms is left in descending order afterwards.
# NOTE: the deletions themselves mutate processed_text; run this cell once per fresh processed_text.
zero_terms.sort(reverse=True)

for i in zero_terms:
    del processed_text[i]

len(processed_text)

In [None]:
# Remove the rows whose abstracts have 0 terms after preprocessing from df_2020.
# The row labels are looked up from their positions before anything is removed, so the result
# does not depend on the order of zero_terms; drop() returns a new dataframe instead of
# mutating df_2020 in place.
df_2020 = df_2020.drop(index=df_2020.index[zero_terms])

df_2020.shape

In [None]:
# Pull the 'pmid' column out as a plain list; the PMIDs serve as abstract labels
pmids = df_2020['pmid'].tolist()

In [None]:
# Save the list of PMIDs (abstract labels)
# The context manager closes the file even if pickling fails part-way
with open('pmids.pkl', 'wb') as outfile:
    pickle.dump(pmids, outfile)

In [None]:
# Write the altered 2020 dataframe to a zipped CSV (row index not written)
# NOTE(review): absolute, user-specific output path
compression_opts = {'method': 'zip', 'archive_name': 'covid_abstracts_2020.csv'}
df_2020.to_csv(
    r'/Users/sunnajo/Desktop/covid_abstracts_2020.zip',
    index=False,
    compression=compression_opts,
)