# Project 2-COVID-19 Open Research Dataset Challenge (CORD-19)

## Dataset description
In response to the COVID-19 pandemic, the White House and a coalition of leading research groups have prepared the COVID-19 Open Research Dataset (CORD-19). CORD-19 is a resource of over 500,000 scholarly articles, including over 200,000 with full text, about COVID-19, SARS-CoV-2, and related coronaviruses. This freely available dataset is provided to the global research community to apply recent advances in natural language processing and other AI techniques to generate new insights in support of the ongoing fight against this infectious disease. There is a growing urgency for these approaches because of the rapid acceleration in new coronavirus literature, making it difficult for the medical research community to keep up.

### In this project we are required to do the following tasks:
1. Collect and process pdf data dump from COVID-19 Open Research Dataset Challenge (CORD-19)
2. Analyze the data and provide publication statistics, such as (but not limited to) the number of publications by time and location. Provide (any type of) visualization for the results.
3. Generate sentence embeddings from the articles' abstracts and main content, respectively.
4. Build a tool for question answering: given a user input sentence or query, it outputs the top 10 most relevant sentences from the data.

### Importing Libraries

In [38]:
!pip install word2number

In [39]:
!pip install contractions

In [40]:
!pip install langdetect

In [41]:
import numpy as np 
import pandas as pd 
import os
import json
import os
from pprint import pprint
from copy import deepcopy

from tqdm import tqdm,tqdm_notebook

from nltk.corpus import stopwords
from wordcloud import WordCloud

import matplotlib.pyplot as plt
import numpy as np
import plotly.graph_objects as go
import pycountry
import datetime
from datetime import datetime

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from bs4 import BeautifulSoup
import spacy
import unidecode
from word2number import w2n
import contractions

from langdetect import detect
from langdetect import DetectorFactory
DetectorFactory.seed = 0

import gensim
from nltk import tokenize
import warnings
warnings.filterwarnings("ignore")

### Importing and exploring dataset

In [42]:
# Directory holding the parsed-PDF JSON files of the CORD-19 dump.
path = '../input/CORD-19-research-challenge/document_parses/pdf_json'

# One directory entry per parsed article.
filenames = os.listdir(path)
article_total = len(filenames)
print("Number of articles in the dataset: ", article_total)

### Functions for formatting the author column of the pdf_json data

In [43]:
def format_names(author):
    """Return an author's full name as a single space-separated string.

    Args:
        author: Mapping with the keys ``'first'`` (str), ``'middle'``
            (list of str) and ``'last'`` (str).

    Returns:
        ``"First Middle... Last"``. Empty components are skipped, so a record
        with a blank first or last name no longer yields a leading, trailing
        or doubled space.
    """
    parts = [author['first'], *author['middle'], author['last']]
    return " ".join(part for part in parts if part)


def format_affiliations(affiliations):
    """Render an affiliation mapping as ``"institution, location values..."``.

    Args:
        affiliations: Mapping that may contain ``'institution'`` (str) and
            ``'location'`` (mapping of location fields to strings).

    Returns:
        Comma-separated string with the institution first (when present),
        followed by the location values in their stored order. Returns an
        empty string when neither key holds a truthy value.
    """
    institution = affiliations.get('institution')
    location = affiliations.get('location')

    parts = [institution] if institution else []
    if location:
        parts += location.values()
    return ", ".join(parts)

def format_author(authors, with_affiliation=False):
    """Join a list of author records into one comma-separated string.

    Args:
        authors: Iterable of author mappings accepted by ``format_names``;
            each also needs an ``'affiliation'`` mapping when
            ``with_affiliation`` is true.
        with_affiliation: When true, append ``" (affiliation)"`` to every
            author whose formatted affiliation is non-empty.

    Returns:
        The formatted authors separated by ``", "``.
    """
    def render(author):
        # Format one author, optionally decorated with the affiliation.
        name = format_names(author)
        if not with_affiliation:
            return name
        affiliations = format_affiliations(author['affiliation'])
        return f"{name} ({affiliations})" if affiliations else name

    return ", ".join(render(author) for author in authors)

### Data extraction

In [44]:
# Upper bound on the number of parsed papers loaded into memory.
MAX_PAPERS = 10000

counts = 0

# One row per paper: [paper_id, title, authors, abstract, full_text].
doc = []
for file in tqdm(os.listdir(path)):
    if counts >= MAX_PAPERS:
        break

    f_path = f"{path}/{file}"
    # Context manager closes the handle; the bare open() leaked one
    # descriptor per paper.
    with open(f_path, "rb") as handle:
        j = json.load(handle)

    paper_id = j['paper_id']
    title = j['metadata']['title']
    authors = format_author(j['metadata']['authors'],
                            with_affiliation=True)

    # Only the first abstract paragraph is kept; papers with a missing or
    # empty abstract get an empty string.
    try:
        abstract = j['abstract'][0]['text']
    except (KeyError, IndexError):
        abstract = ""

    # Join paragraphs with a blank line. Concatenating them with no
    # separator fused the last word of one paragraph with the first word of
    # the next, corrupting word counts and stopword matching.
    full_text = "\n\n".join(t['text'] for t in j['body_text'])

    doc.append([paper_id, title, authors, abstract, full_text])
    counts += 1

### Making dataframe for the extracted data

In [45]:
# Column order matches the per-paper rows collected in `doc`.
paper_columns = ["paper_id", "title", "authors", "abstract", "full_text"]
data = pd.DataFrame(doc, columns=paper_columns)
data.head()

### Reading metadata.csv and merging both dataframes into 1 based on paper ID

In [46]:
# Only the JSON file reference and the publication date are needed.
metadata_path = '../input/CORD-19-research-challenge/metadata.csv'
df = pd.read_csv(metadata_path, usecols=["pdf_json_files", "publish_time"])
df.head()

Extracting the id using str.extract

In [47]:
# Capture the file stem that follows the first "/<dir>/" segment of the path.
# NOTE(review): if a cell lists several JSON paths, only the first one is
# captured - confirm this is intended.
paper_id_pattern = r'/\w+/(\w+)'
df['paper_id'] = df['pdf_json_files'].str.extract(paper_id_pattern, expand=False)
df.head()

In [48]:
# Default (inner) join on paper_id: keeps only papers present in both frames.
final_data = data.merge(df, on='paper_id')
final_data.head()

### Working on the publish_time column to visualise publications over time

In [49]:
# dtype of publish_time before conversion to datetime.
publish_dtype = final_data['publish_time'].dtype
print(publish_dtype)

In [50]:
# Parse publish_time, then derive year / month / date columns from it.
final_data['publish_time'] = pd.to_datetime(final_data['publish_time'])
print(final_data['publish_time'].dtype)

for date_part in ('year', 'month', 'date'):
    final_data[date_part] = getattr(final_data['publish_time'].dt, date_part)

# Preview the enriched frame.
final_data.head()

In [51]:
# Six most frequent publication years. `.head(6)` is always positional; the
# original `[:6]` is label-based in pandas 1.x when the index is float (year
# becomes float once publish_time contains NaT) and can raise KeyError.
final_data['year'].value_counts().head(6).plot(kind='barh', color='brown')

In [52]:
# Share of the six most frequent publication years. `.head(6)` selects by
# position regardless of whether the year index is int or float.
final_data.year.value_counts().head(6).plot.pie()

### visualisation by month

In [53]:
# Publications per month. `.head(12)` is positional; an integer slice on a
# float month index (present when publish_time has NaT) is label-based in
# pandas 1.x.
final_data['month'].value_counts().head(12).plot(kind='barh', color='brown')

In [54]:
# Share of publications per month, selected by position via `.head(12)`.
final_data.month.value_counts().head(12).plot.pie()

In [55]:
# The 20 dates with the highest number of publications.
busiest_dates = final_data['date'].value_counts().head(20)
busiest_dates.plot(kind='barh', color='brown')

### Top Author publications 

In [56]:
# Most frequent value of the authors column.
# NOTE(review): papers without authors produce an empty string here, which
# may well be the top entry - verify before interpreting it as a person.
top_author = final_data['authors'].value_counts().head(1)
top_author.plot(kind='barh', color='brown')

In [57]:
# Relative share of the two most frequent author strings.
top_two_authors = final_data.authors.value_counts().head(2)
top_two_authors.plot.pie()

The top author has more than 780 publications: the pie chart above shows the names of the most prolific authors, who wrote 780+ articles.

## Cleaning the dataframe full_text and abstract

Lowercase all texts for body and abstract

In [58]:
# Normalise the case of both text columns so later keyword and stopword
# matching is case-insensitive.
for text_column in ("full_text", "abstract"):
    final_data[text_column] = final_data[text_column].str.lower()
final_data.head()

Keeping only the articles that talk about COVID, using a few COVID-related keywords

In [59]:
# Keywords marking a paper as COVID-related. They must be lower-case because
# full_text was lower-cased in the previous step; the original upper-case
# 'SARS' / 'SARSCOV2' entries and the misspelt syndrome phrase could never
# match.
# NOTE(review): 'asia' is a plain substring test and also hits e.g.
# "euthanasia" - confirm it should stay in the list.
topics = ['asia', 'wuhan', 'covid-19', 'covid', 'covid19', 'corona', 'coronavirus', 'corona-virus',
          'sars', 'sarscov2', 'severe acute respiratory syndrome']

# 1 when any keyword occurs in the text, otherwise 0.
label = [
    int(any(keyword in text for keyword in topics))
    for text in tqdm(final_data["full_text"])
]

final_data['label'] = label

# Drop every paper that mentions none of the keywords.
final_data.drop(final_data.index[final_data['label'] == 0], inplace=True)

len(final_data)

Wordcloud before cleaning

In [60]:
# All full texts as one comma-separated string.
long = ','.join(final_data['full_text'])

# Configure the cloud, then feed it the corpus (generate() returns the object itself).
wordcloud = WordCloud(
    background_color="black",
    max_words=500,
    contour_width=5,
    contour_color='blue',
).generate(long)

# Render the cloud as an image for display.
wordcloud.to_image()

removing the non-english papers

In [61]:
from langdetect.lang_detect_exception import LangDetectException


def _is_english(text):
    """Return True when langdetect classifies *text* as English.

    Texts that langdetect cannot classify (for example, no usable
    characters) count as non-English, matching the original drop-on-error
    behaviour.
    """
    try:
        return detect(text) == "en"
    except LangDetectException:
        return False


# Classify every paper once, then drop the non-English rows in one step. The
# original dropped rows while iterating the same column and compared the full
# text against every row for each drop (quadratic in the number of papers).
english_mask = np.array(
    [_is_english(text) for text in tqdm(final_data['full_text'])],
    dtype=bool,
)
final_data.drop(final_data.index[~english_mask], inplace=True)


len(final_data)

In [62]:
# Show the first two remaining papers.
final_data.iloc[:2]

Removing the papers whose text is 200 words or fewer

In [63]:
# Word count of each paper (whitespace-delimited tokens).
final_data["split_text"] = final_data["full_text"].str.split().str.len()

# Discard papers with 200 words or fewer.
too_short = final_data['split_text'] <= 200
final_data.drop(final_data.index[too_short], inplace=True)
len(final_data)
#final_data.head()

### Second part of cleaning: stopword and punctuation removal

Removing stopwords using the nltk library

In [64]:
# English stopwords from nltk plus corpus-specific boilerplate terms.
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus to a plain
# list; the fully qualified nltk.corpus.stopwords is used so the cell can be
# re-run.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['reports','additions','india','usa','tables','papers','review','common','review','describes','abstract','retrospective','chart','patients','study',
                        'associated','result','features','including','found','one','well','among','abstract','provide',
                        'objective','background','range','feature','participates', 'copyright', 'many',
                        'org', 'https', 'author', 'figure', 'table', 'rights', 'reserved', 'figures', 'reported',
                        'permission', 'use', 'used', 'license','editor', 'brazil', 'article', 'figures', 'tables', "the", 'a', 'all', 'thus',
                        'pubmed', 'editors', 'authors', 'methods', 'method', 'result', 'paper', 'introduction', 'editor', 
                         'although', 'letters', 'review', 'paper', 'table', 'addition', 'example', 'even', 'within', 'report']
                        )

# Set lookup is O(1) per word; testing against the list scanned every
# stopword for every word of every paper.
stopword_set = frozenset(stopwords)


def _drop_stopwords(text):
    """Return *text* with every whitespace-delimited stopword removed."""
    return " ".join(word for word in text.split() if word not in stopword_set)


final_data['full_text'] = final_data['full_text'].apply(_drop_stopwords)
final_data['abstract'] = final_data['abstract'].apply(_drop_stopwords)
final_data.head()

In [65]:
# Keep only runs of word characters, which strips punctuation. The pattern is
# a raw string because '\w' in a plain literal is an invalid escape sequence
# (DeprecationWarning, and SyntaxWarning on recent Python versions).
# NOTE(review): only full_text is tokenised here; the abstract column keeps
# its punctuation - confirm that is intended.
tokenizer = RegexpTokenizer(r'\w+')
final_data['full_text'] = final_data['full_text'].apply(lambda x: " ".join(tokenizer.tokenize(x.lower())))
final_data.head()

In [66]:
# All cleaned full texts as one comma-separated string.
long = ','.join(final_data['full_text'])

# Configure the cloud, then feed it the corpus (generate() returns the object itself).
wordcloud = WordCloud(
    background_color="black",
    max_words=500,
    contour_width=5,
    contour_color='blue',
).generate(long)

# Render the cloud as an image for display.
wordcloud.to_image()

In [67]:
# First remaining paper by position. `full_text[0]` looks up index *label* 0,
# which raises KeyError if that row was removed by the filters above.
final_data.full_text.iloc[0]

#### Sample Word2Vec 

In [68]:
# review =final_data.full_text.apply(gensim.utils.simple_preprocess)
# review

In [69]:
# model = gensim.models.Word2Vec(
#     final_data['full_text'],
#     window = 10,
#     min_count =2,
#     workers =10,
#     epochs = 10
    
# )
# model.build_vocab(final_data['full_text'], progress_per=1000)
# model.corpus_count
# model.train(final_data['full_text'], total_examples = model.corpus_count, epochs =30)

### Part 3 Sentence Embedding- using doc2vec

In [70]:
def tokenize_docs(data, tests=False, text_column='full_text', min_chars=200):
    """Split papers into lower-cased paragraph records ready for Doc2Vec.

    Args:
        data: DataFrame holding ``text_column``; in training mode it must
            also hold ``publish_time``, ``authors`` and ``title``.
        tests: When true, treat the first row of ``data`` as a user query
            and return it as a single untagged record.
        text_column: Name of the column containing the text.
        min_chars: Minimum paragraph length in characters; shorter
            paragraphs are skipped.

    Returns:
        Query mode: ``[(text, 'no_tag')]``.
        Training mode: a list of
        ``[paragraph, tag, publish_time, authors, title]`` where ``tag`` is
        ``"<row position>-<paragraph number>"`` (paragraph numbers start at 1
        and count only the paragraphs that were kept).
    """
    if tests:
        query = data[text_column].iloc[0]
        return [(query.lower(), 'no_tag')]

    _documents = []
    # Iterate positionally so the result does not depend on the frame having
    # a freshly reset 0..n-1 index.
    rows = zip(data[text_column], data['publish_time'], data['authors'], data['title'])
    for row_no, (text, publish_t, authors, title) in enumerate(rows):
        count_no = 1
        # Paragraphs are delimited by blank lines.
        for paragraph in text.split('\n\n'):
            paragraph = paragraph.lower()
            if len(paragraph) < min_chars:
                continue
            # The former nltk sent_tokenize() call was removed: its result
            # was never used and it required the punkt data, which this
            # notebook does not download.
            _documents.append([paragraph, f"{row_no}-{count_no}", publish_t, authors, title])
            count_no += 1

    return _documents
def _preprocess(docs, tokens_only=False):
    """Yield gensim-ready items for each record in *docs*.

    Args:
        docs: Iterable of records whose first element is the text and whose
            second element is a string tag.
        tokens_only: When true, yield plain token lists (used for queries);
            otherwise yield ``TaggedDocument`` objects tagged
            ``"<tag>_<position>"``, where position is the record's index in
            *docs*.
    """
    for position, record in enumerate(docs):
        text, base_tag = record[0], record[1]
        unique_tag = base_tag + '_' + str(position)
        words = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield words
            continue
        yield gensim.models.doc2vec.TaggedDocument(words, [unique_tag])

### Training the dataset

In [71]:
# Training frame with a fresh 0..n-1 index.
train_columns = ['full_text', 'publish_time', 'authors', 'title']
train_data = final_data[train_columns].reset_index(drop=True)

# Paragraph-level records; df_train keeps only their metadata, keyed by tag_doc.
train_docs = tokenize_docs(train_data)
record_columns = ['document', 'tag_doc', 'publish_time', 'authors', 'title']
df_train = pd.DataFrame.from_records(train_docs, columns=record_columns).drop(columns=['document'])

print('Trained-Documents: ', len(train_docs))

# Materialise the tagged documents so they can be iterated for vocab building and training.
train_cor_pus = list(_preprocess(train_docs))

In [72]:
# print(train_cor_pus[:2])

### Model Build -gensim doc2vec

In [73]:
# Doc2Vec hyper-parameters for the full-text model.
doc2vec_params = dict(vector_size=50, min_count=1, epochs=40)
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)

# Build the vocabulary from the tagged training corpus.
model.build_vocab(train_cor_pus)

### Training Model

In [74]:
# Train using the corpus size and epoch count stored on the model.
model.train(
    train_cor_pus,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

## Part-4 Tool for user input

### User Input function

In [75]:
def user_answer(user_query, _results):
    """Print and return the training documents most similar to a query.

    The query is tokenised with ``tokenize_docs`` / ``_preprocess``, embedded
    with the global Doc2Vec ``model`` and compared against the trained
    document vectors. Relies on the globals ``model`` and ``train_docs``.

    Args:
        user_query: Free-text sentence or question.
        _results: Maximum number of matches to report. If it exceeds the
            number of trained documents, all documents are reported instead
            of raising IndexError.

    Returns:
        DataFrame with the columns ``tag_doc`` (the ``"<row>-<paragraph>"``
        tag) and ``extract`` (all fields of the matching ``train_docs``
        record joined by spaces), most similar first.
    """
    df_test = pd.DataFrame(data={'full_text': [user_query]})
    test_docs = tokenize_docs(df_test, tests=True)
    query_tokens = next(_preprocess(test_docs, tokens_only=True))

    # gensim >= 4 exposes the document vectors as `dv`; older releases only
    # provide `docvecs`.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
    top_n = max(0, min(_results, len(doc_vectors)))

    i_vector = model.infer_vector(query_tokens)
    # Request only the matches that will be shown instead of ranking the
    # whole corpus.
    sims = doc_vectors.most_similar([i_vector], topn=top_n) if top_n else []

    query_text = ' '.join(query_tokens)
    print('TEST Docs: «{}»\n'.format(query_text))
    print(u'Similar documents as per the sentence input using doc2vec model %s:\n' % model)

    result = []
    for rank, match in enumerate(sims):
        print('Sentence Docs: «{}»\n'.format(query_text))
        # Tags look like "<row>-<paragraph>_<position in train_docs>".
        tag_doc, _, position = match[0].rpartition('_')
        doc_index = int(position)
        print('Index of document from the dataset', doc_index)
        extract = ' '.join(map(str, train_docs[doc_index]))

        print(u'%s %s:\n%s\n' % (f'TOP {rank}', match, extract))
        print('-------')
        print(extract)
        result.append([tag_doc, extract])

        print('='*80)

    new_dataframe = pd.DataFrame.from_records(result, columns=['tag_doc', 'extract'])
    return new_dataframe

## User Input 

In [76]:
# Visual frame printed before and after the prompt.
separator = '::::::----------------------::::::'

print('Input the sentence or query')
print(separator)
user_input = input()
print(separator)

In [77]:
# Top-10 matches, enriched with paper metadata via the shared tag_doc key.
output_columns = ['publish_time', 'authors', 'title', 'extract']
dataframe_output = (
    user_answer(user_input, 10)
    .merge(df_train, how='left', on='tag_doc')
    [output_columns]
)

dataframe_output.to_csv('./result.csv', index=False)

### Model for Abstract

Tokenizing the abstract column of the dataframe

In [78]:
def tokenize_docs(data, tests=False, text_column='abstract', min_chars=300):
    """Split abstracts into lower-cased paragraph records ready for Doc2Vec.

    Args:
        data: DataFrame holding ``text_column``; in training mode it must
            also hold ``publish_time``, ``authors`` and ``title``.
        tests: When true, treat the first row of ``data`` as a user query
            and return it as a single untagged record.
        text_column: Name of the column containing the text.
        min_chars: Minimum paragraph length in characters; shorter
            paragraphs are skipped.

    Returns:
        Query mode: ``[(text, 'no_tag')]``.
        Training mode: a list of
        ``[paragraph, tag, publish_time, authors, title]`` where ``tag`` is
        ``"<row position>-<paragraph number>"`` (paragraph numbers start at 1
        and count only the paragraphs that were kept).
    """
    if tests:
        query = data[text_column].iloc[0]
        return [(query.lower(), 'no_tag')]

    _documents = []
    # Iterate positionally so the result does not depend on the frame having
    # a freshly reset 0..n-1 index.
    rows = zip(data[text_column], data['publish_time'], data['authors'], data['title'])
    for row_no, (text, publish_t, authors, title) in enumerate(rows):
        count_no = 1
        # Paragraphs are delimited by blank lines.
        for paragraph in text.split('\n\n'):
            paragraph = paragraph.lower()
            if len(paragraph) < min_chars:
                continue
            # The former nltk sent_tokenize() call was removed: its result
            # was never used and it required the punkt data, which this
            # notebook does not download.
            _documents.append([paragraph, f"{row_no}-{count_no}", publish_t, authors, title])
            count_no += 1

    return _documents
def _preprocess(docs, tokens_only=False):
    """Yield gensim-ready items for each record in *docs*.

    Args:
        docs: Iterable of records whose first element is the text and whose
            second element is a string tag.
        tokens_only: When true, yield plain token lists (used for queries);
            otherwise yield ``TaggedDocument`` objects tagged
            ``"<tag>_<position>"``, where position is the record's index in
            *docs*.
    """
    for position, record in enumerate(docs):
        text, base_tag = record[0], record[1]
        unique_tag = base_tag + '_' + str(position)
        words = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield words
            continue
        yield gensim.models.doc2vec.TaggedDocument(words, [unique_tag])

Training dataframe

In [79]:
# Training frame for the abstract model, with a fresh 0..n-1 index.
train_columns = ['abstract', 'publish_time', 'authors', 'title']
train_data = final_data[train_columns].reset_index(drop=True)

# Paragraph-level records; df_train keeps only their metadata, keyed by tag_doc.
train_docs = tokenize_docs(train_data)
record_columns = ['document', 'tag_doc', 'publish_time', 'authors', 'title']
df_train = pd.DataFrame.from_records(train_docs, columns=record_columns).drop(columns=['document'])

print('Trained-Documents: ', len(train_docs))

# Materialise the tagged documents so they can be iterated for vocab building and training.
train_cor_pus = list(_preprocess(train_docs))

Building model

In [80]:
# Doc2Vec hyper-parameters for the abstract model (replaces the full-text `model`).
doc2vec_params = dict(vector_size=50, min_count=1, epochs=40)
model = gensim.models.doc2vec.Doc2Vec(**doc2vec_params)

# Build the vocabulary from the tagged abstract corpus.
model.build_vocab(train_cor_pus)

In [81]:
# Train using the corpus size and epoch count stored on the model.
model.train(
    train_cor_pus,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

#### User input tool function

In [82]:
def user_answer(user_query, _results):
    """Print and return the abstract documents most similar to a query.

    The query is tokenised with ``tokenize_docs`` / ``_preprocess``, embedded
    with the global Doc2Vec ``model`` and compared against the trained
    document vectors. Relies on the globals ``model`` and ``train_docs``.

    Args:
        user_query: Free-text sentence or question.
        _results: Maximum number of matches to report. If it exceeds the
            number of trained documents, all documents are reported instead
            of raising IndexError.

    Returns:
        DataFrame with the columns ``tag_doc`` (the ``"<row>-<paragraph>"``
        tag) and ``extract`` (all fields of the matching ``train_docs``
        record joined by spaces), most similar first.
    """
    df_test = pd.DataFrame(data={'abstract': [user_query]})
    test_docs = tokenize_docs(df_test, tests=True)
    query_tokens = next(_preprocess(test_docs, tokens_only=True))

    # gensim >= 4 exposes the document vectors as `dv`; older releases only
    # provide `docvecs`.
    doc_vectors = model.dv if hasattr(model, 'dv') else model.docvecs
    top_n = max(0, min(_results, len(doc_vectors)))

    i_vector = model.infer_vector(query_tokens)
    # Request only the matches that will be shown instead of ranking the
    # whole corpus.
    sims = doc_vectors.most_similar([i_vector], topn=top_n) if top_n else []

    query_text = ' '.join(query_tokens)
    print('TEST Docs: «{}»\n'.format(query_text))
    print(u'Similar documents as per the sentence input using doc2vec model %s:\n' % model)

    result = []
    for rank, match in enumerate(sims):
        print('Sentence Docs: «{}»\n'.format(query_text))
        # Tags look like "<row>-<paragraph>_<position in train_docs>".
        tag_doc, _, position = match[0].rpartition('_')
        doc_index = int(position)
        print('Index of document from the dataset', doc_index)
        extract = ' '.join(map(str, train_docs[doc_index]))

        print(u'%s %s:\n%s\n' % (f'TOP {rank}', match, extract))
        print('-------')
        print(extract)
        result.append([tag_doc, extract])

        print('='*80)

    new_dataframe = pd.DataFrame.from_records(result, columns=['tag_doc', 'extract'])
    return new_dataframe

user input tool

In [85]:
# Visual frame printed before and after the prompt.
separator = '::::::----------------------::::::'

print('Input the sentence or query')
print(separator)
user_input = input()
print(separator)

In [86]:
# Top-10 abstract matches, enriched with paper metadata via the shared tag_doc key.
output_columns = ['publish_time', 'authors', 'title', 'extract']
dataframe_output = (
    user_answer(user_input, 10)
    .merge(df_train, how='left', on='tag_doc')
    [output_columns]
)

# NOTE(review): this writes to the same './result.csv' as the full-text
# results cell, so the earlier output is overwritten - confirm that is intended.
dataframe_output.to_csv('./result.csv', index=False)