In [7]:
import nltk
import pandas as pd
import os
import json
from nltk.stem import PorterStemmer
import pycountry
import numpy as np
import spacy
import warnings; warnings.simplefilter('ignore')
# nltk.download('averaged_perceptron_tagger')

In [8]:
# ps = PorterStemmer()  # stemming is disabled; `ps` is not used anywhere in the visible notebook
# Shared spaCy pipeline: every keyword query and every paper title is parsed with it.
# NOTE(review): the similarity ranking presumably depends on the word vectors that
# ship with the large English model - confirm before swapping in a smaller model.
nlp = spacy.load('en_core_web_lg')

# Raw paper table; later cells read its "title", "abstract" and "text" columns.
df = pd.read_csv("csvfiles/COVID.csv")

In [9]:
# Cleaning step, in order:
#   1. drop rows whose abstract is the literal string "[]"
#      (presumably an empty-list placeholder left by the CSV export - confirm),
#   2. keep only the three text columns,
#   3. keep the first row for each title,
#   4. drop rows that still have a missing value in any kept column.
fil = df['abstract'] != "[]"
df = (
    df.loc[fil, ["title", "abstract", "text"]]
    .drop_duplicates(subset='title', keep="first")
    .dropna()
)
df.head()

Unnamed: 0,title,abstract,text
0,The RNA pseudoknots in foot-and-mouth disease ...,word count: 194 22 Text word count: 5168 23 24...,"VP3, and VP0 (which is further processed to VP..."
1,Analysis Title: Regaining perspective on SARS-...,"During the past three months, a new coronaviru...","In December 2019, a novel coronavirus, SARS-Co..."
3,"Real-time, MinION-based, amplicon sequencing f...",Infectious bronchitis (IB) causes significant ...,"Infectious bronchitis (IB), which is caused by..."
4,A Combined Evidence Approach to Prioritize Nip...,Nipah Virus (NiV) came into limelight recently...,Nipah is an infectious negative-sense single-s...
5,Assessing spread risk of Wuhan novel coronavir...,Background: A novel coronavirus (2019-nCoV) em...,"In December 2019, a cluster of patients with p..."


In [10]:
def find_similarity(tags, top_n=50, data=None):
    """Rank papers by how similar their title is to a keyword query.

    Every title is lower-cased, parsed with the module-level ``nlp`` pipeline
    and compared with ``tags`` through ``similarity``. Only the ``"title"``
    column is scored; abstracts and body text are carried along untouched.

    Parameters
    ----------
    tags : parsed query
        The object returned by ``nlp("keyword keyword ...")``.
    top_n : int or None, default 50
        Maximum number of rows to return. ``None`` returns every scored row.
    data : pandas.DataFrame, optional
        Table to rank; must have a ``"title"`` column. Defaults to the
        module-level ``df`` so existing ``find_similarity(tags)`` calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        The best-matching rows of ``data``, highest score first, re-indexed
        from 0, with an extra ``"similarity"`` column.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n is not None and top_n < 0:
        raise ValueError("top_n must be non-negative or None")

    frame = df if data is None else data

    scored = []    # (row position, similarity score)
    failures = []  # (row label, exception) for titles that could not be scored
    for position, (label, title) in enumerate(frame["title"].items()):
        try:
            score = nlp(title.lower()).similarity(tags)
        except Exception as exc:
            # Best effort: one unusable title (e.g. a non-string value) must
            # not abort the whole ranking.
            failures.append((label, exc))
            continue
        scored.append((position, score))

    if failures:
        first_label, first_error = failures[0]
        print("find_similarity: skipped {} title(s); first failure at row {!r}: {}".format(
            len(failures), first_label, first_error))

    # sorted() is stable, so equally scored titles keep their original order.
    best = sorted(scored, key=lambda pair: pair[1], reverse=True)[:top_n]
    positions = [position for position, _ in best]
    scores = [score for _, score in best]

    # Select by position rather than by label so a frame with repeated index
    # labels cannot return extra rows that fall out of step with the scores.
    ranked = frame.iloc[positions].reset_index(drop=True)
    return ranked.assign(similarity=scores)

# TASK 1
## What is known about transmission, incubation, and environmental stability?

In [5]:
# Task 1 query: a few hand-picked keywords stand in for the question above.
# find_similarity compares the parsed query with every paper *title* (abstract and
# body text are not scored) and returns the best matches plus a "similarity" column.
tags = nlp("transmission incubation environment seasonality")
task1 = find_similarity(tags)
task1

Unnamed: 0,title,abstract,text,similarity
0,Environmental Conditions Affect Exhalation of ...,The seasonality of influenza virus infections ...,Influenza viruses display distinct seasonal pa...,0.750491
1,Host genetics determine susceptibility to avia...,antibody. Genetic resistance to a novel H9N2 v...,immunological reagents 4 . Mice have been wide...,0.723156
2,Environmental Factors Affecting the Transmissi...,Many viruses are capable of infecting the huma...,Viral respiratory tract infections are both ub...,0.72042
3,The ecology and adaptive evolution of influenz...,This is an open access article under the terms...,Influenza A virus (IAV) has caused significant...,0.714105
4,Ionizing air affects influenza virus infectivi...,Active ionizer prevented 100% (4/4) of guinea ...,Visualization and efficiency of aerosol sampli...,0.712272
5,HVAC filtration for controlling infectious air...,This work describes and applies a methodology ...,The airborne transmission of respiratory patho...,0.711666
6,Adaptive Contact Networks Change Effective Dis...,Human societies are organized in complex webs ...,During recent years it has become clear that d...,0.703346
7,Sequential Adaptive Mutations Enhance Efficien...,The adaptation of Chikungunya virus (CHIKV) to...,The potential of RNA viruses to emerge into ne...,0.702998
8,Association between viral seasonality and mete...,We examined the effects of daily local meteoro...,The common cold is typically a mild upper resp...,0.702115
9,Spatially Adjusted time- varying Reproductive ...,the basic reproductive number (R 0 ) is a fund...,The basic reproductive number (R 0 ) is a fund...,0.700947


In [6]:
# Persist the Task 1 ranking. The output folder is created first so the export
# also works on a fresh checkout where csvfiles/tasks does not exist yet.
os.makedirs('csvfiles/tasks', exist_ok=True)
task1.to_csv('csvfiles/tasks/task1.csv', index=False)

In [6]:
# Disabled preview of the Task 1 hits: for each row it would print the first field
# upper-cased between separator lines, followed by the second field.
# NOTE(review): r[0] / r[1] index the row by position; prefer r["title"] /
# r["abstract"] if this is ever re-enabled.
# for i, r in task1.iterrows():
#     print('------------------------')
#     print(r[0].upper())
#     print('------------------------')
#     print(r[1])
#     print('\n\n')

# TASK 2
## What do we know about COVID-19 risk factors? What have we learned from epidemiological studies?

In [7]:
# Task 2 query: risk-factor keywords. Paper titles are ranked against them by
# find_similarity; `tags` is rebound here, replacing the previous task's query.
tags = nlp('risk smoking infections pregnancy fatality')
task2 = find_similarity(tags)
task2

Unnamed: 0,title,abstract,text,similarity
0,Respiratory disease in pregnancy,Breathlessness in the absence of an underlying...,Severe chest disease leading to respiratory fa...,0.848921
1,Exercise and infection risk,"After studying this chapter, you should be abl...",Upper respiratory tract infections (URTI) such...,0.829904
2,Pregnancy and perinatal outcomes of women with...,Objective: This study was undertaken to evalua...,"Since November 2002, severe acute respiratory ...",0.82199
3,Severe Morbidity and Mortality Associated With...,Background. Respiratory syncytial virus (RSV) ...,Respiratory syncytial virus (RSV) initially wa...,0.820837
4,Clinical Presentation and Birth Outcomes Assoc...,Respiratory syncytial virus (RSV) is the most ...,Respiratory syncytial virus (RSV) is the most ...,0.815128
5,Risk factors for severe bronchiolitis caused b...,Severe bronchiolitis is the most common reason...,Bronchiolitis causes significant morbidity and...,0.81372
6,Clinical characteristics and outcomes of respi...,Objective: To describe the clinical presentati...,Infection with respiratory pathogens during pr...,0.813499
7,Risk of mortality associated with respiratory ...,Background: Respiratory syncytial virus (RSV) ...,Respiratory syncytial virus (RSV) is a signifi...,0.813145
8,Viruses causing lower respiratory symptoms in ...,Introduction Viral acute respiratory infection...,Viral acute respiratory infections (ARIs) are ...,0.812559
9,Statins May Decrease the Fatality Rate of Midd...,Citation Yuan S. 2015. Statins may decrease th...,T he recent paper by Totura and colleagues (1)...,0.811333


# TASK 3
## What do we know about virus genetics, origin, and evolution? What do we know about the virus origin and management measures at the human-animal interface?

In [8]:
# Task 3 query: keywords for virus genetics, origin and host range. Only paper
# titles are compared with the query inside find_similarity.
tags = nlp('genetics origin evolution animal range host')
task3 = find_similarity(tags)
task3

Unnamed: 0,title,abstract,text,similarity
0,The Evolution and Genetics of Virus Host Shifts,Emerging viral diseases are often the product ...,"Emerging infectious diseases affecting humans,...",0.847011
1,viruses The Broad Host Range and Genetic Diver...,Astroviruses are a diverse family of viruses t...,Astroviruses (AstVs) were first described in 1...,0.841576
2,"The diversity, evolution and origins of verteb...",Despite a substantial increase in our knowledg...,"Yong-Zhen Zhang 1,2 , Wei-Chen Wu 1 , Mang Shi...",0.825989
3,Evolution and Genetic Diversity of Porcine Cir...,The identification of a new circovirus (Porcin...,The Circovirus genus claimed veterinarians' at...,0.824489
4,Does genetic diversity limit disease spread in...,It is a commonly held view that genetically ho...,It seems to be conventional wisdom that geneti...,0.823535
5,Molecular evolution and emergence of avian gam...,"Coronaviruses, which are single stranded, posi...",The majority of emerging infectious diseases a...,0.819451
6,"Origin, Genetic Diversity, and Evolutionary Dy...",(porcine dermatitis and nephropathy syndrome)-...,Continuous epidemiological surveillance of eme...,0.816998
7,viruses Reliable and Standardized Animal Model...,"Starting in 2006, bluetongue virus serotype 8 ...","Amongst pathogens, RNA viruses were a major so...",0.8169
8,Genomic organization and adaptive evolution of...,Immunoglobulins are important elements of the ...,Immunoglobulins (Igs) are glycoprotein molecul...,0.815416
9,Animal virus ecology and evolution are shaped ...,12 The current classification of animal viruse...,"ecologies, poultry plus pig production and rum...",0.814338


# TASK 4
## What do we know about vaccines and therapeutics? What has been published concerning research and development and evaluation efforts of vaccines and therapeutics?

In [9]:
# Task 4 query: keywords for vaccines and therapeutics, ranked against paper titles.
tags = nlp('vaccines therapeutics drugs antibody effectiveness')
task4 = find_similarity(tags)
task4

Unnamed: 0,title,abstract,text,similarity
0,Therapeutics and Vaccines,An emerging respiratory infectious disease wit...,"A novel human coronavirus, Middle East respira...",0.885888
1,Monoclonal antibody-based therapies for microb...,The monoclonal antibody (mAb) revolution that ...,Historical The field of infectious diseases ha...,0.845974
2,Vaccines and Therapeutics Against Hantaviruses,Hantaviruses (HVs) are rodent-transmitted viru...,"In recent years, the repeated outbreak of hant...",0.8379
3,molecules Pharmacological and Biological Antiv...,Subtype B coxsackieviruses (CVB) represent the...,Myocarditis is defined as a subclinical inflam...,0.831365
4,New Class of Monoclonal Antibodies against Sev...,Background: The urgent medical need for innova...,A novel class of human monoclonal antibodies a...,0.82461
5,Therapy with CTLA4-Ig and an antiviral monoclo...,‡,www.sciencetranslationalmedicine.org/cgi/conte...,0.8206
6,Clinical development of monoclonal antibody- b...,"Today there are many licensed antiviral drugs,...",The innate immune response is the first-line d...,0.819956
7,Human monoclonal antibodies as candidate thera...,"The emergence of new pathogens, such as severe...",Emerging infectious diseases are infectious di...,0.81694
8,Clinical Medicine Review Advancements in Nucle...,Several viruses cause pulmonary infections due...,Many viruses show tropism for cells of the res...,0.816499
9,Therapeutic Applications of Monoclonal Antibodies,Researchers have sought therapeutic applicatio...,"I n 1975, Kohler and Milstein revolutionized t...",0.816157


# TASK 5
## What has been published about medical care? What has been published concerning surge capacity and nursing homes? What has been published concerning efforts to inform allocation of scarce resources? What do we know about personal protective equipment? What has been published concerning alternative methods to advise on disease management? What has been published concerning processes of care? What do we know about the clinical characterization and management of the virus?

In [10]:
# Task 5 query: keywords for medical care and protective equipment.
# NOTE(review): the task text also asks about surge capacity, nursing homes and
# resource allocation, none of which appear in the keywords - confirm this is intended.
tags = nlp('medical manifestations respirators technologies protection')
task5 = find_similarity(tags)
task5

Unnamed: 0,title,abstract,text,similarity
0,Resistance to synthetic blood penetration of N...,Background: Surgical N95 filtering facepiece r...,Surgical mask (SM) or facemask refers to the F...,0.802547
1,The efficacy of medical masks and respirators ...,We aimed to examine the efficacy of medical ma...,There is currently a lack of consensus around ...,0.799166
2,Cost-effectiveness analysis of N95 respirators...,Background: There are substantial differences ...,Healthcare workers (HCWs) are at increased ris...,0.788759
3,Clinical Infectious Diseases Healthcare Worker...,Background. Personal protective equipment (PPE...,Effective use of personal protective equipment...,0.784326
4,SPEECH INTELLIGIBILITY ASSESSMENT OF PROTECTIV...,Speech Intelligibility (SI) is the perceived q...,Speech Intelligibility (SI) and clear communic...,0.772141
5,Reusable elastomeric air-purifying respirators...,Background: Elastomeric air-purifying respirat...,The current pandemic influenza and previous ex...,0.766423
6,Protecting healthcare staff from severe acute ...,Guidelines issued by the Centers for Disease C...,Severe acute respiratory syndrome (SARS) is a ...,0.765372
7,Review of economic evaluations of mask and res...,Background: There has been increasing debate s...,Both the World Health Organisation (WHO) and t...,0.764374
8,The Respiratory Protection Effectiveness Clini...,Background: Although N95 filtering facepiece r...,Healthcare personnel (HCP) are exposed to resp...,0.762197
9,User acceptance of reusable respirators in hea...,"Background: Inclusion of reusable respirators,...",Health care workers (HCWs) face potentially ha...,0.756421


# TASK 6
## What do we know about the effectiveness of non-pharmaceutical interventions? What is known about equity and barriers to compliance for non-pharmaceutical interventions?

In [11]:
# Task 6 query: keywords for non-pharmaceutical interventions.
# NOTE(review): the query contains "pharmaceutical" although the task is about
# *non*-pharmaceutical measures, and "depending" looks like a leftover word. Both may
# pull in unrelated titles (the displayed hits include an investigational-drug paper);
# confirm the keyword choice.
tags = nlp('pharmaceutical closures ban gatherings depending')
task6 = find_similarity(tags)
task6

Unnamed: 0,title,abstract,text,similarity
0,Mass gatherings medicine: public health issues...,Mass gathering events are associated with majo...,A mass gathering is defined by WHO as a planne...,0.704077
1,Mass Gatherings and Public Health: Case Studie...,B A C K G R O U N D Many new and challenging r...,The World Health Organization describes a mass...,0.676145
2,Mass Gatherings Health 6 Research agenda for m...,Public health research is essential for the de...,"A mass gathering (MG), as defi ned by WHO, is ...",0.675742
3,Considerations for Use of Investigational Drug...,The paradigm for the use of investigational dr...,Infectious diseases are estimated to have kill...,0.675272
4,Mass Gatherings Health 1 Emergence of medicine...,Although defi nitions of mass gatherings (MG) ...,Defi nitions of mass gatherings (MGs) vary gre...,0.668726
5,An integrative approach to enhancing small-sca...,"Background: In Asian countries, small-scale ru...",Interest in an integrative approach has been i...,0.66779
6,Could influenza transmission be reduced by res...,Introduction: Mass gatherings (MG) may provide...,It is well established that influenza is trans...,0.665104
7,Effect of non-pharmaceutical interventions for...,The COVID-19 outbreak containment strategies i...,"could have shown a 3-fold, 7-fold, and 18-fold...",0.664495
8,Indications for healthcare surge capacity in E...,European healthcare systems face rapidly incre...,"In the past days and weeks, it has become clea...",0.66289
9,Making difficult ethical decisions in patient ...,OBJECTIVE: Recent experiences in the United St...,"O n Tuesday, September 11, 2001 , the American...",0.659594


# TASK 7
## What do we know about diagnostics and surveillance? What has been published concerning a systematic, holistic approach to diagnostics (from the public health surveillance perspective to being able to predict clinical outcomes)?

In [12]:
# Task 7 query: keywords for diagnostics and surveillance.
# NOTE(review): "surveillance policy" is also part of the Task 10 query, so the two
# rankings may overlap - confirm that is acceptable.
tags = nlp("diagnostics surveillance policy technology")
task7 = find_similarity(tags)
task7

Unnamed: 0,title,abstract,text,similarity
0,Internet-based surveillance systems for monito...,Emerging infectious diseases present a complex...,Emerging infectious diseases are of particular...,0.807831
1,ScienceDirect Development of the electronic su...,This paper presents the electronic surveillanc...,"To achieve the goal for the total health care,...",0.794128
2,BMC Public Health Beyond traditional surveilla...,Background: All countries need effective disea...,"In this paper, we review examples of these nov...",0.79338
3,Advanced Querying Features for Disease Surveil...,Most automated disease surveillance systems no...,"In its 2007 annual report, the World Health Or...",0.790191
4,Smart home technology for telemedicine and eme...,"With the ageing population, mobility is an imp...","In the past decade, one of the fastest growing...",0.784368
5,Evaluating Hospital-Based Surveillance for Out...,The International Health Regulations outline c...,We propose a framework to evaluate the sensiti...,0.77255
6,Integrated Sensor Systems and Data Fusion for ...,"As stated by John Naisbitt in his bestseller ""...",and data links. After conceiving also algorith...,0.769337
7,Leveraging the Laboratory Response Network Mod...,Promoting global health security as an interna...,"I n recent years, considerable resources have ...",0.769135
8,JAMIA Focus on Media-based Biosurveillance Mod...,A b s t r a c t Objective: Unstructured electr...,"Internet-based resources, such as online newsp...",0.766927
9,Application of next generation sequencing tech...,The surveillance and prevention of pathogenic ...,The prevention and control of contamination ca...,0.761484


# TASK 8
- Are there geographic variations in the rate of COVID-19 spread?
- Are there geographic variations in the mortality rate of COVID-19?
- Is there any evidence to suggest geographic based virus mutations?

In [11]:
# Task 8 is unfinished: only the country list is built and displayed.
# `tags8` is not referenced anywhere else in the visible notebook, and `relevant`
# is only used by the disabled loop below.
tags8 = []
# Lower-cased name of every entry in pycountry.countries.
countries = [c.name.lower() for c in pycountry.countries]
relevant = []

# Disabled draft: collect the index of every row whose text mentions a country name.
# NOTE(review) before re-enabling:
#   - the `break` closes the body of the `for i` loop, so only the first field of
#     each row is ever scanned;
#   - `country in sentence.lower()` is a substring test, so e.g. 'cuba' also
#     matches "incubation";
#   - official names such as 'bolivia, plurinational state of' are unlikely to
#     occur verbatim in running text;
#   - a row id is appended once per match, so `relevant` can hold duplicates;
#   - the bare `except: pass` hides every error.
# for id_, row in df.iterrows():
#     for i in range(len(row)):
#         try:
#             sentences = row[i]
#             sentences = sentences.split(". ")
#             for sentence in sentences:
#                 for country in countries:
#                     if country in sentence.lower():
#                         relevant.append(id_)
#         except:
#             pass
#         break
countries
# geo_df = df.loc[relevant]
# geo_df.head()

['aruba',
 'afghanistan',
 'angola',
 'anguilla',
 'åland islands',
 'albania',
 'andorra',
 'united arab emirates',
 'argentina',
 'armenia',
 'american samoa',
 'antarctica',
 'french southern territories',
 'antigua and barbuda',
 'australia',
 'austria',
 'azerbaijan',
 'burundi',
 'belgium',
 'benin',
 'bonaire, sint eustatius and saba',
 'burkina faso',
 'bangladesh',
 'bulgaria',
 'bahrain',
 'bahamas',
 'bosnia and herzegovina',
 'saint barthélemy',
 'belarus',
 'belize',
 'bermuda',
 'bolivia, plurinational state of',
 'brazil',
 'barbados',
 'brunei darussalam',
 'bhutan',
 'bouvet island',
 'botswana',
 'central african republic',
 'canada',
 'cocos (keeling) islands',
 'switzerland',
 'chile',
 'china',
 "côte d'ivoire",
 'cameroon',
 'congo, the democratic republic of the',
 'congo',
 'cook islands',
 'colombia',
 'comoros',
 'cabo verde',
 'costa rica',
 'cuba',
 'curaçao',
 'christmas island',
 'cayman islands',
 'cyprus',
 'czechia',
 'germany',
 'djibouti',
 'dominica'

# TASK 9
## What has been published concerning ethical considerations for research? What has been published concerning social sciences in the outbreak response?

In [13]:
# Task 9 query: keywords for ethical and social-science considerations, ranked
# against paper titles by find_similarity.
tags = nlp("ethical social education access fear")
task9 = find_similarity(tags)
task9

Unnamed: 0,title,abstract,text,similarity
0,Public Perception on Healthcare Services: Evid...,Social media has been used as data resource in...,Investigating public perception of healthcare ...,0.847087
1,Collaborative accountability for sustainable p...,The sustainability of public health practices ...,The sustainability of public health practices ...,0.84596
2,Crisis communication in context: Cultural and ...,This study analyzes academic journal articles ...,Much existing research analyzes the descriptiv...,0.841431
3,Implementing a One Health approach to emerging...,Background: 'One Health' represents a call for...,The recent Ebolavirus (EBOV) outbreak in West ...,0.835882
4,Informed public against false rumor in the soc...,This study explores how the public's belief in...,People often tend to seek more information in ...,0.833767
5,Social determinants of health inequalities: to...,A systems approach offers a novel conceptualiz...,Health outcomes are increasingly perceived fro...,0.833575
6,BMC Public Health Identifying strategies to im...,Background: Movement towards evidence-based pr...,The need for improved access to high quality p...,0.833431
7,Unraveling the complexities of disaster manage...,Complexity is a useful frame of reference for ...,"Recent disasters, such as the oil spill in the...",0.83314
8,Risk Management and Healthcare Policy Dovepres...,The devastation caused by the Ebola virus dise...,The magnitude of the recent Ebola virus diseas...,0.833116
9,Risk communication and management in public he...,As the world faces its first influenza pandemi...,As the world faces its first influenza pandemi...,0.832785


# TASK 10
## What has been published about information sharing and inter-sectoral collaboration? What has been published about data standards and nomenclature? What has been published about governmental public health? What do we know about risk communication? What has been published about communicating with high-risk populations? What has been published to clarify community measures? What has been published about equity considerations and problems of inequity?

In [14]:
# Task 10 query: keywords for governmental public health and information sharing.
# NOTE(review): "published" is a generic word taken from the task wording and
# "surveillance policy" repeats the Task 7 query - confirm the keyword choice.
tags = nlp('government published surveillance policy')
task10 = find_similarity(tags)
task10

Unnamed: 0,title,abstract,text,similarity
0,The International Health Regulations: The Gove...,r The International Health Regulations (IHR) a...,The IHR's origins can be traced to a series of...,0.785571
1,Global Health and Foreign Policy,Health has long been intertwined with the fore...,Global health issues have long been a concern ...,0.78321
2,Capacity of Public Health Surveillance to Comp...,Public health surveillance is essential for de...,T he 2005 revisions to the International Healt...,0.780242
3,Health security capacities in the context of C...,"Background Public health measures to prevent, ...",Coronaviruses are RNA viruses that are found w...,0.779393
4,Investigating public health emergency response...,Infectious diseases pose a great danger to pub...,Recent years have witnessed a number of microb...,0.777344
5,Pandemic H1N1 in Canada and the use of evidenc...,When responding to a novel infectious disease ...,An effective public health response to a novel...,0.774028
6,Public health human resources: a comparative a...,Background: Amidst concerns regarding the capa...,The increase in communicable and non-communica...,0.772878
7,BMC Public Health Establishing a nationwide em...,Background: With international concern over em...,groups starting in November 2004 and two clear...,0.769025
8,Steps to a Sustainable Public Health Surveilla...,At a time when populations are changing and di...,Online Journal of Public Health Informatics * ...,0.768514
9,Pandemic planning in pediatric care: A website...,Objectives: This study investigates current po...,The impact of pandemic H1N1 (2009) has raised ...,0.764343


In [15]:
# Persist the remaining task rankings (Task 1 is written by its own cell and
# Task 8 produces no ranking). The output folder is created first so the export
# also works on a fresh checkout where csvfiles/tasks does not exist yet.
os.makedirs('csvfiles/tasks', exist_ok=True)
for task_number, task_frame in [
    (2, task2),
    (3, task3),
    (4, task4),
    (5, task5),
    (6, task6),
    (7, task7),
    (9, task9),
    (10, task10),
]:
    task_frame.to_csv('csvfiles/tasks/task{}.csv'.format(task_number), index=False)