In [None]:
import numpy as np 
import pandas as pd
import json

import os
from tqdm import tqdm


In [None]:
# Map every JSON file name found under the Kaggle input tree to its full path.
# When the same file name occurs in several directories, the last one walked wins.
articles = {
    filename: os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk('/kaggle/input')
    for filename in filenames
    if filename.endswith(".json")
}

# Metadata table; its 'sha' column is used below to look articles up in `articles`.
metadata = pd.read_csv('/kaggle/input/CORD-19-research-challenge/metadata.csv')

In [None]:
# Collect abstract and body text for every metadata row whose JSON file is available.
# Each entry is {'file': path, 'abstract': str, 'content': str}.
literature = []
for _, row in tqdm(metadata.iterrows(), total=metadata.shape[0]):
    sha = row['sha']
    if pd.isna(sha):
        continue  # row has no associated parsed document

    # Only the lookup is allowed to miss; previously a blanket `except KeyError`
    # also hid malformed documents and silently dropped them.
    # NOTE(review): a 'sha' cell holding several ids would not match any file name
    # here - confirm the column format against the dataset documentation.
    path = articles.get(str(sha) + '.json')
    if path is None:
        continue

    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    # Paragraphs are concatenated without a separator, as before.
    abstract = "".join(part.get('text', '') for part in data.get('abstract', []))
    text = "".join(part.get('text', '') for part in data.get('body_text', []))
    literature.append({'file': path, 'abstract': abstract, 'content': text})
        

In [None]:
# Install scispaCy into the notebook environment.
# NOTE(review): the version is not pinned, while the model installed in the next cell is v0.2.4 - confirm they are compatible.
!pip install scispacy


In [None]:
# Install the en_core_sci_lg v0.2.4 model package from the scispaCy release archive.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.4/en_core_sci_lg-0.2.4.tar.gz


In [None]:
import scispacy
import spacy

In [None]:
# Load the "en_core_sci_lg" pipeline once; `to_vector` below calls this global `nlp`.
nlp = spacy.load("en_core_sci_lg")

In [None]:
def preprocess(doc):
    """Lower-case ``doc``, split it into sentences and tokenise each sentence.

    Follows the same steps as the tokenisation cell further down the notebook
    (lower-case -> sentence split -> strip punctuation -> word tokenise ->
    drop English stop words), without the lemmatisation step.

    Parameters
    ----------
    doc : str
        Raw text to preprocess.

    Returns
    -------
    list[list[str]]
        One list of remaining word tokens per sentence, in document order.
        An empty string yields an empty list.

    Notes
    -----
    Requires the NLTK tokenizer and stop-word data to be available locally.
    """
    # Imported here because the notebook only imports nltk in a later cell;
    # this keeps the function usable regardless of cell execution order.
    import string
    from nltk import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords

    # The original body read `raw` and `tokens` before they were ever assigned.
    raw = doc.lower()
    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)

    tokens = [word_tokenize(sentence.translate(table)) for sentence in sent_tokenize(raw)]
    words = [[word for word in sentence if word not in stop_words] for sentence in tokens]
    return words

In [None]:
def to_vector(doc):
    """Run ``doc`` through the notebook-level ``nlp`` pipeline and return the result."""
    processed = nlp(doc)
    return processed

In [None]:
# One processed abstract per article, in the same order as `literature`.
vectors = [to_vector(entry["abstract"]) for entry in tqdm(literature)]

In [None]:
# Keyword query used to pick the relevant articles by abstract similarity.
# Adjacent literals are concatenated; the resulting string is unchanged.
question = (
    "covid 19 sars pneumonia covid-19 coronavirus "
    "medical care surge capacity and nursing homes  "
    "allocation of scarce resources personal protective equipment ppe "
    "disease management  processes of care "
    "clinical characterization and management of the virus"
)
vec_question = to_vector(question)

In [None]:
# Display the `has_vector` flag of the processed question.
vec_question.has_vector

In [None]:
import warnings
# Discard every warning filter installed so far, restoring the filter list to empty.
warnings.resetwarnings()

In [None]:
# Similarity of every abstract to the question.
# `similarity[i]` must stay aligned with `vectors[i]` / `literature[i]`, because the
# following cells index those lists with positions taken from this array.
# Documents without a vector therefore get -inf (never selected as a best match)
# instead of being skipped, which used to shift all later positions.
similarity = np.array([
    vec.similarity(vec_question) if vec.has_vector else -np.inf
    for vec in tqdm(vectors)
])

In [None]:
# Look up `vectors` at the position of the highest score in `similarity`.
vectors[np.argmax(similarity)]

In [None]:
import heapq

# Positions of the 1000 highest similarity scores, best match first.
candidate_positions = range(len(similarity))
ind = heapq.nlargest(1000, candidate_positions, key=similarity.take)

In [None]:
# Print each selected entry of `vectors`, in ranking order.
for position in ind:
    print(vectors[position])

In [None]:
# Select the ranked entries of `literature` through NumPy fancy indexing.
literature_array = np.array(literature)
selected_positions = np.array(ind)
medical_care_articles = literature_array[selected_positions]


In [None]:
# Persist the selected positions, one per line.
# fmt="%d" writes them as plain integers; savetxt's default format would store
# these integer indices as floats in scientific notation.
np.savetxt("indices_medical_care.csv", np.array(ind), fmt="%d")

In [None]:
# Display the shape of the selected-articles array.
medical_care_articles.shape

In [None]:
# Body text ('content') of each selected article, in ranking order.
texts = [entry["content"] for entry in medical_care_articles]

In [None]:
import string

import nltk
from nltk import word_tokenize, RegexpTokenizer,PunktSentenceTokenizer, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer  # kept available for an optional stemming step

# Loop-invariant helpers: built once instead of once per article.
punctuation_table = str.maketrans('', '', string.punctuation)
# Extend this set with any domain-specific stop words that turn out to be relevant.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def _tokenise_article(text):
    """Turn one article body into lemmatised word tokens, grouped by sentence.

    Steps: lower-case -> sentence split -> strip punctuation -> word tokenise ->
    drop English stop words -> lemmatise.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    list[list[str]]
        One list of lemmatised tokens per sentence.
    """
    # Lower-casing first makes the stop-word comparison case-insensitive.
    sentences = sent_tokenize(text.lower())
    # Stripping punctuation also removes '-', so hyphenated words are fused together.
    stripped = [sentence.translate(punctuation_table) for sentence in sentences]
    tokenised = [nltk.word_tokenize(sentence) for sentence in stripped]
    filtered = [[word for word in sentence if word not in stop_words] for sentence in tokenised]
    # The previous per-article Porter stemming pass was dropped: its result was never
    # used (only the lemmatised tokens were stored), so it cost time for nothing.
    return [[lemmatizer.lemmatize(word) for word in sentence] for sentence in filtered]


# One entry per selected article: a list of sentences, each a list of tokens.
Interest = [_tokenise_article(text) for text in texts]

# Report a summary only: printing the whole structure floods the notebook output,
# and the old label said "abstracts" although article bodies are processed here.
print('Tokenised articles:', len(Interest))


In [None]:
# Flat list of all sentences from the selected articles, in article order.
sentences = [
    sentence
    for entry in tqdm(medical_care_articles)
    for sentence in sent_tokenize(entry["content"])
]

In [None]:
# Number of sentences collected from the selected articles.
len(sentences)

In [None]:
# Processed form of every sentence, in the same order as `sentences`.
vec_sentences = [to_vector(sentence_text) for sentence_text in tqdm(sentences)]

In [None]:
# Task question used to rank individual sentences; replaces the earlier keyword query.
# Adjacent literals are concatenated; the resulting string is unchanged.
question = (
    "Efforts to determine adjunctive and supportive interventions "
    "that can improve the clinical outcomes of infected patients "
    "(e.g. steroids, high flow oxygen)"
)
vec_question = to_vector(question)

In [None]:
# Similarity of every sentence to the current question.
# `similarity_sen[i]` must stay aligned with `vec_sentences[i]`, because the next
# cells index `vec_sentences` with positions taken from this array.
# Sentences without a vector therefore get -inf (never selected as a best match)
# instead of being skipped, which used to shift all later positions.
similarity_sen = np.array([
    vec.similarity(vec_question) if vec.has_vector else -np.inf
    for vec in tqdm(vec_sentences)
])

In [None]:
# Positions of the 10 highest sentence scores, best match first.
sentence_positions = range(len(similarity_sen))
ind2 = heapq.nlargest(10, sentence_positions, key=similarity_sen.take)

In [None]:
# The top-ranked processed sentences, in ranking order.
res = [vec_sentences[position] for position in ind2]

In [None]:
# results = {}  # run once to initialise (or reset) the dict that collects the answers per question

In [None]:
# `results` maps each question string to its top-ranked sentences and is meant to
# accumulate across re-runs with different questions. Create it on first use so the
# notebook also works from a fresh kernel, where the name is otherwise undefined.
if "results" not in globals():
    results = {}
results[question] = res

In [None]:
# Display the accumulated question -> sentences mapping.
results

In [None]:
import json

# Convert keys and values of `results` to plain strings so they can be serialised.
second = {
    str(query): [str(sentence) for sentence in results[query]]
    for query in results.keys()
}

json_ = json.dumps(second)
print(json_)

In [None]:
# Write the string-only results mapping to disk as JSON.
with open('results.json', 'w') as out_file:
    json.dump(second, fp=out_file)