In [28]:
from newspaper import Article
import random
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [29]:
#Download the source article; its plain text becomes the chatbot's knowledge base
article = Article('https://medium.com/@yamanpatle398/covid-19-faqs-f4773f6e8ebd')
article.download()  #fetch the page behind the URL
article.parse()  #must run after download(); article.text is read below
#NOTE(review): nothing produced by nlp() is read in the visible cells - confirm it is still needed
article.nlp()
corpus = article.text  #raw article text, used by the tokenization cell

#Print the corpus/text
print(corpus)

Q&A on Corona Virus (COVID-19)

WHO is continuously monitoring and responding to this outbreak. This Q&A will be updated as more is known about COVID-19, how it spreads and how it is affecting people worldwide.

For more information, check back regularly on WHO’s coronavirus pages. https://www.who.int/emergencies/diseases/novel-coronavirus-2019

What is a coronavirus?

Coronaviruses are a large family of viruses that may cause illness in animals or humans. In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS). The most recently discovered coronavirus causes coronavirus disease COVID-19.

What is COVID-19?

COVID-19 is an infectious disease caused by the most recently discovered coronavirus. This new virus and disease were unknown before the outbreak began in Wuhan, China, in December 2019.

What are the symptoms of COVID-

In [30]:
#Tokenization
#`text` is an alias of the corpus; the LemNormalize demo cell also reads it
text = corpus
#sent_tokens is the global sentence list that response() searches for an answer.
#As the printed output shows, a heading without end punctuation stays glued to the
#sentence that follows it (joined by '\n\n'), so some entries hold more than one line.
sent_tokens = nltk.sent_tokenize(text) #Convert the text into a list of sentences

#Print the list of sentences
print(sent_tokens)

['Q&A on Corona Virus (COVID-19)\n\nWHO is continuously monitoring and responding to this outbreak.', 'This Q&A will be updated as more is known about COVID-19, how it spreads and how it is affecting people worldwide.', 'For more information, check back regularly on WHO’s coronavirus pages.', 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019\n\nWhat is a coronavirus?', 'Coronaviruses are a large family of viruses that may cause illness in animals or humans.', 'In humans, several coronaviruses are known to cause respiratory infections ranging from the common cold to more severe diseases such as Middle East Respiratory Syndrome (MERS) and Severe Acute Respiratory Syndrome (SARS).', 'The most recently discovered coronavirus causes coronavirus disease COVID-19.', 'What is COVID-19?', 'COVID-19 is an infectious disease caused by the most recently discovered coronavirus.', 'This new virus and disease were unknown before the outbreak began in Wuhan, China, in December 2019.', '

In [31]:
#Translation table for str.translate: every punctuation code point maps to None,
#which makes translate() delete that character
remove_punct_dict = dict.fromkeys(map(ord, string.punctuation))

#Show the punctuation characters the table is built from
print(string.punctuation)

#Show the resulting code point -> None table
print(remove_punct_dict)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
{33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}


In [32]:
#Create a function to return a list of lower case word tokens after removing punctuations
def LemNormalize(text):
  """Lower-case ``text``, delete punctuation and split it into word tokens.

  Characters listed in ``remove_punct_dict`` are removed before tokenizing.
  Despite the function's name, no lemmatization step is performed here: the
  tokens are returned exactly as ``nltk.word_tokenize`` produces them.

  Parameters
  ----------
  text : str
      The text to normalize.

  Returns
  -------
  list of str
      The word tokens of the lower-cased, punctuation-free text.
  """
  lowered = text.lower()
  without_punct = lowered.translate(remove_punct_dict)
  return nltk.word_tokenize(without_punct)

#Print the tokenization text
normalized_tokens = LemNormalize(text)
print(normalized_tokens)

['qa', 'on', 'corona', 'virus', 'covid19', 'who', 'is', 'continuously', 'monitoring', 'and', 'responding', 'to', 'this', 'outbreak', 'this', 'qa', 'will', 'be', 'updated', 'as', 'more', 'is', 'known', 'about', 'covid19', 'how', 'it', 'spreads', 'and', 'how', 'it', 'is', 'affecting', 'people', 'worldwide', 'for', 'more', 'information', 'check', 'back', 'regularly', 'on', 'who', '’', 's', 'coronavirus', 'pages', 'httpswwwwhointemergenciesdiseasesnovelcoronavirus2019', 'what', 'is', 'a', 'coronavirus', 'coronaviruses', 'are', 'a', 'large', 'family', 'of', 'viruses', 'that', 'may', 'cause', 'illness', 'in', 'animals', 'or', 'humans', 'in', 'humans', 'several', 'coronaviruses', 'are', 'known', 'to', 'cause', 'respiratory', 'infections', 'ranging', 'from', 'the', 'common', 'cold', 'to', 'more', 'severe', 'diseases', 'such', 'as', 'middle', 'east', 'respiratory', 'syndrome', 'mers', 'and', 'severe', 'acute', 'respiratory', 'syndrome', 'sars', 'the', 'most', 'recently', 'discovered', 'coronavi

In [33]:
#Keyword Matching

#Greeting Inputs
GREETING_INPUTS = ["hi", "hello", "hola", "greetings", "wassup", "hey"]

#Greeting responses back to the user
GREETING_RESPONSES=["howdy", "hi", "hey", "what's good", "hello", "hey there"]

#Function to return a random greeting response to a users greeting
def greeting(sentence):
  """Return a random greeting if ``sentence`` contains a greeting word.

  The sentence is split on whitespace; each word is compared, case-insensitively
  and with surrounding punctuation removed, against ``GREETING_INPUTS``.

  Parameters
  ----------
  sentence : str
      The user's input.

  Returns
  -------
  str or None
      A randomly chosen entry of ``GREETING_RESPONSES`` when a greeting word is
      found, otherwise ``None``.
  """
  for word in sentence.split():
    #Strip surrounding punctuation so inputs such as "Hello!" or "hi," still match
    if word.strip(string.punctuation).lower() in GREETING_INPUTS:
      return random.choice(GREETING_RESPONSES)
  #No greeting word found; callers test the result against None
  return None

In [34]:
#Generate the response
def response(user_response):
  """Return the corpus sentence that best matches the user's query.

  The lower-cased query is vectorized together with every sentence of the
  global ``sent_tokens`` list using TF-IDF (tokenized by ``LemNormalize``, with
  English stop words removed). The sentence with the highest cosine similarity
  to the query is returned. ``sent_tokens`` is only read, never modified.

  Parameters
  ----------
  user_response : str
      The user's query.

  Returns
  -------
  str
      The most similar sentence from ``sent_tokens``, or
      "I apologize, I don't understand." when ``sent_tokens`` is empty or no
      sentence has a non-zero similarity to the query.
  """
  no_match = "I apologize, I don't understand."

  #Make the query lower case
  user_response = user_response.lower()

  #Nothing to search in
  if not sent_tokens:
    return no_match

  #Vectorize the corpus sentences with the query as the last document.
  #A new list is built so the global sentence list is never mutated: an error
  #during vectorization can no longer leave the query behind in the corpus.
  documents = sent_tokens + [user_response]

  #Create a TfidfVectorizer Object
  TfidfVec = TfidfVectorizer(tokenizer = LemNormalize, stop_words='english')

  #Convert the text to a matrix of TF-IDF features
  tfidf = TfidfVec.fit_transform(documents)

  #Similarity of the query (last row) to the corpus sentences only. Leaving the
  #query out of the comparison means it can never be picked as its own answer,
  #even when a corpus sentence ties with it.
  vals = cosine_similarity(tfidf[-1], tfidf[:-1]).flatten()

  #Index and score of the most similar sentence
  idx = vals.argmax()
  score = vals[idx]

  #A score of 0 means no sentence shares a weighted term with the query
  if score == 0:
    return no_match
  return sent_tokens[idx]

In [35]:
#Interactive chat loop: read a line, answer it, stop on a farewell or a thank-you
flag = True
print("DOCBot: I am Doctor Bot or DOCBot for short. I will answer your queries about COVID-19. If you want to exit, type Bye!")
while flag:
  user_response = input()
  user_response = user_response.lower()

  #Normalized copy used only for command matching. The banner asks the user to
  #type "Bye!", so surrounding punctuation and whitespace must not prevent the
  #exit commands from being recognized.
  command = user_response.strip(string.punctuation + string.whitespace)

  if command == 'bye':
    flag = False
    print("DOCBot: Chat with you later !")
  elif command == 'thanks' or command == 'thank you':
    flag = False
    print("DOCBot: You are welcome !")
  else:
    #Call greeting() once and reuse the result for both the check and the reply
    greeting_reply = greeting(user_response)
    if greeting_reply is not None:
      print("DOCBot: "+greeting_reply)
    else:
      print("DOCBot: "+response(user_response))

DOCBot: I am Doctor Bot or DOCBot for short. I will answer your queries about COVID-19. If you want to exit, type Bye!
Corona Virus
DOCBot: Q&A on Corona Virus (COVID-19)

WHO is continuously monitoring and responding to this outbreak.
WHO
DOCBot: I apologize, I don't understand.
Cause
DOCBot: Coronaviruses are a large family of viruses that may cause illness in animals or humans.
Safety
DOCBot: Ensure good food safety practices at all times.
Wash
DOCBot: Wash them the same way you would in any other circumstance.
Food
DOCBot: There is currently no confirmed case of COVID-19 transmitted through food or food packaging.
Distancing
DOCBot: I apologize, I don't understand.
Social
DOCBot: I apologize, I don't understand.
Meter
DOCBot: Maintain at least 1 meter (3 feet) distance between yourself and anyone who is coughing or sneezing.
distance
DOCBot: Maintain at least 1 meter (3 feet) distance between yourself and anyone who is coughing or sneezing.
bye
DOCBot: Chat with you later !
