In [1]:
# Source article for the questions asked in the next cells
# (Mayo Clinic page on chronic kidney disease).
url = 'https://www.mayoclinic.org/diseases-conditions/chronic-kidney-disease/symptoms-causes/syc-20354521'
def _download_tokenizer_data(nltk):
    """Fetch the NLTK sentence-tokenizer data used by `chat_bot`.

    Both 'punkt' and 'punkt_tab' are requested: to my knowledge recent NLTK
    releases look up 'punkt_tab' while older ones use 'punkt'. The unused
    'wordnet' download from the original was dropped (nothing lemmatizes).

    Args:
        nltk: the already-imported ``nltk`` module.
    """
    import ssl

    # NOTE(review): the original disabled HTTPS certificate verification for
    # the whole process as a "machine specific" workaround. It is now applied
    # only while downloading and restored afterwards; remove it entirely if
    # your machine has working root certificates.
    saved_factory = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    try:
        for resource in ('punkt', 'punkt_tab'):
            nltk.download(resource, quiet=True)
    finally:
        ssl._create_default_https_context = saved_factory


def _fetch_sentences(url):
    """Download the article at `url` and return its text as a list of sentences."""
    from newspaper import Article
    import nltk

    article = Article(url)
    article.download()
    article.parse()
    # The original also called article.nlp(); nothing it produced was ever
    # read by this function, so the call is dropped.
    text = article.text

    try:
        return nltk.sent_tokenize(text)
    except LookupError:
        # Tokenizer data is not installed yet: fetch it once, then retry.
        _download_tokenizer_data(nltk)
        return nltk.sent_tokenize(text)


def _best_match(sentences, query):
    """Return ``(score, sentence)`` for the sentence most similar to `query`.

    Similarity is the cosine between TF-IDF vectors. `sentences` must be
    non-empty and `query` is expected to be lower-cased already.
    """
    import string
    import nltk
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Translation table that deletes every ASCII punctuation character.
    punct_table = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        # Lower-case, strip punctuation, then split into word tokens.
        return nltk.word_tokenize(text.lower().translate(punct_table))

    vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')

    # The query is vectorized together with the article (last row) so both
    # share one vocabulary and one set of IDF weights.
    tfidf = vectorizer.fit_transform(sentences + [query])

    # Compare the query row against the article rows only. Leaving the query
    # itself out means the result never depends on how ties with the query's
    # own (maximal) self-similarity happen to be ordered.
    similarities = cosine_similarity(tfidf[-1:], tfidf[:-1])[0]
    best = int(similarities.argmax())
    return similarities[best], sentences[best]


def chat_bot(url, user_in):
    """Answer `user_in` with the most similar sentence of the article at `url`.

    Args:
        url: address of the article to download and search.
        user_in: the user's question or statement (lower-cased before matching).

    Returns:
        A ``(score, robo_out)`` tuple. ``score`` is the cosine similarity of
        the best-matching sentence; ``robo_out`` is that sentence, or an
        apology when nothing in the article overlaps with the input
        (score 0) or the article yields no sentences at all.
    """
    import warnings

    fallback = " I apologize, I don't understand."

    # Silence library warnings only for the duration of this call instead of
    # changing the warning filters for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        sentences = _fetch_sentences(url)
        if not sentences:
            # Nothing to compare against (previously an IndexError).
            return 0.0, fallback

        score, sentence = _best_match(sentences, user_in.lower())

    # A score of 0 means no article sentence shares a term with the input.
    if score == 0:
        return score, fallback
    return score, sentence

In [2]:
# Ask for a definition, using the `url` assigned earlier in the notebook.
score, robo_out = chat_bot(url=url, user_in="What is chronic kidney disease?")
print(robo_out, f'Score: {score}')

Overview

Chronic kidney disease, also called chronic kidney failure, describes the gradual loss of kidney function. Score: 0.5068559627834549


In [3]:
# Ask about causes, again against the `url` assigned earlier in the notebook.
score, robo_out = chat_bot(url=url, user_in="What causes kidney disease?")
print(robo_out, f'Score: {score}')

Diseases and conditions that cause chronic kidney disease include:

Type 1 or type 2 diabetes

High blood pressure

Glomerulonephritis (gloe-mer-u-low-nuh-FRY-tis), an inflammation of the kidney's filtering units (glomeruli)

Interstitial nephritis (in-tur-STISH-ul nuh-FRY-tis), an inflammation of the kidney's tubules and surrounding structures

Polycystic kidney disease

Prolonged obstruction of the urinary tract, from conditions such as enlarged prostate, kidney stones and some cancers

Vesicoureteral (ves-ih-koe-yoo-REE-tur-ul) reflux, a condition that causes urine to back up into your kidneys

Recurrent kidney infection, also called pyelonephritis (pie-uh-low-nuh-FRY-tis)

Risk factors

Factors that may increase your risk of chronic kidney disease include:

Diabetes

High blood pressure

Heart and blood vessel (cardiovascular) disease

Smoking

Obesity

Being African-American, Native American or Asian-American

Family history of kidney disease

Abnormal kidney structure

Older age


In [4]:
# Point the bot at the Wikipedia article on dogs (rebinds the notebook-level `url`).
url = 'https://en.wikipedia.org/wiki/Dog'
score, robo_out = chat_bot(url=url, user_in="Dog sizes?")
print(robo_out, f'Score: {score}')

[65] The median longevity of mixed-breed dogs, taken as an average of all sizes, is one or more years longer than that of purebred dogs when all breeds are averaged. Score: 0.2920178776892433


In [5]:
# Point the bot at the Wikipedia COVID-19 article (rebinds the notebook-level `url`).
url = 'https://en.wikipedia.org/wiki/Coronavirus_disease_2019'
score, robo_out = chat_bot(url=url, user_in="Medications of the disease?")
print(robo_out, f'Score: {score}')

[128] Although new medications may take until 2021 to develop,[183] several of the medications being tested are already approved for other uses or are already in advanced testing. Score: 0.40400438758812446


In [6]:
# Same COVID-19 article; `url` is re-assigned so this cell runs on its own.
url = 'https://en.wikipedia.org/wiki/Coronavirus_disease_2019'
score, robo_out = chat_bot(url=url, user_in="how to prevent covid19?")
print(robo_out, f'Score: {score}')

Wuhan, China), animal species or groups of people in disease and virus names to prevent social stigma. Score: 0.21680310861171218
