In [12]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import urllib
import requests

In [25]:
def search(query):
    """Build the Bing search URL for ``query``.

    Parameters
    ----------
    query : str
        Free-text search phrase. It is form-encoded with ``quote_plus``
        (spaces become ``+``, reserved characters are percent-escaped).

    Returns
    -------
    str
        ``http://www.bing.com/search?q=<encoded query>``.
    """
    # A bare `import urllib` does not guarantee that the `urllib.parse`
    # submodule is loaded; import it explicitly so this works on a fresh
    # kernel regardless of what other libraries happen to import.
    from urllib.parse import quote_plus

    return "http://www.bing.com/search?q=%s" % quote_plus(query)

In [26]:
# Request headers passed to requests.get() by bing(); the User-Agent is a
# desktop Firefox identification string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:54.0) '
        'Gecko/20100101 Firefox/54.0'
    ),
}

In [173]:
def begins_with(full_text, sub_text):
    """Return True when ``full_text`` starts with ``sub_text``.

    The leading ``len(sub_text)`` items of ``full_text`` are compared with
    ``sub_text``; an empty ``sub_text`` therefore always matches.
    """
    return full_text[:len(sub_text)] == sub_text

def include_content(cur_text, tag):
    """Decide whether an anchor should be kept as a search result.

    Parameters
    ----------
    cur_text : str
        The anchor's text.
    tag : object
        The anchor element; must expose ``attrs`` and item access for
        ``'href'``.

    Returns
    -------
    bool
        True only when ``cur_text`` is non-empty, does not start with one of
        the excluded prefixes, and ``tag`` has an ``href`` that starts with
        ``'https://'``.
    """
    if len(cur_text) == 0:
        return False

    # Link texts starting with any of these prefixes are rejected.
    skipped_prefixes = ('Click to view', 'See more on', 'Advertise', 'Help', 'Image:')
    if any(begins_with(cur_text, prefix) for prefix in skipped_prefixes):
        return False

    # Only anchors with an https target are accepted.
    return 'href' in tag.attrs and begins_with(tag['href'], 'https://')

def print_sres(content_list):
    """Print the text and URL of every anchor accepted by ``include_content``.

    Parameters
    ----------
    content_list : iterable
        Anchor elements exposing ``get_text()`` and ``['href']``
        (e.g. the result of ``soup.find_all('a')``).

    Each accepted anchor is printed as its stripped text, a newline, its
    ``href``, and a trailing blank line.
    """
    for c in content_list:
        cur_text = c.get_text().strip()
        # include_content takes (text, tag); the previous one-argument call
        # raised TypeError before anything could be printed.
        if include_content(cur_text, c):
            print(cur_text + "\n" + c['href'] + "\n")
        
            
def bing(search_term, timeout=10):
    """Query Bing for ``search_term`` and collect the accepted result links.

    Parameters
    ----------
    search_term : str
        Phrase to search for; encoded into the URL by ``search``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Defaults to 10 seconds so a stalled
        connection raises instead of blocking the kernel forever.

    Returns
    -------
    tuple of (list, list, list)
        ``feat_doc``   - stripped text of every anchor accepted by
                         ``include_content``;
        ``feat_links`` - the matching ``href`` values, in the same order;
        ``feat_wiki``  - for hrefs under ``https://en.wikipedia.org/``, the
                         remainder of the URL after that prefix.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    wiki_prefix = 'https://en.wikipedia.org/'
    feat_doc = []
    feat_links = []
    feat_wiki = []

    url = search(search_term)
    # requests applies no timeout unless one is given explicitly.
    r = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(r.content, 'html5lib')

    # collecting terms
    for c in soup.find_all('a'):
        cur_text = c.get_text().strip()
        if not include_content(cur_text, c):
            continue

        href = c['href']
        feat_doc.append(cur_text)
        feat_links.append(href)

        if begins_with(href, wiki_prefix):
            # Slice off only the leading prefix; str.replace would also
            # remove any later occurrence inside the URL.
            feat_wiki.append(href[len(wiki_prefix):])

    return feat_doc, feat_links, feat_wiki

### Case 1

In [190]:
# Case 1, first phrasing: scrape Bing and keep the link texts, URLs and
# Wikipedia paths returned by bing().
feat_doc, feat_links, feat_wiki = bing('non small cell lung cancer')

In [191]:
# Case 1, second phrasing (inverted, comma-separated form); results go into
# the *2 variables so both phrasings stay available side by side.
feat_doc2, feat_links2, feat_wiki2 = bing("carcinoma, non-small-cell lung")

In [192]:
# Show the collected link texts for both phrasings, each under its banner.
print("---STRING 1 ---\n", feat_doc, sep="\n")
print("\n---STRING 2 ---\n", feat_doc2, sep="\n")

---STRING 1 ---

['Non-Small-Cell Lung Cancer: Causes, Symptoms, and Treatment', 'What Is Non-Small-Cell Lung Cancer?', 'Types of Immunotherapy for Non', 'Non-Small-Cell Lung Cancer Treatments by St…', 'Treatment of Metastatic Non', 'Non-Small Cell Lung Cancer Treatment (PDQ®)–Patient ...', 'Non-Small Cell Lung Cancer - What You Need to Know', 'What Is Lung Cancer? | Types of Lung Cancer', 'What Is Non-Small Cell Lung Cancer? Symptoms, Treatment ...', 'Non-small-cell lung carcinoma - Wikipedia', 'Non-Small Cell Lung Cancer vs. Small Cell Lung Cancer ...', 'Small cell vs. non-small cell lung cancer: What are the ...', 'Lung cancer - Symptoms and causes - Mayo Clinic', 'Small Cell Lung Cancer Treatment (PDQ®)–Patient Version ...', 'What’s the Difference Between Small Cell Lung Cancer and Non-Small C…', 'Non-Small Cell Lung Cancer Treatment (PDQ®)—Patient ...', 'Overview of Stage 3 Lung Cancer Symptoms', 'Stage 4 Lung Cancer Symptoms']

---STRING 2 ---

['Non-Small-Cell Lung Cancer: Cause

In [193]:
# Show the collected URLs for both phrasings, each under its banner.
print("---LINK 1 ---\n", feat_links, sep="\n")
print("\n---LINK 2 ---\n", feat_links2, sep="\n")

---LINK 1 ---

['https://www.webmd.com/lung-cancer/non-small-cell-lung-cancer', 'https://www.webmd.com/lung-cancer/what-is-nsclc', 'https://www.webmd.com/lung-cancer/non-small-cell-lung-cancer-immunotherapy', 'https://www.webmd.com/lung-cancer/non-small-cell-lung-cancer-treatment-stage', 'https://www.webmd.com/lung-cancer/metastatic-nsclc-treatments', 'https://www.cancer.gov/types/lung/patient/non-small-cell-lung-treatment-pdq', 'https://www.drugs.com/cg/non-small-cell-lung-cancer.html', 'https://www.cancer.org/cancer/lung-cancer/about/what-is.html', 'https://www.emedicinehealth.com/non-small-cell_lung_cancer/article_em.htm', 'https://en.wikipedia.org/wiki/Non-small-cell_lung_carcinoma', 'https://www.medicinenet.com/non-small_cell_lung_cancer_vs_small_cell/article.htm', 'https://www.medicalnewstoday.com/articles/316477.php', 'https://www.mayoclinic.org/diseases-conditions/lung-cancer/symptoms-causes/syc-20374620', 'https://www.cancer.gov/types/lung/patient/small-cell-lung-treatment-pdq

In [182]:
# Display the Wikipedia paths found for each phrasing as a (first, second) pair.
feat_wiki, feat_wiki2

(['wiki/Non-small-cell_lung_carcinoma'],
 ['wiki/Non-small-cell_lung_carcinoma'])

### Case 2

In [183]:
# Case 2, first phrasing (possessive form).
# NOTE: this rebinds feat_doc / feat_links / feat_wiki, replacing the Case 1 results.
feat_doc, feat_links, feat_wiki = bing("parkinson's disease")

In [184]:
# Case 2, second phrasing (no possessive).
# NOTE: this rebinds feat_doc2 / feat_links2 / feat_wiki2, replacing the Case 1 results.
feat_doc2, feat_links2, feat_wiki2 = bing("parkinson disease")

In [185]:
# Display the Wikipedia paths found for each phrasing as a (first, second) pair.
feat_wiki, feat_wiki2

(['wiki/Parkinson%27s_disease', 'wiki/Parkinson%27s_disease'], [])

In [188]:
# Show the collected link texts for both phrasings, each under its banner.
print("---STRING 1 ---\n", feat_doc, sep="\n")
print("\n---STRING 2 ---\n", feat_doc2, sep="\n")

---STRING 1 ---


---STRING 2 ---

["Parkinson's disease - Symptoms and causes - Mayo Clinic", "Parkinson's Disease Center: Symptoms, Treatments, Causes ...", 'How to Eat Right If You Have Parkinson’s Dis…', 'Parkinson’s Disease: How to Spot the Signs a…', "The Basics on Parkinson's Disease", "What Is Parkinson's Disease?", "Parkinson's Disease - National Institute on Aging", "Scientists propose new theory of Parkinson's diseaseMedical News Today\xa0· 4d", 'In DepthDifferentiation of multiple system atrophy from …Nature\xa0· 5d', 'Parkinson’s Foundation: Better Lives. Together.', "Parkinson's Disease | PD | MedlinePlus", 'Parkinson Disease - What You Need to Know', 'What Is Parkinson Disease (PD)?PD is a long-term movement disorder. The brain cells that control movement start to die and cause changes in how you move, feel, and act. Even thou...', 'What Increases My Risk For PD?1. Age 60 years or older 2. A family history of PD 3. Exposure to chemicals, such as pesticides or herbicides'

In [189]:
# Show the collected URLs for both phrasings, each under its banner.
print("---LINK 1 ---\n", feat_links, sep="\n")
print("\n---LINK 2 ---\n", feat_links2, sep="\n")

---LINK 1 ---


---LINK 2 ---

['https://www.mayoclinic.org/diseases-conditions/parkinsons-disease/symptoms-causes/syc-20376055', 'https://www.webmd.com/parkinsons-disease/default.htm', 'https://www.webmd.com/parkinsons-disease/guide/eating-right-parkinsons', 'https://www.webmd.com/parkinsons-disease/guide/parkinsons-common-symptoms', 'https://www.webmd.com/parkinsons-disease/guide/understanding-parkinsons-disease-basics', 'https://www.webmd.com/parkinsons-disease/parkinsons-disease-overview', 'https://www.nia.nih.gov/health/parkinsons-disease', 'https://www.medicalnewstoday.com/articles/326989.php', 'https://www.nature.com/articles/s41598-019-52829-8', 'https://www.parkinson.org/understanding-parkinsons/what-is-parkinsons', 'https://medlineplus.gov/parkinsonsdisease.html', 'https://www.drugs.com/cg/parkinson-disease.html', 'https://www.drugs.com/cg/parkinson-disease.html', 'https://www.drugs.com/cg/parkinson-disease.html', 'https://www.drugs.com/cg/parkinson-disease.html', 'https://ww