In [None]:
# Uncomment the lines below to install Beautiful Soup and ipywidgets for the first time.
# ! pip install --upgrade pip
# ! pip install beautifulsoup4
# ! pip install ipywidgets

In [None]:
import json
from urllib.parse import urljoin

import requests
import tqdm.notebook as tq
from bs4 import BeautifulSoup

In [None]:
# Root of the site being scraped.
domain = 'https://patient.info/'

# Index page of the forum whose discussions are collected.
source_url = 'https://patient.info/forums/discuss/browse/coronavirus-covid-19--4541'

# Query-string template appended to a URL to request a given page.
page_number = '?page={}'

# Mapping handed to requests.get() by the scraping cells.
# NOTE(review): the Access-Control-* entries are normally CORS *response*
# headers; they probably have no effect when sent by a client -- confirm
# whether they are needed at all.
headers = dict([
    ('Access-Control-Allow-Origin', '*'),
    ('Access-Control-Allow-Methods', 'GET'),
    ('Access-Control-Allow-Headers', 'Content-Type'),
    ('Access-Control-Max-Age', '3600'),
    ('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'),
])

In [None]:
# Fetch the forum index page and work out how many index pages there are.
#
# `headers` must be passed by keyword: the second positional parameter of
# requests.get() is `params`, so a positional dict ends up in the query string
# and the custom headers (including the User-Agent) are never sent.
req = requests.get(source_url, headers=headers)
req.raise_for_status()  # fail early on an HTTP error instead of parsing an error page
source = BeautifulSoup(req.content, 'html.parser')

# The value of the last <option> element is taken as the highest page index;
# pages are later requested as ?page=0 .. ?page=<last>, hence the +1.
# NOTE(review): assumes every <option> on the page belongs to the page selector -- confirm.
page_options = source.find_all('option')
# Without any <option> there is no page selector, so treat the forum as a single page.
max_pages = int(page_options[-1].get('value')) + 1 if page_options else 1

In [None]:
# Walk every index page and collect the absolute URL of each non-pinned discussion.
discussion_urls = []
for page in range(max_pages):
    # `headers=` must be a keyword argument; positionally it would be treated
    # as `params` and appended to the query string instead.
    req = requests.get(source_url + page_number.format(page), headers=headers)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')
    # Pinned posts are excluded by the selector.
    for article in soup.select('li.disc-smr.cardb:not(.post-pinned)'):
        link = article.find('a')
        href = link.get('href') if link else None
        # Skip list items that carry no link rather than crashing on None.
        if href:
            # urljoin copes with hrefs that start with '/' (no '//' in the
            # result) as well as hrefs that are already absolute.
            discussion_urls.append(urljoin(domain, href))

In [None]:
# Download every discussion and build one record per discussion:
#   url, title, author (href of the author link), content and the list of
#   replies, each reply holding its own content plus its nested sub-replies.
discussions = []
for discussion_url in tq.tqdm(discussion_urls):
    # `headers=` must be a keyword argument; positionally it would be treated
    # as `params` and appended to the query string instead.
    req = requests.get(discussion_url, headers=headers)
    req.raise_for_status()
    soup = BeautifulSoup(req.content, 'html.parser')
    discussion = {
        'url': discussion_url,
        'title': soup.select_one('h1.u-h1.post__title').get_text().strip(),
        'author': soup.select_one('div.author h5.author__info a.author__name').get('href').strip(),
        'content': soup.select_one('div.post__content input').get('value').strip(),
        'replies': []
    }

    # Number of reply pages; a discussion without the pager element has one page.
    # A dedicated name is used so the notebook-level `max_pages` (the forum
    # index page count) is not overwritten by this cell.
    page_count_section = soup.select_one('div.comment-page')
    reply_pages = int(page_count_section.get('data-pagecount')) if page_count_section else 1

    for page in tq.tqdm(range(reply_pages)):
        req = requests.get(discussion_url + page_number.format(page), headers=headers)
        req.raise_for_status()
        page_soup = BeautifulSoup(req.content, 'html.parser')
        for post in page_soup.select('article.post.post__root'):
            reply_content = post.select_one('div.post__content input')
            # Root posts without a content <input> are ignored.
            if not reply_content:
                continue
            nested_inputs = post.select('ul.comments.comments--nested div.post__content input')
            discussion['replies'].append({
                'content': reply_content.get('value').strip(),
                'sub_replies': [nested.get('value').strip() for nested in nested_inputs],
            })
    discussions.append(discussion)

In [12]:
def transform(content):
    """Normalise a piece of scraped text.

    Every run of whitespace (spaces, tabs, carriage returns, newlines) is
    collapsed into a single space, leading/trailing whitespace is dropped and
    the result is lower-cased.

    Line breaks are treated as word separators: they are replaced by a space
    rather than deleted, so words on consecutive lines are not fused together.

    Args:
        content (str): raw text.

    Returns:
        str: the normalised, lower-cased text.
    """
    # str.split() with no argument splits on any whitespace, including
    # '\n' and '\r', and discards empty fields.
    return ' '.join(content.split()).lower()

In [13]:
# Normalise every text field of every scraped discussion in place.
for thread in discussions:
    for field in ('title', 'author', 'content'):
        thread[field] = transform(thread[field])
    for thread_reply in thread['replies']:
        thread_reply['content'] = transform(thread_reply['content'])
        # Slice assignment keeps the same list object, mirroring element-wise updates.
        thread_reply['sub_replies'][:] = [
            transform(nested_text) for nested_text in thread_reply['sub_replies']
        ]

In [14]:
# Write the collected discussions to disk as indented JSON.
# The context manager closes the file even if json.dump() raises.
with open("g25_patient_info.json", "w") as out_file:
    json.dump(discussions, out_file, indent=2)

In [None]:
# discussion{
#     title(String) : (h1 -> class: u-h1 post__title)
#     author(String) : (div -> class: author, h5 -> class:author__info, a -> class: author__name -> href)
#     content(String) : (div -> class: post__content, input -> value)
#     max_pages (Integer) : (div -> class: comment-page comment-page-loaded, data-pagecount)
#     replies([Reply]) : (article -> class: post post__root, value)
# }

# Reply{
#     content(String): (article -> class: post post__root, div -> class: post__content break-word, input -> value)
#     sub_replies([String]) : (ul -> class: comments comments--nested, div -> class: post__content, input -> value)
# }

In [None]:
# discussion{
#     url: (String)
#     title(String),
#     author(String),
#     content(String),
#     replies([Reply])
# }

# Reply{
#     content(String),
#     sub_replies([String])
# }