<a href="https://colab.research.google.com/github/unburied/DiY-Help/blob/master/DiY_WebScrapeHelpSection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq

In [0]:
diy_url = 'https://support.discmakers.com/hc/en-us'

# open connection and grab page
uClient = uReq(diy_url)
page_html = uClient.read()
uClient.close()

In [0]:
# Get and parse the page
diy_soup = soup(page_html, 'html.parser')

# added to extracted urls for validity
PREPEND = 'https://support.discmakers.com' 

In [0]:
data = {} # store main page data

# Get blocks containing category urls
blocks = diy_soup.findAll('li',{'class':'blocks-item'})

for block in blocks:
  title = block.findAll('h4', {'class':"blocks-item-title"})[0].text
  link = block.a.get('href')
  full_link = PREPEND + link

  data[title] = full_link

del data['Contact Us']# removed as it does not contain articles
print(f'There are {len(data)} categories containing articles on this page') 

There are 10 categories containing articles on this page


In [0]:
sub_data = {} # for subpages
for k,v in data.items():
  subpage_url = v #get url from dict

  # create connection to webpage
  uClient = uReq(subpage_url)
  subpage_html = uClient.read()
  uClient.close()

  # parse data
  subpage_soup = soup(subpage_html, 'html.parser')

  # add url on subpages to new dict
  sub_sections = subpage_soup.findAll('h3',{'class':'section-h3'})
  sub_data[k] = [PREPEND + section.a.get('href') for section in sub_sections]

In [0]:
count = sum([len(v) for v in sub_data.values()])
print(f'There are {count} sub-categories in this help section')

There are 40 sub-categories in this help section


In [0]:
subsub_data = {} # final subpage before access to articles
for k,v in sub_data.items():
  subsub_sections = []
  for url in v: # need to loop through values list for urls
    subsub_url = url

    # connect to webpage
    uClient = uReq(subsub_url)
    subsub_html = uClient.read()
    uClient.close()

    # parse
    subsub_soup = soup(subsub_html, 'html.parser')

    # get links
    subsub_sections.extend(subsub_soup.findAll('li',
                                {'class':'article-list-item'}))
    
    # check for promoted articles that have alternate class
    promoted = subsub_soup.findAll('li',
                                {'class':'article-list-item article-promoted'})
    if promoted != None: 
      subsub_sections.extend(promoted)
  
  # load links into dict using same keys
  subsub_data[k]= ([PREPEND + section.a.get('href')
                        for section in subsub_sections])

In [0]:
count = sum([len(v) for v in subsub_data.values()])
print(f'There are {count} articles in this help section')

There are 459 articles in this help section


In [0]:
articles = {} # to contain all article content
urls = {} # used to join data on urls 
#TODO- add dict that contians url as keys and article title as values

for categories in subsub_data.values():
  for url in categories:
    article_url = url

    # connect to webpage
    uClient = uReq(article_url)
    article_html = uClient.read()
    uClient.close()

    article_soup = soup(article_html, 'html.parser')

    article_title = article_soup.findAll('h1',
                                {'class':'article-title'})[0].text.strip()
    article_body = article_soup.findAll('div', 
                                {'class':'article-body'})
    article_body = article_body[0].findAll('p')
    article_text= [line.text.strip() for line in article_body if line != None]

    articles[article_title] = " ".join(article_text)
    urls[article_title] = url

In [0]:
import pandas as pd

def dict2df(data:dict, columns:list, modifier = False): 
  """Prep dicts for dataframe insertion"""
  keys = []
  values = []
  for k,v in data.items():

    if modifier: #for subsub dict containing categories allowing for tidy insert
      keys.extend([k for _ in range(len(v))])
      values.extend(v)
    else:
      keys.append(k)
      values.append(v)

  assert len(keys) == len(values)
  keyval = zip(keys, values)

  return pd.DataFrame(data=keyval, columns=columns)

In [0]:
# Load all dicts into dataframe and assign column names
df_subdata = dict2df(subsub_data, columns=['categories', 'urls'], modifier=True)
df_articles = dict2df(articles, columns = ['title', 'content'])
df_urls = dict2df(urls, columns=['title', 'urls'])

In [0]:
# Merge dfs into one
diy_df = pd.merge(df_articles, df_urls)
diy_df = pd.merge(diy_df, df_subdata.drop_duplicates(), how='left')

# subdata had duplicates that didnt align until after merge
assert diy_df.shape[0] == df_articles.shape[0] == df_urls.shape[0]

In [0]:
from datetime import date

today = date.today()
today = today.strftime("%B %d, %Y")

file_name = 'Help Section Articles-' + today + '.csv'

diy_df.to_csv(file_name, index = False)