<a href="https://colab.research.google.com/github/umutcanberkhasret/Boxish-Hero---Introduction-to-Game-Development/blob/master/MetaScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import pdb
import sys

In [None]:
# Scrape the XML sitemap

def get_sitemap(url):
    """Fetch an XML sitemap and return it as a parsed document.

    Args:
        url (string): Fully qualified URL pointing to XML sitemap.

    Returns:
        The BeautifulSoup document produced by the 'lxml-xml' parser, or
        None when the sitemap could not be downloaded or parsed.
    """
    try:
        # The context manager closes the connection even if parsing fails.
        with urllib.request.urlopen(url) as response:
            return BeautifulSoup(
                response,
                'lxml-xml',
                from_encoding=response.info().get_param('charset'),
            )
    except Exception:
        # Best-effort fetch: callers treat None as "sitemap unavailable".
        # Unlike a bare ``except:``, this still lets KeyboardInterrupt and
        # SystemExit propagate, so a long crawl can be interrupted.
        return None
    

In [None]:
# Parse the sitemap to determine its type

def get_sitemap_type(xml):
    """Classify a parsed sitemap document by the root tag it contains.

    Args:
        xml: Parsed sitemap document exposing ``find_all``.

    Returns:
        string or None: 'sitemapindex' when the document contains a
        ``<sitemapindex>`` tag, otherwise 'urlset' when it contains a
        ``<urlset>`` tag, otherwise None.
    """
    # Probe both tags up front, then report the first one that matched;
    # the index tag takes precedence over the URL set.
    matches = {tag: xml.find_all(tag) for tag in ('sitemapindex', 'urlset')}

    for tag, found in matches.items():
        if found:
            return tag

    return None


In [None]:
# Parse the sitemap and extract child sitemaps

def get_child_sitemaps(xml):
    """Return the URLs of the child sitemaps listed in an XML sitemap index.

    Args:
        xml: Parsed sitemap index document exposing ``find_all``.

    Returns:
        sitemaps (list): ``<loc>`` text of every ``<sitemap>`` entry, in
        document order. Entries without a ``<loc>`` of their own are skipped.
    """
    locations = []

    for entry in xml.find_all("sitemap"):
        # Search inside this entry only. A forward document search would
        # return the following entry's <loc> when this one has none, or
        # None (and then fail on ``.text``) at the end of the document.
        loc = entry.find("loc")
        if loc is not None:
            locations.append(loc.text)

    return locations

In [None]:
# Read the sitemap XML into a Pandas dataframe

def _child_text(node, tag):
    """Return the text of the first ``tag`` found inside ``node``, or ''."""
    child = node.find(tag)
    return child.text if child is not None else ''


def sitemap_to_dataframe(xml, name=None, data=None, verbose=False):
    """Read an XML sitemap into a Pandas dataframe.

    Args:
        xml: Parsed sitemap document exposing ``find_all``, or None when the
            sitemap could not be fetched.
        name (optional): Optional name for sitemap parsed; stored in the
            ``sitemap_name`` column ('' when not given).
        data (optional): Unused; kept so existing callers keep working.
        verbose (boolean, optional): Set to True to print each row as it is
            extracted.

    Returns:
        dataframe: One row per ``<url>`` entry with the columns loc, lastmod,
        domain, sitemap_name, changefreq and priority. Fields missing from an
        entry are stored as ''. An empty dataframe with the same columns is
        returned when ``xml`` is None or holds no ``<url>`` entries.
    """
    columns = ['loc', 'lastmod', 'domain', 'sitemap_name', 'changefreq', 'priority']

    if xml is None:
        return pd.DataFrame(columns=columns)

    sitemap_name = name if name else ''
    rows = []

    for url in xml.find_all("url"):
        # Each field is looked up inside this <url> entry only, so an entry
        # that omits a field gets '' rather than a later entry's value.
        loc = _child_text(url, "loc")

        row = {
            # urlparse('').netloc is '', which covers the missing-loc case.
            'domain': urlparse(loc).netloc,
            'loc': loc,
            'lastmod': _child_text(url, "lastmod"),
            'changefreq': _child_text(url, "changefreq"),
            'priority': _child_text(url, "priority"),
            'sitemap_name': sitemap_name,
        }

        if verbose:
            print(row)

        rows.append(row)

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x,
    # and appending row by row copied the whole frame on every iteration.
    return pd.DataFrame(rows, columns=columns)

In [None]:
def get_all_urls(url):
    """Return a dataframe containing all of the URLs from a site's XML sitemaps.

    Args:
        url (string): URL of site's XML sitemap. Usually located at /sitemap.xml

    Returns:
        df (dataframe): Pandas dataframe containing all sitemap content. When
        the sitemap cannot be fetched, or it is an index whose children yield
        nothing, an empty dataframe with the columns loc, lastmod, domain and
        sitemap_name is returned.
    """
    empty = pd.DataFrame(columns=['loc', 'lastmod', 'domain','sitemap_name'])

    xml = get_sitemap(url)
    if xml is None:
        return empty

    frames = []

    if get_sitemap_type(xml) == 'sitemapindex':
        for sitemap in get_child_sitemaps(xml):
            sitemap_xml = get_sitemap(sitemap)
            if sitemap_xml is None:
                # A child sitemap that fails to download is skipped so one
                # bad link does not abort the whole crawl.
                continue
            frames.append(sitemap_to_dataframe(sitemap_xml, name=sitemap))
    else:
        # The document is already in hand; parse it directly instead of
        # downloading the same URL a second time.
        frames.append(sitemap_to_dataframe(xml, name=url))

    if not frames:
        return empty

    # Concatenate once rather than growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
def get_page(url):
    """Scrapes a URL and returns the parsed HTML.

    Args:
        url (string): Fully qualified URL of a page.

    Returns:
        soup: BeautifulSoup document built with the 'html.parser' parser.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for an invalid or
        unreachable URL; errors are not caught here.
    """
    # The context manager closes the HTTP connection once the markup has been
    # parsed, instead of leaving it open until garbage collection.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(
            response,
            'html.parser',
            from_encoding=response.info().get_param('charset'),
        )

    return soup

In [None]:
def get_title(soup):
    """Return the page title

    Args:
        soup: HTML code from Beautiful Soup

    Returns:
        value (string): The ``.string`` of the first ``<title>`` tag, or None
        when the page has no ``<title>`` tag.
    """
    # A single lookup replaces the former findAll-then-find pair, which
    # searched the tree twice and relied on the deprecated camelCase alias.
    title_tag = soup.find("title")
    if title_tag is None:
        return None
    return title_tag.string

In [None]:
def get_description(soup):
    """Return the meta description content

    Args:
        soup: HTML code from Beautiful Soup

    Returns:
        value (string): The ``content`` attribute of the first
        ``<meta name="description">`` tag, or None when the tag (or its
        ``content`` attribute) is missing.
    """
    # One lookup instead of findAll followed by find; the unreachable
    # trailing ``return`` of the earlier version is gone.
    meta_tag = soup.find("meta", attrs={"name": "description"})
    if meta_tag is None:
        return None
    return meta_tag.get("content")

In [None]:
# Debug Reference: https://stackoverflow.com/questions/52656692/debugging-in-google-colab

# Sitemap URLs to crawl; fill this list in before running the cell.
targetUrls = []
resultSiteMaps = pd.DataFrame(columns=['loc', 'lastmod', 'domain','sitemap_name'])

# Gather one dataframe per site and concatenate once at the end; calling
# pd.concat inside the loop re-copies every accumulated row on each pass.
siteFrames = []
for url in targetUrls:
    df = get_all_urls(url)
    siteFrames.append(df)

if siteFrames:
    resultSiteMaps = pd.concat(siteFrames, ignore_index=True)

# Last expression of the cell: shows the non-null count per column.
resultSiteMaps.count()

loc             272
lastmod         272
domain          272
sitemap_name    272
changefreq      272
priority        272
dtype: int64

In [None]:
# Display the combined sitemap dataframe.
resultSiteMaps

In [None]:
# Reload a sitemap table from 'cenaptec-sitemap.csv' and preview its first
# rows. The commented-out call below presumably produced that file — confirm
# which dataframe was exported before relying on its contents.
# df.to_csv('cenaptec-sitemap.csv', encoding='utf-8')
df_new = pd.read_csv('cenaptec-sitemap.csv', encoding='utf-8')
df_new.head()

In [None]:
# Fetch all

# Collect one record per sitemap URL and build the dataframe once at the end.
# Creating a one-row frame and concatenating it on every iteration copied all
# previously collected rows each time.
page_records = []

for _, row in resultSiteMaps.iterrows():
    soup = get_page(row['loc'])
    title = get_title(soup)
    description = get_description(soup)

    page_records.append({
        'url': row['loc'],
        'title': title,
        'description': description,
    })

df_pages = pd.DataFrame(page_records, columns=['url', 'title', 'description'])

df_pages