# get_all_page_metas
Author: Luc Mercier

Tested with:
* Travel Yukon
* Tourism Montreal
* Metal Archives (metal-archives.com)

In [1]:
#Import libraries
import numpy as np
import pandas as pd
import requests
import json
import re
import math
from bs4 import BeautifulSoup

In [2]:
def get_all_page_metas(page, config, debug = False):
    '''
    Download a page and extract its metadata as a dictionary.

    Parameters
    ----------
    page : str
        URL of the page to fetch.
    config : dict
        Scraping options, all optional:

        * 'description_selector': CSS selector of the description. When it
          is set, 'char_count' and 'word_count' are computed from the
          matched elements; otherwise both are reported as -1.
        * 'meta_selector': CSS selector of the elements carrying metadata.
          When it is missing an error message is printed and no metadata
          entries are added.
        * 'value_in_element': 'sibling' stores one entry per matched
          element, keyed by the element text before the first ':' and
          valued by the text of its next sibling; 'meta' collects the text
          of every matched element in a list stored under 'class_to_check'.
        * 'class_to_check': key used for the list built in 'meta' mode.
    debug : bool
        When True, print progress information while scraping.

    Returns
    -------
    dict or None
        The scraped entries plus 'char_count', 'word_count' and 'seasons'.
        None when the response status is not 200 or when any error occurs
        (the error is printed, never raised).
    '''

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

        # Without a timeout an unresponsive server would block forever.
        response = requests.get(page, headers=headers, timeout=30)
        metas = {}

        if debug: print(page)

        if response.status_code != 200:
            if debug:
                print("statuscode not 200: " + str(response.status_code))
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Word count in description

        description_selector = config.get('description_selector')

        if description_selector:
            block_description = soup.select(description_selector)

            # Paragraphs or no
            if description_selector[-1:] == 'p':
                clean_description = "".join(p.text for p in block_description)
            else:
                # NOTE: this is the string form of the matched element list,
                # so the counts below include the markup, not only the text.
                clean_description = str(block_description)

            char_count = len(clean_description)
            # "".split(" ") is [""], which would report one word for an
            # empty description.
            if clean_description:
                word_count = len(clean_description.split(" "))
            else:
                word_count = 0
        else:
            char_count = -1
            word_count = -1

        # Elements badly tagged

        season_pattern = re.compile(r'<span>([a-zA-Z]+)</span>')
        seasons = []
        for ul in soup.select('.hero-seasons'):
            for li in ul:
                m = season_pattern.search(str(li))
                # Keep each season once, in order of first appearance.
                if m and m.group(1) not in seasons:
                    seasons.append(m.group(1))

        value_in_element = config.get('value_in_element')
        class_to_check = config.get('class_to_check')
        meta_selector = config.get('meta_selector')

        if meta_selector:
            if debug: print('meta_selector')
            items = soup.select(meta_selector)
            meta_text = class_to_check

            value = []

            for item in items:
                if debug: print(item)
                if value_in_element == 'sibling':
                    sibling = item.find_next_sibling()
                    if sibling is None:
                        # A label without a following element has no value;
                        # skip it instead of failing the whole page.
                        continue

                    meta_text = item.text.split(":")[0].strip()
                    value = sibling.text.strip()

                    if debug: print('item:' + str(meta_text)+ ': ' + str(value))

                elif value_in_element == 'meta':
                    meta_text = class_to_check
                    temp = item.text.split(":")[0]
                    if debug: print(temp)
                    value.append(str(temp))

                metas.update({meta_text: value})
        else:
            print('error: missing attribute meta_selector')

        # Final dictionary
        metas.update({'char_count': char_count,
                      'word_count': word_count,
                      'seasons': seasons})
        return metas

    except Exception as e:
        # Best-effort scraper: report the problem and return nothing.
        print(e)
        return None

In [1]:
def get_all_page_metas_private(page, config, debug = False):
    '''
    Download a page and extract its metadata as a dictionary.

    This function used to carry a line-for-line copy of
    get_all_page_metas; it now delegates to it so the scraping logic is
    maintained in a single place.

    Parameters
    ----------
    page : str
        URL of the page to fetch.
    config : dict
        Scraping options, passed through unchanged (keys read by
        get_all_page_metas: 'description_selector', 'meta_selector',
        'value_in_element', 'class_to_check').
    debug : bool
        When True, print progress information while scraping.

    Returns
    -------
    dict or None
        Whatever get_all_page_metas returns for the same arguments.
    '''
    return get_all_page_metas(page, config, debug = debug)

In [86]:
# Example: each matched <dt> is a label; its value is read from the next sibling.
page = 'https://www.metal-archives.com/bands/Trollfest/20648'
config = {
    'value_in_element': 'sibling',
    'meta_selector': 'div#band_stats dl dt',
}

get_all_page_metas(page, config, debug=False)

{'Country of origin': 'Norway',
 'Current label': 'NoiseArt Records',
 'Formed in': '2003',
 'Genre': 'Folk Metal',
 'Location': 'Oslo',
 'Lyrical themes': 'Trolls, Drinking, Humour',
 'Status': 'Active',
 'Years active': '2003-present',
 'char_count': -1,
 'seasons': [],
 'word_count': -1}

In [83]:
# Example: 'meta' mode gathers the text of every matched element into one list
# stored under the 'class_to_check' key; the description is made of paragraphs.
page = 'https://www.mtl.org/en/what-to-do/festivals-and-events/enchanted-worlds-montreal'
config = {
    'meta_selector': 'div.node-details__subcategory',
    'value_in_element': 'meta',
    'class_to_check': 'node-details__subcategory',
    'description_selector': 'div.details-content__paragraph p',
}

get_all_page_metas(page, config, debug=False)

{'char_count': 740,
 'node-details__subcategory': ['History', 'Downtown - Golden Square Mile'],
 'seasons': [],
 'word_count': 115}

In [85]:
# Example: 'sibling' mode with a paragraph-based description selector.
page = 'https://www.travelyukon.com/en/plan/operator/air-north-yukon-s-airline'
config = {
    'meta_selector': 'span.block-info-title',
    'value_in_element': 'sibling',
    'class_to_check': 'block-info-title',
    'description_selector': 'div.body-description p',
}

get_all_page_metas(page, config, debug=False)

{'Categories': 'Air Service, Day Trips, Travel Agents and Tour Operators',
 'Communities': 'Dawson City, Old Crow, Whitehorse',
 'Regions': 'Klondike, Northern & Arctic Yukon, Whitehorse Region',
 'char_count': 966,
 'seasons': [],
 'word_count': 140}