In [1]:
import os
import requests
import json
import re
import time
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os.path
from os.path import join
from slugify import slugify
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from urllib.parse import urlparse, parse_qs
from urllib import parse

base_dir = "wow-tours"
if not os.path.exists(base_dir):
    os.makedirs(base_dir)
    
tours_json_dir = "wow-tours/json"
if not os.path.exists(tours_json_dir):
    os.makedirs(tours_json_dir)
    
tours_json_activities_dir = "wow-tours/json-activities"
if not os.path.exists(tours_json_activities_dir):
    os.makedirs(tours_json_activities_dir)

To get all the torus we need to scrape the main page, from it, we need to find the div with `class` name `row tour-list`

In [None]:
wow_tours_url = "https://tours.wowair.com/tours/"
wow_tours_soup = BeautifulSoup(requests.get(wow_tours_url).text, "lxml")
div_tour_list = wow_tours_soup.find("div", {"class":"row tour-list"})

Inside this tour list, find all the `div` with class name `tour-box`. Inside each tour box, there is an anchor tag with the link to the details of each tour.

In [None]:
tour_boxes = div_tour_list.findAll("div", {"class":"tour-box"})
urls = []
for tour_box in tour_boxes:
    anchor = tour_box.find("a")
    url = anchor.get("href")
    if url.startswith("/"):
        urls.append(url)

In [None]:
real_urls = []
for url in urls:
    tour_page = BeautifulSoup(requests.get("https://tours.wowair.com" + url).text, "lxml")
    div_container = tour_page.find("div",  {"class":"package-content"})
    script_tag = div_container.find("script")
    script_lines= script_tag.text.split("\r\n")
    real_urls.append(script_lines[4].strip()[len("var frameUrl = 'https://' + host + '") +1 : -2])
print("Done getting web pages")

In [None]:
import html
host = 'widgets.bokun.is'
for real_url in real_urls:
    real_tour_url = html.unescape("https://" + host + "/" + real_url)
    rt_url = urlparse(real_tour_url)
    rt_qs = parse_qs(rt_url.query)
    if "activityId" in rt_qs:
        activityId = rt_qs["activityId"][0]
        file_location = join(tours_json_dir, activityId + ".json")
    
        if os.path.isfile(file_location):
            continue
        tour_json_url = "https://widgets.bokun.is/widgets/1825/activity/" + activityId + "/activity-json?bookingChannelUUID=fd0d0807-6375-49cf-9c61-74c1ef1ad41b"
    
        tour_json = requests.get(tour_json_url).text
        with open(file_location, "w") as file:
            file.write(tour_json)
    else:
        print(real_tour_url)

In [8]:
import glob

proposed_empty_fields = ["requiredCustomerFields",
                         "paymentCurrencies",
                         "videos",
                         "categories",
                         "tagGroups",
                         "guidanceTypes",
                         "agendaItems",
                         "seasonalOpeningHours",
                         "activityAttributes",
                         "startPoints",
                         "bookableExtras",
                         "supportedAccessibilityTypes",
                         "activityCategories",
                         "dropoffFlags",
                         "pickupFlags",
                         "dayOptions"]

fields_with_flags = ["customFields","tagGroups",
                     "cancellationPolicy"
                     "categories", "startTimes", "bookableExtras","pricingCategories"]

def clean_fields(o, del_id=True):
    if "flags" in o and len(o["flags"]) == 0:
        del o["flags"]
    if del_id and "id" in o and not isinstance(o, str):
        try:
            del o["id"]
        except TypeError:
            print(o)
            return False
    return True

#from https://stackoverflow.com/a/4256027/605482
def del_none(d):
    """
    Delete keys with the value ``None`` in a dictionary, recursively.

    This alters the input so you may wish to ``copy`` the dict first.
    """
    # For Python 3, write `list(d.items())`; `d.items()` won’t work
    # For Python 2, write `d.items()`; `d.iteritems()` won’t work
    for key, value in list(d.items()):
        if value is None:
            del d[key]
        elif isinstance(value, str): # Adition to strip strings
            d[key] = d[key].strip()
        elif isinstance(value, dict):
            del_none(value)
    return d  # For convenience

for file in glob.glob(tours_json_dir + '/*.json'):
    full_json = None
    try:
        with open(file, "r") as r:
            full_json = json.load(r)
        activity = full_json["activity"]
        
        if "productCategory" in activity:
            del activity["productCategory"]
        
        del_none(activity)
        clean_fields(activity, del_id=False)
        
        if "photos" in activity:
            for photo in activity["photos"]:
                if not clean_fields(photo):
                    print(file)
                del photo["derived"], photo["fileName"]
        
        if "keyPhoto" in activity and activity["keyPhoto"] is not None:
            if "derived" in activity["keyPhoto"]:
                clean_fields(activity["keyPhoto"])
                del activity["keyPhoto"]["derived"], activity["keyPhoto"]["fileName"]
        
        
        for field_with_flags in fields_with_flags:
            if field_with_flags not in activity:
                continue
            for f in activity[field_with_flags]:
                clean_fields(f)
        
        for proposed_empty_field in proposed_empty_fields:
            if len(activity[proposed_empty_field]) > 0:
                if proposed_empty_field == "videos":
                    for video in activity["videos"]:
                        del video["cleanThumbnailUrl"], video["cleanPreviewUrl"], video["html"],
                        video["thumbnailUrl"], video["previewUrl"]
                elif proposed_empty_field == "requiredCustomerFields":
                    if activity["requiredCustomerFields"][0] == "":
                        del activity["requiredCustomerFields"]
                else:
                    for non_empty_field in activity[proposed_empty_field]:
                        if not clean_fields(non_empty_field):
                            print("Error cleaning:", file)
            else:
                del activity[proposed_empty_field]
                
        activity_file_name = "%d-%s.json" % (int(activity["id"]), slugify(activity["title"]))
        
        with open(join(tours_json_activities_dir, activity_file_name), "w") as w:
            json.dump(activity, w)
    except json.JSONDecodeError:
        print(":"*5, "Error", file)

::::: Error wow-tours/json/670.json


In [12]:
clean_activities =[]

for file in glob.glob(tours_json_activities_dir + '/*.json'):
    full_json = None
    try:
        with open(file, "r") as r:
            full_json = json.load(r)
            
        durationMinutes = int(full_json["durationMinutes"])
        durationHours = int(full_json["durationHours"])
        durationDays = int(full_json["durationDays"])
        durationWeeks = int(full_json["durationWeeks"])

        duration = durationMinutes + \
            durationHours *3600 + \
            durationDays * (3600*24) + \
            durationWeeks * (7*24*3600)
        
        activity = {
            'title' : full_json["title"],
            'duration': duration,
            'duration_type': full_json["durationType"],
            'level': full_json["difficultyLevel"],
            'description':full_json["description"],
            'keywords': full_json["keywords"]
        }
        clean_activities.append(activity)
    except json.JSONDecodeError:
        print(":"*5, "Error", file)
        
act_frame =pd.DataFrame(clean_activities)
act_frame.head(10)

Unnamed: 0,description,duration,duration_type,keywords,level,title
0,"<div class=""font-play size-18""> \t\t\t<p>Now y...",21600,HOURS,"[northern lights, Whale watching, combo tour, ...",VERY_EASY,Whale Watching & Northern lights Combo
1,This happy hour tour is taking you to the cent...,3600,HOURS,[],EASY,New York Craft Cocktail Tour
2,<p>Your epic full-day NYC tour will begin in C...,39600,HOURS,[],VERY_EASY,"Total NYC Tour Combo 1: Tenements, Midtown & C..."
3,Explore a few of Iceland‘s most famous landmar...,32400,HOURS,"[northern lights, cre-10, golden circle, combo...",EASY,The Golden Circle & Northern Lights - Combo Deal
4,"<p class=""p1""><b>Icelandic beer, local beer hi...",7230,HOURS,[],VERY_EASY,The Reykjavik Beer Tour
5,"<div>From the cafÃ©s and restaurants, to the s...",10800,HOURS,[],MODERATE,North Beach/Little Italy Walking Food Tour
6,<p></p><h4>WHY YOU SHOULD GO</h4>Enjoy 24hrs o...,86400,HOURS,[],VERY_EASY,24hrs City Sightseeing Dublin Hop on-off 2 rou...
7,<p><h2>Come with us and enjoy four days of an ...,345600,DAYS,"[Laugavegur, Fimmvörðuháls, Golden Circle, Tho...",MODERATE,Northern Lights Trek - 4 Day Tour
8,<p>Buy 1 seat for up to 6 people! Your private...,39600,HOURS,"[Dropoff, Glacier hike, South coast, Northern ...",VERY_EASY,Private northern lights & Glacier hiking day tour
9,Design your own journey around San Francisco b...,86400,DAYS,[],EASY,San Francisco City Hop On and Hop Off


In [None]:
print(act_frame.level.unique())

In [None]:
act_frame.level = pd.Categorical(act_frame.level, 
                                 categories = ['VERY_EASY', 'EASY' ,'MODERATE', 'CHALLENGING'], 
                                 ordered = True)

In [None]:
grouped_by_level = act_frame.groupby('level')
grouped_by_level.describe()

In [None]:
sns.boxplot( y = 'duration', x='level',hue='level', data=act_frame, orient='v')
plt.show()

In [None]:
sns.swarmplot( y = 'duration', x='level',hue='level', data=act_frame, orient='v')
plt.show()