In [1]:
import warnings
from urllib.parse import urljoin

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# NOTE(review): with no category/module arguments this suppresses every warning
# for the rest of the session, including deprecation warnings from the libraries above.
warnings.filterwarnings('ignore')

In [2]:
# create a list of all EU countries with ski resorts
# Fail fast on a hung connection or an HTTP error status instead of silently
# parsing an error page into an empty/garbage country list.
response = requests.get('https://www.j2ski.com/ski_resorts/', timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html.parser')
# Country links are collected from two places in the page's <aside>:
# the <ul> list items and the paragraph links nested in a <div>.
main_batch = [country.text for country in soup.select('aside ul li a')]
sub_batch = [country.text for country in soup.select('aside div p a')]
# The sidebar labels this country just 'Bosnia'; store the full name instead
# (presumably so it matches the header text on the country's own page).
eu_countries = ['Bosnia and Herzegovina' if country == 'Bosnia'
                else country for country in main_batch + sub_batch]
# eu_countries

In [3]:
# get resorts from each country from https://www.j2ski.com/ski_resorts/*COUNTRY*/
def get_country_url(country):
    """Build the j2ski listing-page URL for the given country name.

    Most countries use their name verbatim as the final path segment; the
    few whose path segment differs from the name are looked up in a small
    override table first.
    """
    base = 'https://www.j2ski.com/ski_resorts/'
    # (country name, path segment) pairs for names that differ from their URL form.
    path_overrides = (
        ('Bosnia and Herzegovina', 'Bosnia'),
        ('Czech Republic', 'Czech_Republic'),
    )
    for name, segment in path_overrides:
        if country == name:
            return base + segment + '/'
    return f'{base}{country}/'

def get_resorts_table(country, timeout=30):
    """Fetch a country's j2ski page and return the tag that holds its resort list.

    The page is searched for an ``<h3>`` nested inside a ``<div>`` whose text
    is exactly ``'The Ski Resorts of <country>'``; the parent of the first
    such header is returned.

    Args:
        country: Country name; used both to build the page URL and to match
            the header text.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The parent tag of the matching header, or ``None`` when no header
        on the page matches.

    Raises:
        requests.HTTPError: If the page answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(get_country_url(country), timeout=timeout)
    # Surface HTTP errors here rather than parsing an error page and
    # reporting "no table" for a country that actually has one.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    expected_header = f'The Ski Resorts of {country}'
    for h3 in soup.select('div h3'):
        if h3.text == expected_header:
            return h3.parent
    # No header matched: make the "not found" result explicit for callers.
    return None

def get_resorts(country):
    """Collect every resort linked from a country's j2ski resort table.

    Args:
        country: Country name accepted by ``get_resorts_table``.

    Returns:
        A list of dicts, one per link (with an ``href``) inside the country's
        resort table. Each dict has the keys ``'ResortName'`` (the link text),
        ``'Country'`` (the ``country`` argument) and ``'URL'`` (the link
        target resolved against the country page URL).

    Raises:
        AttributeError: If no resort table is found for ``country``.
    """
    table = get_resorts_table(country)
    if table is None:
        # Same exception type the unguarded ``table.find_all`` call raised,
        # but with a message that names the country that failed.
        raise AttributeError(
            f'no resort table found on the j2ski page for {country!r}')
    # The base URL is the same for every link, so compute it once.
    country_url = get_country_url(country)
    return [{'ResortName': resort.text,
             'Country': country,
             # urljoin matches plain concatenation for simple relative hrefs
             # and additionally resolves root-relative or absolute hrefs.
             'URL': urljoin(country_url, resort['href'])}
            # href=True skips anchors that carry no link target at all.
            for resort in table.find_all('a', href=True)]

In [216]:
# create a {country:[resorts]} dictionary
# country_resorts_dict = {}
# for country in eu_countries:
#     try:
#         country_resorts_dict[country] = get_resorts(country)
#     except AttributeError:
#         print(country_resorts_dict.keys(), country_resorts_dict.values())
# country_resorts_dict

In [4]:
# create dataframe
# Gather all resort records first and build the frame in a single call;
# appending one row at a time with .loc gets slower as the frame grows.
resort_records = [resort
                  for country in eu_countries
                  for resort in get_resorts(country)]
# Passing `columns` fixes the column order and keeps the three columns
# present even if no records were collected.
resorts_df = pd.DataFrame(resort_records, columns=['ResortName', 'Country', 'URL'])

In [5]:
# Spot-check three randomly chosen rows of the scraped table (unseeded, so the rows differ on every run).
resorts_df.sample(3)

Unnamed: 0,ResortName,Country,URL
689,Kubínska Hol'a,Slovakia,https://www.j2ski.com/ski_resorts/Slovakia/Kub...
142,Val d'Isère,France,https://www.j2ski.com/ski_resorts/France/Val_d...
761,Björnrike,Sweden,https://www.j2ski.com/ski_resorts/Sweden/Bjorn...


In [6]:
# TODO: populate df with data for each resort from its own page
import webbrowser

# Open one resort's page in the default browser (presumably to inspect its
# layout by hand before scraping it).
# iloc[81, 2] is positional: row 81, column 2 -- 'URL' in the column order the
# frame was created with. The row position depends on scrape order.
# new=2 asks the browser to open the page in a new tab where possible.
webbrowser.open(resorts_df.iloc[81, 2], new=2)

True

In [8]:
# NOTE(review): `re` is not used anywhere in the visible part of this notebook.
import re
# Prototype the per-resort scrape on a single row; position 81 is assumed to be
# Avoriaz (it depends on scrape order -- verify after re-running the scrape).
avoriaz = resorts_df.iloc[81]
# NOTE: this rebinds the notebook-level `html` and `soup` names that the
# country-list cell also assigns.
html = requests.get(avoriaz.URL).text
soup = BeautifulSoup(html, 'html.parser')
# A string passed as find_all's second argument filters on the CSS class
# attribute, so this prints the <table> elements with class "skifacts thb".
print(soup.find_all('table', 'skifacts thb'))

<table class="skifacts thb">
<caption class="phone-off">Avoriaz Ski Area Highlights</caption><tbody>
<tr><th class="label">Recommended For</th><td class="label">Freeriders, Expert Skiers, Intermediates, Beginners, Snowboarders and Apres-Ski!</td></tr>
<tr><th class="label">Total Piste Length</th><td class="value"><span class="m-t">650km</span><span class="m-t m-off">404 miles</span></td></tr>
<tr><th class="label">Highest Lift</th><td class="value"><span class="m-t">2,280m</span><span class="m-t m-off">7,480ft</span></td></tr>
<tr><th class="label">Resort Height</th><td class="value"><span class="m-t">1,800m</span><span class="m-t m-off">5,906ft</span></td></tr>
<tr><th class="label"><i class="fa fa-plane"></i> Nearest Airports</th><td class="value">Geneva and Sion</td></tr>
<tr><th class="label">Save Money on Ski Hire</th><td class="label"><a class="ctabut warm r" href="/ski_resorts/France/Avoriaz_ski_hire.html" title="Reserve your Skis and Save Money in Avoriaz"><span class="splash r