# Scrape SBB

In [105]:
from bs4 import BeautifulSoup
import requests
import ast
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import time
import sqlite3

In [107]:
def create_connection(db_file):
    """ create a database connection to a SQLite database """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
        print(sqlite3.version)
    except Error as e:
        print(e)
    finally:
        if conn:
            conn.close()

In [109]:
create_connection('./events.db')

2.6.0


In [110]:
conn = sqlite3.connect("./events.db")
cur = conn.cursor()

## Category Scrape

In [111]:
sbb_categories = {
    'Bahn_&_Schiff': 'SBB_lh_rail_boat',
    'Spiel_&_Spass': 'SBB_lh_games_fun',
    'Berge_&_Aussicht': 'SBB_lh_mountains_views',
    'Erlebnis_&_Panoramareisen' : 'SBB_lh_adventure_panorama_trips',
    'Natur_&_Sehenswürdigkeiten' : 'SBB_lh_nature_sights_of_interest',
    'Zoos_&_Tierparks' : 'SBB_lh_zoo_animal_parks',
    'Velo_&_Bike' : 'SBB_lh_bike_ebike',
    'Wellness_&_Erholung' : 'SBB_lh_wellness_relaxation',
    'Wandern' : 'SBB_lh_hiking',
    'Kunst_Kultur_&_Museen' : 'SBB_lh_art_culture_museums',
    'Messen_&_Volksfeste' : 'SBB_lh_trade_fairs_folk_festivals',
    'Konzerte_&_Musicals' : 'SBB_lh_concerts_musicals_festivals',
    'Sportveranstalungen' : 'SBB_lh_sports_events',
    'Märkte_&_Shopping' : 'SBB_lh_markets_shopping',
    'Kurztrips' : 'SBB_lh_short_trips_in_switzerland',
    'Städtereisen' : 'SBB_lh_city_trips',
    'Familienausfluege' : 'SBB_lh_family_excursions',
    'Schlusausfluege' : 'SBB_lh_school_excursions',
    'Gruppenausfluege' : 'SBB_lh_group_excursions'
}
values = sbb_categories.keys()
index = [sbb_categories[k] for k in sbb_categories.keys()]
categories_df = pd.DataFrame.from_dict({'index': index})
categories_df['category_ger'] = categories_df['index'].apply(lambda x: list(sbb_categories.keys())[list(sbb_categories.values()).index(x)])

In [112]:
categories_df.to_sql('categories', con=conn, if_exists='append')

In [104]:
for category in categories_df['index']:
    print(category)

SBB_lh_rail_boat
SBB_lh_games_fun
SBB_lh_mountains_views
SBB_lh_adventure_panorama_trips
SBB_lh_nature_sights_of_interest
SBB_lh_zoo_animal_parks
SBB_lh_bike_ebike
SBB_lh_wellness_relaxation
SBB_lh_hiking
SBB_lh_art_culture_museums
SBB_lh_trade_fairs_folk_festivals
SBB_lh_concerts_musicals_festivals
SBB_lh_sports_events
SBB_lh_markets_shopping
SBB_lh_short_trips_in_switzerland
SBB_lh_city_trips
SBB_lh_family_excursions
SBB_lh_school_excursions
SBB_lh_group_excursions


In [103]:
categories_page = 'https://www.sbb.ch/de/freizeit-ferien/ideen.html?'

In [66]:
r  = requests.get(ideen_page)

soup = BeautifulSoup(r.text, 'html.parser')

In [67]:
divs = soup.findAll("div", {"class": "mod_map"})

map_markers = json.loads(divs[0]['data-mapboxmap-options'])

events = map_markers['markers']

597

In [59]:
event_df = pd.DataFrame(columns = ['id', 'title', 'subtitle', 'lat', 'lng', 'teaserUrl'])

for idea in tqdm(events):
    
    r = requests.get('https://www.sbb.ch' + idea['teaserUrl'],headers={"User-Agent":"Mozilla/5.0"})

    event_soup = BeautifulSoup(r.text, 'html.parser')
    
    #extract (sub-)title
    res = event_soup.findAll('a')
    idea['title'] = res[0].text.replace('\t','').replace('\n','').strip()
    idea['subtitle'] = res[0].text.replace('\t','').replace('\n','').replace('Zum Angebot.', '').strip()

    
    event_df = event_df.append(idea, ignore_index=True)
    time.sleep(0.5)



  0%|          | 0/601 [00:00<?, ?it/s][A[A

  0%|          | 1/601 [00:01<10:17,  1.03s/it][A[A

  0%|          | 2/601 [00:02<11:45,  1.18s/it][A[A

  0%|          | 3/601 [01:18<3:54:10, 23.50s/it][A[A

  1%|          | 4/601 [01:18<2:45:46, 16.66s/it][A[A

  1%|          | 5/601 [01:19<1:58:13, 11.90s/it][A[A

  1%|          | 6/601 [01:20<1:25:22,  8.61s/it][A[A

  1%|          | 7/601 [01:21<1:01:39,  6.23s/it][A[A

  1%|▏         | 8/601 [01:21<45:08,  4.57s/it]  [A[A

  1%|▏         | 9/601 [01:22<33:38,  3.41s/it][A[A

  2%|▏         | 10/601 [01:23<25:31,  2.59s/it][A[A

  2%|▏         | 11/601 [01:24<19:53,  2.02s/it][A[A

  2%|▏         | 12/601 [01:24<16:12,  1.65s/it][A[A

  2%|▏         | 13/601 [01:25<13:27,  1.37s/it][A[A

  2%|▏         | 14/601 [01:26<11:36,  1.19s/it][A[A

  2%|▏         | 15/601 [01:26<10:13,  1.05s/it][A[A

  3%|▎         | 16/601 [01:27<09:47,  1.00s/it][A[A

  3%|▎         | 17/601 [01:28<08:52,  1.10it/s][A

KeyboardInterrupt: 

In [None]:
from bs4 import BeautifulSoup

import requests

url = raw_input("Enter a website to extract the URL's from: ")

r  = requests.get("http://" +url)

data = r.text

soup = BeautifulSoup(data)

for link in soup.find_all('a'):
    print(link.get('href'))