In [238]:
import math
import numpy as np
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.keys import Keys

# Google Flights

In [216]:
def scrape_best_flights(url, delay=3):
    """Scrape the list under the "best flights" heading of a Google Flights page.

    Parameters
    ----------
    url : str
        Google Flights results URL to open in Chrome.
    delay : int or float, optional
        Seconds to wait for the best-flights heading to appear (default 3).
        Previously this was read from a notebook global defined in a later
        cell, which raised NameError on a fresh kernel.

    Returns
    -------
    pandas.DataFrame
        One row per result with the raw text of its 'time', 'duration' and
        'price' elements, indexed 0..n-1 (the old row-by-row append left
        every row with index 0). An empty frame with those columns is
        returned when the heading or its result list is not in the page.
    """
    columns = ['time', 'duration', 'price']

    # Run browser - need to run full browser to get JavaScript elements
    driver = webdriver.Chrome('/Users/lucy/Documents/Research/chromedriver')
    try:
        driver.get(url)

        # Wait until page is loaded; on timeout, parse whatever has rendered
        try:
            WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((By.ID, 'gws-flights-results__best_flights_heading')))
        except TimeoutException:
            print("Loading took too much time!")

        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() ends the whole WebDriver session, and the finally clause
        # makes sure that happens even when loading or parsing raises.
        driver.quit()

    # Get list of best flights; after a timeout the heading may be missing
    best_flight_heading = soup.find('span', {'id': 'gws-flights-results__best_flights_heading'})
    if best_flight_heading is None:
        return pd.DataFrame(columns=columns)
    best_flights_list = best_flight_heading.find_next('ol', {'class': 'gws-flights-results__result-list'})
    if best_flights_list is None:
        return pd.DataFrame(columns=columns)
    best_flights = best_flights_list.find_all('li', {'class': 'gws-flights-results__result-item'})

    # Collect time, duration and price for each flight, then build the frame
    # once (DataFrame.append in a loop is quadratic and gone in pandas 2.0).
    records = [
        {
            'time': flight.find('div', {'class': 'gws-flights-results__times'}).text,
            'duration': flight.find('div', {'class': 'gws-flights-results__duration'}).text,
            'price': flight.find('div', {'class': 'gws-flights-results__price'}).text,
        }
        for flight in best_flights
    ]
    return pd.DataFrame(records, columns=columns)

In [322]:
def scrape_flights(url, delay=3):
    """Scrape every result list on a Google Flights results page.

    Parameters
    ----------
    url : str
        Google Flights results URL to open in Chrome.
    delay : int or float, optional
        Seconds to wait for the best-flights heading to appear (default 3).
        Previously this was read from a notebook global defined in a later
        cell, which raised NameError on a fresh kernel.

    Returns
    -------
    pandas.DataFrame
        One row per result item, across all result lists in page order, with
        the raw text of its 'time', 'duration' and 'price' elements, indexed
        0..n-1 (the old row-by-row append left every row with index 0).
        Empty (columns only) when the page contains no result lists.
    """
    columns = ['time', 'duration', 'price']

    # Run browser - need to run full browser to get JavaScript elements
    driver = webdriver.Chrome('/Users/lucy/Documents/Research/chromedriver')
    try:
        driver.get(url)

        # Wait until page is loaded; on timeout, parse whatever has rendered
        try:
            WebDriverWait(driver, delay).until(
                EC.presence_of_element_located((By.ID, 'gws-flights-results__best_flights_heading')))
        except TimeoutException:
            print("Loading took too much time!")

        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() ends the whole WebDriver session, and the finally clause
        # makes sure that happens even when loading or parsing raises.
        driver.quit()

    # Walk every result list on the page and collect time, duration and
    # price for each flight. The frame is built once at the end because
    # DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
    records = []
    for flights_list in soup.find_all('ol', {'class': 'gws-flights-results__result-list'}):
        for flight in flights_list.find_all('li', {'class': 'gws-flights-results__result-item'}):
            records.append({
                'time': flight.find('div', {'class': 'gws-flights-results__times'}).text,
                'duration': flight.find('div', {'class': 'gws-flights-results__duration'}).text,
                'price': flight.find('div', {'class': 'gws-flights-results__price'}).text,
            })

    return pd.DataFrame(records, columns=columns)

## Boston

### Weekend 0: 10/17-10/20

In [325]:
# Philadelphia -> Boston on 10/17. The `flt=` fragment lists the origin first
# (as the NYC and DC cells do, where /m/0dclg is Philadelphia); this URL had
# BOS first and therefore fetched the opposite direction from its labels.
flights = scrape_flights('https://www.google.com/flights#flt=/m/0dclg.BOS.2019-10-17;c:USD;e:1;sd:1;t:f;tt:o')
flights['date'] = '10-17-2019'
flights['origination'] = 'Philadelphia'
flights['destination'] = 'Boston'
flights.to_csv('data/boston/flights_10-17-2019.csv')

In [326]:
# Boston -> Philadelphia on 10/20. The `flt=` fragment lists the origin first
# (as the NYC and DC cells do, where /m/0dclg is Philadelphia); this URL had
# /m/0dclg first and therefore fetched the opposite direction from its labels.
flights = scrape_flights('https://www.google.com/flights#flt=BOS./m/0dclg.2019-10-20;c:USD;e:1;sd:1;t:f;tt:o')
flights['date'] = '10-20-2019'
flights['origination'] = 'Boston'
flights['destination'] = 'Philadelphia'
flights.to_csv('data/boston/flights_10-20-2019.csv')

### Weekend 1: 10/24-10/27

In [328]:
# Philadelphia -> Boston on 10/24. The `flt=` fragment lists the origin first
# (as the NYC and DC cells do, where /m/0dclg is Philadelphia); this URL had
# BOS first and therefore fetched the opposite direction from its labels.
flights = scrape_flights('https://www.google.com/flights#flt=/m/0dclg.BOS.2019-10-24;c:USD;e:1;sd:1;t:f;tt:o')
flights['date'] = '10-24-2019'
flights['origination'] = 'Philadelphia'
flights['destination'] = 'Boston'
flights.to_csv('data/boston/flights_10-24-2019.csv')

In [329]:
# Boston -> Philadelphia flights on 10/27: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=BOS./m/0dclg.2019-10-27;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-27-2019', origination='Boston', destination='Philadelphia')
flights.to_csv('data/boston/flights_10-27-2019.csv')

### Weekend 2: 10/31-11/03

In [331]:
# Philadelphia -> Boston flights on 10/31: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg.BOS.2019-10-31;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-31-2019', origination='Philadelphia', destination='Boston')
flights.to_csv('data/boston/flights_10-31-2019.csv')

In [332]:
# Boston -> Philadelphia flights on 11/03: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=BOS./m/0dclg.2019-11-03;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='11-03-2019', origination='Boston', destination='Philadelphia')
flights.to_csv('data/boston/flights_11-03-2019.csv')

## NYC

### Weekend 0: 10/17-10/20

In [334]:
# Philadelphia -> NYC flights on 10/17: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg./m/02_286.2019-10-17;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-17-2019', origination='Philadelphia', destination='NYC')
flights.to_csv('data/nyc/flights_10-17-2019.csv')

In [335]:
# NYC -> Philadelphia flights on 10/20: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/02_286./m/0dclg.2019-10-20;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-20-2019', origination='NYC', destination='Philadelphia')
flights.to_csv('data/nyc/flights_10-20-2019.csv')

### Weekend 1: 10/24-10/27

In [336]:
# Philadelphia -> NYC flights on 10/24: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg./m/02_286.2019-10-24;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-24-2019', origination='Philadelphia', destination='NYC')
flights.to_csv('data/nyc/flights_10-24-2019.csv')

In [337]:
# NYC -> Philadelphia flights on 10/27: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/02_286./m/0dclg.2019-10-27;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-27-2019', origination='NYC', destination='Philadelphia')
flights.to_csv('data/nyc/flights_10-27-2019.csv')

### Weekend 2: 10/31-11/03

In [338]:
# Philadelphia -> NYC flights on 10/31: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg./m/02_286.2019-10-31;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-31-2019', origination='Philadelphia', destination='NYC')
flights.to_csv('data/nyc/flights_10-31-2019.csv')

In [339]:
# NYC -> Philadelphia flights on 11/03: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/02_286./m/0dclg.2019-11-03;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='11-03-2019', origination='NYC', destination='Philadelphia')
flights.to_csv('data/nyc/flights_11-03-2019.csv')

## DC

### Weekend 0: 10/17-10/20

In [340]:
# Philadelphia -> DC flights on 10/17: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg./m/0rh6k.2019-10-17;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-17-2019', origination='Philadelphia', destination='DC')
flights.to_csv('data/dc/flights_10-17-2019.csv')

In [341]:
# DC -> Philadelphia flights on 10/20: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0rh6k./m/0dclg.2019-10-20;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-20-2019', origination='DC', destination='Philadelphia')
flights.to_csv('data/dc/flights_10-20-2019.csv')

### Weekend 1: 10/24-10/27

In [342]:
# Philadelphia -> DC flights on 10/24: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg./m/0rh6k.2019-10-24;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-24-2019', origination='Philadelphia', destination='DC')
flights.to_csv('data/dc/flights_10-24-2019.csv')

In [343]:
# DC -> Philadelphia flights on 10/27: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0rh6k./m/0dclg.2019-10-27;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-27-2019', origination='DC', destination='Philadelphia')
flights.to_csv('data/dc/flights_10-27-2019.csv')

### Weekend 2: 10/31-11/03

In [345]:
# Philadelphia -> DC flights on 10/31: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0dclg./m/0rh6k.2019-10-31;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='10-31-2019', origination='Philadelphia', destination='DC')
flights.to_csv('data/dc/flights_10-31-2019.csv')

In [346]:
# DC -> Philadelphia flights on 11/03: scrape, tag each row with the trip, save.
flights = scrape_flights(
    'https://www.google.com/flights#flt=/m/0rh6k./m/0dclg.2019-11-03;c:USD;e:1;sd:1;t:f;tt:o'
).assign(date='11-03-2019', origination='DC', destination='Philadelphia')
flights.to_csv('data/dc/flights_11-03-2019.csv')

In [213]:
# Scratch run of the best-flights scrape for a single URL (DC -> Philadelphia,
# 10/27, going by how the DC cells label this same URL).
url = 'https://www.google.com/flights#flt=/m/0rh6k./m/0dclg.2019-10-27;c:USD;e:1;sd:1;t:f;tt:o'

# Kept as a notebook global: this cell reads it for the wait below.
delay = 3 # seconds

driver = webdriver.Chrome('/Users/lucy/Documents/Research/chromedriver')
try:
    driver.get(url)

    # Wait for the best-flights heading; on timeout, parse whatever rendered
    try:
        myElem = WebDriverWait(driver, delay).until(EC.presence_of_element_located((By.ID,
                                                                                    'gws-flights-results__best_flights_heading')))
    except TimeoutException:
        print("Loading took too much time!")

    soup = BeautifulSoup(driver.page_source, 'lxml')
finally:
    # The browser used to be left running after this cell; always release it.
    driver.quit()

# Get list of best flights (both lookups can come back empty after a timeout)
best_flight_heading = soup.find('span', {'id': 'gws-flights-results__best_flights_heading'})
best_flights_list = None
if best_flight_heading is not None:
    best_flights_list = best_flight_heading.find_next('ol', {'class': 'gws-flights-results__result-list'})
best_flights = []
if best_flights_list is not None:
    best_flights = best_flights_list.find_all('li', {'class': 'gws-flights-results__result-item'})

# Collect time, duration and price for each flight and build the frame once
# (DataFrame.append in a loop is quadratic and was removed in pandas 2.0).
flight_data = pd.DataFrame(
    [
        {
            'time': flight.find('div', {'class': 'gws-flights-results__times'}).text,
            'duration': flight.find('div', {'class': 'gws-flights-results__duration'}).text,
            'price': flight.find('div', {'class': 'gws-flights-results__price'}).text,
        }
        for flight in best_flights
    ],
    columns=['time', 'duration', 'price'],
)

Page is ready!


# Wanderu

In [304]:
def scrape_wanderu(url):
    """Scrape the search results from a Wanderu results page.

    Opens the page in Chrome, dismisses the element with class
    'closeButton-PleSU', clicks a set of filter controls, expands the list by
    clicking "See more" until that button is gone, then parses the page.

    Parameters
    ----------
    url : str
        Wanderu search URL to open.

    Returns
    -------
    pandas.DataFrame
        One row per search result with the raw text of its 'depart',
        'arrive', 'duration', 'price' and 'carrier' elements, indexed
        0..n-1 (the old row-by-row append left every row with index 0).
        Empty (columns only) when no results are found.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the close button or one of the filter controls is not on the page.
    IndexError
        If fewer than two elements match the checkbox id.
    """
    columns = ['depart', 'arrive', 'duration', 'price', 'carrier']

    driver = webdriver.Chrome('/Users/lucy/Documents/Research/chromedriver')
    try:
        driver.get(url)
        driver.execute_script("window.scrollTo(0, 200)")

        # Locators use find_element(By...) throughout, matching the "See more"
        # lookup below; the find_element_by_* helpers are gone in Selenium 4.3+.
        driver.find_element(By.CLASS_NAME, 'closeButton-PleSU').click()

        # NOTE(review): the id is an opaque hash shared by several elements;
        # presumably these are filter checkboxes - confirm which ones
        # positions 1 and 3 correspond to.
        checkboxes = driver.find_elements(By.ID, 'f996b5217258d2bbce3d56aed74be479dcad59a70a093673f6720b13327976ae-span')
        checkboxes[1].click()
        if len(checkboxes) > 3:
            checkboxes[3].click()

        driver.find_element(By.ID, 'directOnly-span').click()
        driver.find_element(By.ID, 'allowNearby-span').click()

        # Keep clicking "See more" until the button no longer exists
        while True:
            try:
                driver.find_element(By.XPATH, '//button[text()="See more"]').click()
            except NoSuchElementException:
                break

        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Any failed lookup above used to leave the browser running; quit()
        # ends the whole WebDriver session whether or not the scrape worked.
        driver.quit()

    best_buses = soup.find_all('div', {'class': 'searchResult-3dUQ8'})

    # Build the frame once from a list of records (DataFrame.append in a
    # loop is quadratic and was removed in pandas 2.0).
    records = [
        {
            'depart': bus.find('div', {'aria-label': 'depart'}).text,
            'arrive': bus.find('div', {'aria-label': 'arrive'}).text,
            'duration': bus.find('span', {'aria-label': 'Duration'}).text,
            'price': bus.find('span', {'aria-label': 'Price'}).text,
            'carrier': bus.find('div', {'class': 'carrierName-3UUEV'}).text,
        }
        for bus in best_buses
    ]
    return pd.DataFrame(records, columns=columns)

## Boston

### Weekend 0: 10/17-10/20

In [294]:
# Philadelphia -> Boston Wanderu results on 10/17: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/Boston%2C%20MA%2C%20USA/2019-10-17/?cur=USD&dpid=ChIJGzE9DS1l44kRoOhiASS_fHg&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-17-2019', origination='Philadelphia', destination='Boston')
bus_train.to_csv('data/boston/bus_train_10-17-2019.csv')

In [301]:
# Boston -> Philadelphia Wanderu results on 10/20: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Boston%2C%20MA%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-10-20/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJGzE9DS1l44kRoOhiASS_fHg'
).assign(date='10-20-2019', origination='Boston', destination='Philadelphia')
bus_train.to_csv('data/boston/bus_train_10-20-2019.csv')

### Weekend 1: 10/24-10/27

In [296]:
# Philadelphia -> Boston Wanderu results on 10/24: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/Boston%2C%20MA%2C%20USA/2019-10-24/?cur=USD&dpid=ChIJGzE9DS1l44kRoOhiASS_fHg&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-24-2019', origination='Philadelphia', destination='Boston')
bus_train.to_csv('data/boston/bus_train_10-24-2019.csv')

In [302]:
# Boston -> Philadelphia Wanderu results on 10/27: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Boston%2C%20MA%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-10-27/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJGzE9DS1l44kRoOhiASS_fHg'
).assign(date='10-27-2019', origination='Boston', destination='Philadelphia')
bus_train.to_csv('data/boston/bus_train_10-27-2019.csv')

### Weekend 2: 10/31-11/03

In [300]:
# Philadelphia -> Boston Wanderu results on 10/31: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/Boston%2C%20MA%2C%20USA/2019-10-31/?cur=USD&dpid=ChIJGzE9DS1l44kRoOhiASS_fHg&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-31-2019', origination='Philadelphia', destination='Boston')
bus_train.to_csv('data/boston/bus_train_10-31-2019.csv')

In [303]:
# Boston -> Philadelphia Wanderu results on 11/03: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Boston%2C%20MA%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-11-03/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJGzE9DS1l44kRoOhiASS_fHg'
).assign(date='11-03-2019', origination='Boston', destination='Philadelphia')
bus_train.to_csv('data/boston/bus_train_11-03-2019.csv')

## NYC

### Weekend 0: 10/17-10/20

In [314]:
# Philadelphia -> NYC Wanderu results on 10/17: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/New%20York%2C%20NY%2C%20USA/2019-10-17/?cur=USD&dpid=ChIJOwg_06VPwokRYv534QaPC8g&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-17-2019', origination='Philadelphia', destination='NYC')
bus_train.to_csv('data/nyc/bus_train_10-17-2019.csv')

In [317]:
# NYC -> Philadelphia Wanderu results on 10/20: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/New%20York%2C%20NY%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-10-20/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJOwg_06VPwokRYv534QaPC8g'
).assign(date='10-20-2019', origination='NYC', destination='Philadelphia')
bus_train.to_csv('data/nyc/bus_train_10-20-2019.csv')

### Weekend 1: 10/24-10/27

In [315]:
# Philadelphia -> NYC Wanderu results on 10/24: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/New%20York%2C%20NY%2C%20USA/2019-10-24/?cur=USD&dpid=ChIJOwg_06VPwokRYv534QaPC8g&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-24-2019', origination='Philadelphia', destination='NYC')
bus_train.to_csv('data/nyc/bus_train_10-24-2019.csv')

In [318]:
# NYC -> Philadelphia Wanderu results on 10/27: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/New%20York%2C%20NY%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-10-27/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJOwg_06VPwokRYv534QaPC8g'
).assign(date='10-27-2019', origination='NYC', destination='Philadelphia')
bus_train.to_csv('data/nyc/bus_train_10-27-2019.csv')

### Weekend 2: 10/31-11/03

In [316]:
# Philadelphia -> NYC Wanderu results on 10/31: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/New%20York%2C%20NY%2C%20USA/2019-10-31/?cur=USD&dpid=ChIJOwg_06VPwokRYv534QaPC8g&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-31-2019', origination='Philadelphia', destination='NYC')
bus_train.to_csv('data/nyc/bus_train_10-31-2019.csv')

In [320]:
# NYC -> Philadelphia Wanderu results on 11/03: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/New%20York%2C%20NY%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-11-03/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJOwg_06VPwokRYv534QaPC8g'
).assign(date='11-03-2019', origination='NYC', destination='Philadelphia')
bus_train.to_csv('data/nyc/bus_train_11-03-2019.csv')

## DC

### Weekend 0: 10/17-10/20

In [308]:
# Philadelphia -> DC Wanderu results on 10/17: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/Washington%2C%20DC%2C%20USA/2019-10-17/?cur=USD&dpid=ChIJW-T2Wt7Gt4kRKl2I1CJFUsI&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-17-2019', origination='Philadelphia', destination='DC')
bus_train.to_csv('data/dc/bus_train_10-17-2019.csv')

In [309]:
# DC -> Philadelphia Wanderu results on 10/20: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Washington%2C%20DC%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-10-20/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJW-T2Wt7Gt4kRKl2I1CJFUsI'
).assign(date='10-20-2019', origination='DC', destination='Philadelphia')
bus_train.to_csv('data/dc/bus_train_10-20-2019.csv')

### Weekend 1: 10/24-10/27

In [306]:
# Philadelphia -> DC Wanderu results on 10/24: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/Washington%2C%20DC%2C%20USA/2019-10-24/?cur=USD&dpid=ChIJW-T2Wt7Gt4kRKl2I1CJFUsI&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-24-2019', origination='Philadelphia', destination='DC')
bus_train.to_csv('data/dc/bus_train_10-24-2019.csv')

In [312]:
# DC -> Philadelphia Wanderu results on 10/27: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Washington%2C%20DC%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-10-27/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJW-T2Wt7Gt4kRKl2I1CJFUsI'
).assign(date='10-27-2019', origination='DC', destination='Philadelphia')
bus_train.to_csv('data/dc/bus_train_10-27-2019.csv')

### Weekend 2: 10/31-11/03

In [307]:
# Philadelphia -> DC Wanderu results on 10/31: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Philadelphia%2C%20PA%2C%20USA/Washington%2C%20DC%2C%20USA/2019-10-31/?cur=USD&dpid=ChIJW-T2Wt7Gt4kRKl2I1CJFUsI&opid=ChIJ60u11Ni3xokRwVg-jNgU9Yk'
).assign(date='10-31-2019', origination='Philadelphia', destination='DC')
bus_train.to_csv('data/dc/bus_train_10-31-2019.csv')

In [313]:
# DC -> Philadelphia Wanderu results on 11/03: scrape, tag each row with the trip, save.
bus_train = scrape_wanderu(
    'https://www.wanderu.com/en-us/depart/Washington%2C%20DC%2C%20USA/Philadelphia%2C%20PA%2C%20USA/2019-11-03/?cur=USD&dpid=ChIJ60u11Ni3xokRwVg-jNgU9Yk&opid=ChIJW-T2Wt7Gt4kRKl2I1CJFUsI'
).assign(date='11-03-2019', origination='DC', destination='Philadelphia')
bus_train.to_csv('data/dc/bus_train_11-03-2019.csv')