# Travel Destination Recommendation Application

### Project Goal
1. To recommend travel localtions based on users' requirements
2. Prelimilary settings: focusing on flight tickets, budgetary, cost driven
3. Extended: hotel prices, weathers, special occasions, ongoing status...

### Users
Everyone who loves traveling

### Use Cases
1. Real time recommendation: When the user enter the current location (city), return top 10 recommendation from low to high flight ticket price
2. In a certain timeframe, recommend top 10 traveling location from current location
3. Given a destination, return the rank of months to travel from current location

In [25]:
# Build a web scraping bot for best ticket price based on Kayak

from time import sleep, strftime
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [26]:
# Use Chromedriver to open Chrome

chromedriver_path = '/usr/local/Caskroom/chromedriver/83.0.4103.39/chromedriver'
driver = webdriver.Chrome(executable_path=chromedriver_path) # This will open the Chrome window
sleep(2)

In [27]:
# Define page scraping function

def page_scrape():
    """This is the function that does the scraping!"""
    
    xp_sections = '(//*[@class="section duration allow-multi-modal-icons"])[position()<5]'
    sections = driver.find_elements_by_xpath(xp_sections)
    sections_list = [value.text for value in sections]
    section_a_list = sections_list[::2]  # This is to separate the two flights
    section_b_list = sections_list[1::2] # This is to separate the two flights
    
    # If running into reCaptcha, add a sleep
    # If returned an empty list, use SystemExit, to test everything from the start
    if section_a_list == []:
        raise SystemExit
        
    # letter A for outbound flight; letter B for inbound flight
    a_duration = []
    a_section_names = []
    for n in section_a_list:
        # Separate time from the cities
        a_section_names.append(''.join(n.split()[2:5]))
        a_duration.append(''.join(n.split()[0:2]))
    b_duration = []
    b_section_names = []
    for n in section_b_list:
        # Separate time from the cities
        b_section_names.append(''.join(n.split()[2:5]))
        b_duration.append(''.join(n.split()[0:2]))
        
    xp_dates = '(//div[@class="section date"])[position()<5]'
    dates = driver.find_elements_by_xpath(xp_dates)
    dates_list = [value.text for value in dates]
    a_date_list = dates_list[::2]
    b_date_list = dates_list[1::2]
    # Separate the weekday from the day
    a_day = [value.split()[0] for value in a_date_list]
    a_weekday = [value.split()[1] for value in a_date_list]
    b_day = [value.split()[0] for value in b_date_list]
    b_weekday = [value.split()[1] for value in b_date_list]
    
    # Get the prices
    xp_prices = '(//span[contains(@id,"price-text") and contains(@class,"price-text")])[position()<5]'
    prices = driver.find_elements_by_xpath(xp_prices)
    prices_list = [price.text.replace('$', '') for price in prices if price.text != '']
    prices_list = list(map(int, prices_list))
    
    # The stops are a big list with one leg at the even index and second leg on odd index
    xp_stops = '(//div[@class="section stops"]/div[1])[position()<5]'
    stops = driver.find_elements_by_xpath(xp_stops)
    stops_list = [stop.text[0].replace('n', '0') for stop in stops]
    a_stop_list = stops_list[::2]
    b_stop_list = stops_list[1::2]
    
    xp_stops_cities = '(//div[@class="section stops"]/div[2])[position()<5]'
    stops_cities = driver.find_elements_by_xpath(xp_stops_cities)
    stops_cities_list = [stop.text for stop in stops_cities]
    a_stop_name_list = stops_cities_list[::2]
    b_stop_name_list = stops_cities_list[1::2]
    
    # This part gets the airline company and the departure and arrival times for both legs
    xp_schedule = '(//div[@class="section times"])[position()<5]'
    schedules = driver.find_elements_by_xpath(xp_schedule)
    hours_list = []
    carrier_list = []
    for schedule in schedules:
        hours_list.append(schedule.text.split('\n')[0])
        carrier_list.append(schedule.text.split('\n')[1])
    # Split the hours and carriers, between a and b legs
    a_hours = hours_list[::2]
    a_carrier = carrier_list[::2]
    b_hours = hours_list[1::2]
    b_carrier = carrier_list[1::2]
    
    # Create dataframe
    cols = (['Out Day', 'Out Time', 'Out Weekday', 'Out Airline', 'Out Cities', 'Out Duration', 'Out Stops', 'Out Stop Cities',
            'Return Day', 'Return Time', 'Return Weekday', 'Return Airline', 'Return Cities', 'Return Duration', 'Return Stops', 'Return Stop Cities',
            'Price'])
    
    flights_df = pd.DataFrame({'Out Day': a_day,
                               'Out Weekday': a_weekday,
                               'Out Duration': a_duration,
                               'Out Cities': a_section_names,
                               'Return Day': b_day,
                               'Return Weekday': b_weekday,
                               'Return Duration': b_duration,
                               'Return Cities': b_section_names,
                               'Out Stops': a_stop_list,
                               'Out Stop Cities': a_stop_name_list,
                               'Return Stops': b_stop_list,
                               'Return Stop Cities': b_stop_name_list,
                               'Out Time': a_hours,
                               'Out Airline': a_carrier,
                               'Return Time': b_hours,
                               'Return Airline': b_carrier,
                               'Price': prices_list[:2]})[cols]
    
    flights_df['timestamp'] = strftime("%Y%m%d-%H%M") # so we can know when it was scraped
    return flights_df

In [28]:
# Call Kayak

def start_kayak(city_from, city_to, date_start, date_end):
    """City codes - it's IATA codes!
       Date format - YYYY-MM-DD"""
    
    kayak = 'http://www.kayak.com/flights/' + city_from + '-' + city_to + '/' + date_start + '-flexible/' + date_end + '-flexible?sort=bestflight_a'
    driver.get(kayak)
    sleep(randint(8,10))
    
    # Sometimes a popup shows up, so we can use a try statement to check it and close
    try:
        xp_popup_close = '//button[contains(@id,"dialog-close") and contains(@class,"Button-No-Standard-Style close")]'
        driver.find_elements_by_xpath(xp_popup_close)[8].click()
    except Exception as e:
        pass
    sleep(randint(8,10))
    print('loading more...')
    
    # Scape the "best" price
    print('starting the first scrape...')
    df_flights_best = page_scrape()
    df_flights_best['sort'] = 'best'
    sleep(randint(60,80))

    # Let's also get the lowest prices from the matrix on top
#    matrix = driver.find_elements_by_xpath('//*[contains(@id,"FlexMatrixCell")]')
#    matrix_prices = [price.text.replace('$','') for price in matrix]
#    matrix_prices = list(map(int, matrix_prices))
#    matrix_min = min(matrix_prices)
#    matrix_avg = sum(matrix_prices)/len(matrix_prices)
    
    print(df_flights_best)

In [10]:
# Testing

# city_from = input('From which city? ')
# city_to = input('Where to? ')
# date_start = input('Search around which departure date? Please use YYYY-MM-DD format only ')
# date_end = input('Return when? Please use YYYY-MM-DD format only ')

city_from = 'SFO'
city_to = 'LAX'
date_start = '2020-07-10'
date_end = '2020-07-12'

start_kayak(city_from, city_to, date_start, date_end)


loading more...
starting the first scrape...
  Out Day              Out Time Out Weekday      Out Airline Out Cities  \
0    7/11  9:30 pm – 6:15 am +1         Sat              Bus    SFO‐LAX   
1    7/13    12:40 pm – 2:19 pm         Mon  Alaska Airlines    SFO‐LAX   

  Out Duration Out Stops Out Stop Cities Return Day            Return Time  \
0        8h45m         0                       7/13  11:50 pm – 8:15 am +1   
1        1h39m         0                       7/13      5:20 pm – 6:43 pm   

  Return Weekday   Return Airline Return Cities Return Duration Return Stops  \
0            Mon              Bus       LAX‐SFO           8h25m            0   
1            Mon  Alaska Airlines       LAX‐SFO           1h23m            0   

  Return Stop Cities  Price      timestamp  sort  
0                       132  20200710-1505  best  
1                       221  20200710-1505  best  


In [29]:
# Iterating through airports

#def iteration(my_airport, depart_date, return_date):
#    """This is where users enter their local airport!"""
my_airport = 'SFO'
depart_date = '2020-07-10'
return_date = '2020-07-12'

airports = pd.read_csv('airport_data.csv')
airports_code = airports['IATA']
    
for code in airports_code:
    df = start_kayak(my_airport, code, depart_date, return_date)
    df['Destination'] = code
    print(df)

loading more...
starting the first scrape...


IndexError: list index out of range

In [10]:
# debugging area

chromedriver_path = '/usr/local/Caskroom/chromedriver/83.0.4103.39/chromedriver'
driver = webdriver.Chrome(executable_path=chromedriver_path) # This will open the Chrome window
sleep(2)

kayak = 'https://www.kayak.com/flights/SFO-LAX/2020-07-10-flexible/2020-07-12-flexible?sort=bestflight_a'
driver.get(kayak)
sleep(randint(8,10))

try:
    xp_popup_close = '//button[contains(@id,"dialog-close") and contains(@class,"Button-No-Standard-Style close")]'
    driver.find_elements_by_xpath(xp_popup_close)[8].click()
except Exception as e:
    pass
sleep(randint(8,10))
print('loading more...')
    
xp_prices = '(//span[contains(@id,"price-text") and contains(@class,"price-text")])[position()<5]'
prices = driver.find_elements_by_xpath(xp_prices)
prices_list = [price.text.replace('$','') for price in prices if price.text != '']
prices_list = list(map(int, prices_list))

print(prices_list[:2])

loading more...
[132, 221]


In [19]:
airports = pd.read_csv('airport_data.csv')
print(airports['IATA'])

0      BHM
1      ANC
2      PHX
3      TUS
4      LIT
      ... 
98     PPG
99     GUM
100    SPN
101    SJU
102    STT
Name: IATA, Length: 103, dtype: object
