# Travel Destination Recommendation Application

### Project Goal
1. To recommend travel localtions based on users' requirements
2. Prelimilary settings: focusing on flight tickets, budgetary, cost driven
3. Extended: hotel prices, weathers, special occasions, ongoing status...

### Users
Everyone who loves traveling

### Use Cases
1. Real time recommendation: When the user enter the current location (city), return top 10 recommendation from low to high flight ticket price
2. In a certain timeframe, recommend top 10 traveling location from current location
3. Given a destination, return the rank of months to travel from current location

In [31]:
# Build a web scraping bot for best ticket price based on Kayak

from time import sleep, strftime
from random import randint
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

In [32]:
# Use Chromedriver to open Chrome

chromedriver_path = '/usr/local/Caskroom/chromedriver/83.0.4103.39/chromedriver'
driver = webdriver.Chrome(executable_path=chromedriver_path) # This will open the Chrome window
sleep(2)

In [33]:
# Define page scraping function

def page_scrape():
    """This is the function that does the scraping!"""
    
    xp_sections = '(//*[@class="section duration allow-multi-modal-icons"])[position()<3]'
    sections = driver.find_elements_by_xpath(xp_sections)
    sections_list = [value.text for value in sections]
    section_a_list = sections_list[::2]  # This is to separate the two flights
    section_b_list = sections_list[1::2] # This is to separate the two flights
    
    # If running into reCaptcha, add a sleep
    # If returned an empty list, use SystemExit, to test everything from the start
#    if section_a_list == []:
#        raise SystemExit
        
    # letter A for outbound flight; letter B for inbound flight
    a_duration = []
    a_section_names = []
    for n in section_a_list:
        # Separate time from the cities
        a_section_names.append(''.join(n.split()[2:5]))
        a_duration.append(''.join(n.split()[0:2]))
    b_duration = []
    b_section_names = []
    for n in section_b_list:
        # Separate time from the cities
        b_section_names.append(''.join(n.split()[2:5]))
        b_duration.append(''.join(n.split()[0:2]))
        
    xp_dates = '(//div[@class="section date"])[position()<3]'
    dates = driver.find_elements_by_xpath(xp_dates)
    dates_list = [value.text for value in dates]
    a_date_list = dates_list[::2]
    b_date_list = dates_list[1::2]
    # Separate the weekday from the day
    a_day = [value.split()[0] for value in a_date_list]
    a_weekday = [value.split()[1] for value in a_date_list]
    b_day = [value.split()[0] for value in b_date_list]
    b_weekday = [value.split()[1] for value in b_date_list]
    
    # Get the prices
    xp_prices = '(//span[contains(@id,"price-text") and contains(@class,"price-text")])[position()=1]'
    prices = driver.find_elements_by_xpath(xp_prices)
    prices_list = [price.text.replace('$', '') for price in prices if price.text != '']
#   prices_list = list(map(int, prices_list))
    
    # The stops are a big list with one leg at the even index and second leg on odd index
    xp_stops = '(//div[@class="section stops"]/div[1])[position()<3]'
    stops = driver.find_elements_by_xpath(xp_stops)
    stops_list = [stop.text[0].replace('n', '0') for stop in stops]
    a_stop_list = stops_list[::2]
    b_stop_list = stops_list[1::2]
    
    xp_stops_cities = '(//div[@class="section stops"]/div[2])[position()<3]'
    stops_cities = driver.find_elements_by_xpath(xp_stops_cities)
    stops_cities_list = [stop.text for stop in stops_cities]
    a_stop_name_list = stops_cities_list[::2]
    b_stop_name_list = stops_cities_list[1::2]
    
    # This part gets the airline company and the departure and arrival times for both legs
    xp_schedule = '(//div[@class="section times"])[position()<3]'
    schedules = driver.find_elements_by_xpath(xp_schedule)
    hours_list = []
    carrier_list = []
    for schedule in schedules:
        hours_list.append(schedule.text.split('\n')[0])
        carrier_list.append(schedule.text.split('\n')[1])
    # Split the hours and carriers, between a and b legs
    a_hours = hours_list[::2]
    a_carrier = carrier_list[::2]
    b_hours = hours_list[1::2]
    b_carrier = carrier_list[1::2]
    
    # Create dataframe
#    cols = (['Out Day', 'Out Time', 'Out Weekday', 'Out Airline', 'Out Cities', 'Out Duration', 'Out Stops', 'Out Stop Cities',
#            'Return Day', 'Return Time', 'Return Weekday', 'Return Airline', 'Return Cities', 'Return Duration', 'Return Stops', 'Return Stop Cities',
#            'Price'])
    
    flights_df = {'Out Day': next(iter(a_day)),
                  'Out Weekday': a_weekday[-1],
                  'Out Duration': a_duration[-1],
                  'Out Cities': a_section_names[-1],
                  'Return Day': b_day[-1],
                  'Return Weekday': b_weekday[-1],
                  'Return Duration': b_duration[-1],
                  'Return Cities': b_section_names[-1],
                  'Out Stops': a_stop_list[-1],
                  'Out Stop Cities': a_stop_name_list[-1],
                  'Return Stops': b_stop_list[-1],
                  'Return Stop Cities': b_stop_name_list[-1],
                  'Out Time': a_hours[-1],
                  'Out Airline': a_carrier[-1],
                  'Return Time':b_hours[-1],
                  'Return Airline': b_carrier[-1],
                  'Price': prices_list[-1]}
    
    value_only = flights_df.values()
    
#   flights_df['timestamp'] = strftime("%Y%m%d-%H%M") # so we can know when it was scraped
    return value_only

In [34]:
# Call Kayak

def start_kayak(city_from, city_to, date_start, date_end):
    """City codes - it's IATA codes!
       Date format - YYYY-MM-DD"""
    
    kayak = 'http://www.kayak.com/flights/' + city_from + '-' + city_to + '/' + date_start + '-flexible/' + date_end + '-flexible?sort=bestflight_a'
    driver.get(kayak)
    sleep(randint(8,10))
    
    # Sometimes a popup shows up, so we can use a try statement to check it and close (COVID19)
    try:
        xp_popup_close = '//button[contains(@id,"dialog-close") and contains(@class,"Button-No-Standard-Style close")]'
        driver.find_elements_by_xpath(xp_popup_close)[8].click()
    except Exception as e:
        pass
    sleep(randint(8,10))
    
    # Sometimes a popup shows up, so we can use a try statement to check it and close
    try:
        xp_popup_close = '//button[contains(@id,"dialog-close") and contains(@class,"Button-No-Standard-Style close")]'
        driver.find_elements_by_xpath(xp_popup_close)[9].click()
    except Exception as e:
        pass
    sleep(randint(8,10))
    
    
    # Scape the "best" price
    df_flights_best = page_scrape()
#   df_flights_best['sort'] = 'best'
    sleep(randint(3,6))

    # Let's also get the lowest prices from the matrix on top
#    matrix = driver.find_elements_by_xpath('//*[contains(@id,"FlexMatrixCell")]')
#    matrix_prices = [price.text.replace('$','') for price in matrix]
#    matrix_prices = list(map(int, matrix_prices))
#    matrix_min = min(matrix_prices)
#    matrix_avg = sum(matrix_prices)/len(matrix_prices)
    
    return df_flights_best

In [35]:
# Iterating through airports

#def iteration(my_airport, depart_date, return_date):
#    """This is where users enter their local airport!"""
my_airport = 'SFO'
depart_date = '2020-08-25'
return_date = '2020-08-28'

airports = pd.read_csv('airport_data.csv')
airports_code = airports['IATA']

cols = ['Out Day', 'Out Weekday', 'Out Duration', 'Out Cities', 'Return Day', 'Return Weekday', 'Return Duration', 'Return Cities', 'Out Stops', 'Out Stop Cities', 'Return Stops', 'Return Stop Cities', 'Out Time', 'Out Airline', 'Return Time', 'Return Airline', 'Price']
df = []

for code in airports_code:
    
    df.append(start_kayak(my_airport, code, depart_date, return_date))
    
    
df = pd.DataFrame(df, columns=cols)
df.sort_values(by='Price')
df

Unnamed: 0,Out Day,Out Weekday,Out Duration,Out Cities,Return Day,Return Weekday,Return Duration,Return Cities,Out Stops,Out Stop Cities,Return Stops,Return Stop Cities,Out Time,Out Airline,Return Time,Return Airline,Price
0,8/28,Fri,23h34m,SFO‐ATL,8/29,Sat,15h47m,ATL‐SFO,2,"DEN, IAH",1,DEN,11:00 am – 1:34 pm +1,"Frontier, Spirit Airlines",9:28 pm – 10:15 am +1,Frontier,188
1,8/25,Tue,1h30m,SFO‐LAX,8/28,Fri,1h25m,LAX‐SFO,0,,0,,5:40 pm – 7:10 pm,Southwest,12:15 pm – 1:40 pm,Southwest,Info
2,8/24,Mon,4h14m,SFO‐ORD,8/27,Thu,4h37m,ORD‐SFO,0,,0,,8:06 am – 2:20 pm,American Airlines,2:20 pm – 4:57 pm,American Airlines,597
3,8/26,Wed,3h29m,SFO‐DFW,8/29,Sat,3h38m,DFW‐SFO,0,,0,,11:50 am – 5:19 pm,American Airlines,3:05 pm – 4:43 pm,American Airlines,497
