# TechPoint XTern Data Science Project

In [1]:
# Import the libraries.
import csv
import os
import re

import gmaps
import googlemaps
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Step 1: Getting the Data
- Before we start working through an Indianapolis travel itinerary, we need places to see!
- We can find places by scraping the Top Indianapolis attractions from TripAdvisor using the BeautifulSoup library!


In [2]:
# Some preliminary steps:

# TripAdvisor listing page for Indianapolis attractions.
url = ('https://www.tripadvisor.com/Attractions-g37209-Activities-a_allAttractions.true-Indianapolis_Indiana.html')

# Headers sent with every request (presumably so the site serves its regular
# browser HTML -- TODO confirm they are still required).
# The User-Agent is assembled from adjacent string literals: a backslash
# continuation inside a single literal would embed the following line's
# leading tabs in the header value.
user_agent = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/90.0.4430.212 Safari/537.36'),
    'Accept-Language': 'en-US, en;q=0.5',
}

def get_page_contents(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    The request is sent with the module-level ``user_agent`` headers.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never parsed as if it were the requested page.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    page = requests.get(url, headers=user_agent, timeout=timeout)
    page.raise_for_status()
    return BeautifulSoup(page.text, 'html.parser')

# Creating a BeautifulSoup object.
soup = get_page_contents(url)

# Finding and extracting elements.
# We will create lists for each category of information we want in our table.

# Titles appear to be rendered as "<rank>. <name>" (TODO confirm against the
# live page). Only that leading rank is removed: stripping digits and periods
# from both ends would also damage names that themselves start or end with a
# digit or a period.
places = []
for name in soup.find_all('div', {'class': 'XfVdV o AIbhI'}):
    places.append(re.sub(r'^\s*\d+\.\s*', '', name.text).strip())
    
# The score is read from the aria-label of the SVG inside each rating element:
# its first three characters are kept and "/5" is appended.
ratings = [
    bubble.svg['aria-label'][:3] + "/5"
    for bubble in soup.find_all('div', {'class': 'jVDab o W f u w JqMhy'})
]
    
# One string of tags per attraction card, e.g. "Parks • Gardens • DownTown • ".
categories = []
for listing in soup.find_all('section', {'class': 'jemSU'}):
    for tag_row in listing.find_all('div', {'class': 'alPVI eNNhq PgLKC tnGGX yzLvM'}):
        labels = (
            tag.text.replace('Downtown Indianapolis', 'DownTown')
            for tag in tag_row.find_all('div', {'class': 'biGQs _P pZUbB hmDzD'})
        )
        # Every label, the last one included, is followed by the bullet delimiter.
        categories.append("".join(label + " • " for label in labels))

# These require 'deeper' URLs
def scrape_attraction_details(detail_page):
    """Pull address, opening hours and visit duration from one attraction page.

    Exactly one value is returned for each field, so the three result lists
    built below always stay aligned with the attractions on the listing page.

    Args:
        detail_page: BeautifulSoup tree of a single attraction's page.

    Returns:
        A tuple ``(address, hours, duration)`` of strings. A field that cannot
        be found is reported as ``'N/A'`` (address) or ``'NaN'`` (hours and
        duration), the placeholders the rest of the notebook filters on.
    """
    # Address: first matching span whose text starts with a digit.
    # NOTE(review): assumes that digit is a street number -- confirm no other
    # digit-leading text (e.g. a phone number) shares this markup.
    address_spans = (
        span
        for button in detail_page.find_all(
            'button', {'class': 'UikNM _G B- _S _T c G_ P0 wSSLS wnNQG raEkE'})
        for span in button.find_all('span', {'class': 'biGQs _P XWJSj Wb'})
    )
    # `text[:1]` instead of `text[0]` so an empty span cannot raise IndexError.
    address = next(
        (span.text for span in address_spans if span.text[:1].isdigit()),
        'N/A',
    )

    # Hours of operation.
    hours_tag = detail_page.find('span', {'class': 'EFKKt'})
    hours = hours_tag.text if hours_tag is not None else 'NaN'

    # Time spent: first ribbon that carries a duration element, if any.
    duration_tags = (
        ribbon.find('div', {'class': '_c'})
        for ribbon in detail_page.find_all('div', {'class': 'tyUdl'})
    )
    duration = next(
        (tag.text for tag in duration_tags if tag is not None),
        'NaN',
    )

    return address, hours, duration


addresses = []
hours_open = []
estimated_time_spent = []
for link in soup.find_all('div', {'class': 'alPVI eNNhq PgLKC tnGGX'}):
    # The sub-url is the page of each attraction...
    detail_page = get_page_contents('https://www.tripadvisor.com' + link.a['href'])
    place_address, place_hours, place_duration = scrape_attraction_details(detail_page)

    # One entry per attraction in every list. An attraction without a street
    # address gets 'N/A' right here, which replaces the former manual patch
    # `addresses.insert(10, 'N/A')`: that patch depended on the ranking on the
    # day of the scrape and broke the alignment when its cell was run twice.
    addresses.append(place_address)
    hours_open.append(place_hours)
    estimated_time_spent.append(place_duration)

### The lists will be condensed into a dictionary, which will be turned into a pandas DataFrame

In [4]:
# Column name -> scraped list. Deliberately not named `dict`: rebinding that
# name would hide the built-in `dict` type for the rest of the kernel session.
places_data = {'Location Name': places,
               'Rating': ratings,
               'Type': categories,
               'Address': addresses,
               'Hours of Operation': hours_open,
               'Estimated Time': estimated_time_spent}

# The lists are scraped independently, so check that they line up before
# building the table; the message says which column is off.
column_lengths = {column: len(values) for column, values in places_data.items()}
if len(set(column_lengths.values())) > 1:
    raise ValueError(
        'Scraped columns have different lengths: {}'.format(column_lengths))

# Create the dataframe.
indy_places = pd.DataFrame(places_data)
# Let's see how this looks.
indy_places.head()

Unnamed: 0,Location Name,Rating,Type,Address,Hours of Operation,Estimated Time
0,Children's Museum of Indianapolis,4.5/5,Children's Museums • Open now •,"3000 N Meridian St, Indianapolis, IN 46208-4716",10:00 AM - 5:00 PM,More than 3 hours
1,Indianapolis Motor Speedway Museum,4.5/5,Speciality Museums • Auto Racing Tracks • Open...,"4750 W 16th St, Indianapolis, IN 46222-2550",10:00 AM - 4:00 PM,1-2 hours
2,Indianapolis Museum of Art,4.5/5,Speciality Museums • Historic Sites • Open now •,"4000 Michigan Rd, Indianapolis, IN 46208-4196",11:00 AM - 5:00 PM,More than 3 hours
3,Lucas Oil Stadium,4.5/5,Sports Complexes • DownTown •,"500 S Capitol Ave, Indianapolis, IN 46225-1117",,
4,Central Canal,4.5/5,Points of Interest & Landmarks • DownTown •,"337 W 11th St, Indianapolis, IN 46202-3001",,


## Step 2: Creating xy-coordinates from the address
- This will be done using the GoogleMaps API.
    - My own developer key was used in this example; you will need to create your own key to re-run this code.

In [5]:
# Add empty placeholder columns for the coordinates: latitude and longitude.
for coordinate_column in ("Lat", "Long"):
    indy_places[coordinate_column] = ''
indy_places.head()

Unnamed: 0,Location Name,Rating,Type,Address,Hours of Operation,Estimated Time,Lat,Long
0,Children's Museum of Indianapolis,4.5/5,Children's Museums • Open now •,"3000 N Meridian St, Indianapolis, IN 46208-4716",10:00 AM - 5:00 PM,More than 3 hours,,
1,Indianapolis Motor Speedway Museum,4.5/5,Speciality Museums • Auto Racing Tracks • Open...,"4750 W 16th St, Indianapolis, IN 46222-2550",10:00 AM - 4:00 PM,1-2 hours,,
2,Indianapolis Museum of Art,4.5/5,Speciality Museums • Historic Sites • Open now •,"4000 Michigan Rd, Indianapolis, IN 46208-4196",11:00 AM - 5:00 PM,More than 3 hours,,
3,Lucas Oil Stadium,4.5/5,Sports Complexes • DownTown •,"500 S Capitol Ave, Indianapolis, IN 46225-1117",,,,
4,Central Canal,4.5/5,Points of Interest & Landmarks • DownTown •,"337 W 11th St, Indianapolis, IN 46202-3001",,,,


In [6]:
# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so that
# no credential is stored in the notebook (a missing variable raises KeyError).
# NOTE(review): a key that was ever committed in this cell should be treated as
# compromised and revoked.
# NOTE(review): the name `gmaps` is kept for compatibility, but it rebinds the
# `gmaps` module imported at the top of the notebook.
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])

# Look up each address once and collect the coordinates in plain lists.
# Whole columns are assigned afterwards: writing cell by cell through
# `indy_places['Lat'][i] = ...` is chained assignment, which pandas warns about
# and which does not modify the frame when copy-on-write is enabled.
latitudes = []
longitudes = []
for address in indy_places['Address']:
    # The 'N/A' placeholder is not a real address, so it is never sent to Google.
    geocode_result = gmaps.geocode(address) if address != 'N/A' else []

    if geocode_result:
        location = geocode_result[0]['geometry']['location']
        latitudes.append(location['lat'])
        longitudes.append(location['lng'])
    else:
        # Same missing-value marker the notebook uses for its other columns.
        latitudes.append('NaN')
        longitudes.append('NaN')

indy_places['Lat'] = latitudes
indy_places['Long'] = longitudes

# Let's see how it all looks
indy_places.head()

Unnamed: 0,Location Name,Rating,Type,Address,Hours of Operation,Estimated Time,Lat,Long
0,Children's Museum of Indianapolis,4.5/5,Children's Museums • Open now •,"3000 N Meridian St, Indianapolis, IN 46208-4716",10:00 AM - 5:00 PM,More than 3 hours,39.810613,-86.157892
1,Indianapolis Motor Speedway Museum,4.5/5,Speciality Museums • Auto Racing Tracks • Open...,"4750 W 16th St, Indianapolis, IN 46222-2550",10:00 AM - 4:00 PM,1-2 hours,39.790315,-86.233661
2,Indianapolis Museum of Art,4.5/5,Speciality Museums • Historic Sites • Open now •,"4000 Michigan Rd, Indianapolis, IN 46208-4196",11:00 AM - 5:00 PM,More than 3 hours,39.83011,-86.186241
3,Lucas Oil Stadium,4.5/5,Sports Complexes • DownTown •,"500 S Capitol Ave, Indianapolis, IN 46225-1117",,,39.760101,-86.163888
4,Central Canal,4.5/5,Points of Interest & Landmarks • DownTown •,"337 W 11th St, Indianapolis, IN 46202-3001",,,39.782034,-86.165996


## Step 3: Cleaning the Data
- We have an `Hours of Operation` column that's in a cumbersome format. Let's separate it out into two columns:
    - `closing_time` and `opening_time`
    - The original column will be split up using the `.split` method

In [7]:
# Split values such as "10:00 AM - 5:00 PM" at the dash with pandas' vectorised
# string methods. The first piece is the opening time and the last piece the
# closing time; a value without a dash (the 'NaN' placeholder) therefore lands
# unchanged in both columns.
hours_parts = indy_places['Hours of Operation'].str.split('-')

# Strip the spaces left around the dash so the times are clean
# ("10:00 AM", not "10:00 AM ").
open_time = hours_parts.str[0].str.strip()
close_time = hours_parts.str[-1].str.strip()

indy_places['opening_time'] = open_time
indy_places['closing_time'] = close_time

# The combined column is no longer needed.
indy_places = indy_places.drop(columns=['Hours of Operation'])

indy_places.head()

Unnamed: 0,Location Name,Rating,Type,Address,Estimated Time,Lat,Long,opening_time,closing_time
0,Children's Museum of Indianapolis,4.5/5,Children's Museums • Open now •,"3000 N Meridian St, Indianapolis, IN 46208-4716",More than 3 hours,39.810613,-86.157892,10:00 AM,5:00 PM
1,Indianapolis Motor Speedway Museum,4.5/5,Speciality Museums • Auto Racing Tracks • Open...,"4750 W 16th St, Indianapolis, IN 46222-2550",1-2 hours,39.790315,-86.233661,10:00 AM,4:00 PM
2,Indianapolis Museum of Art,4.5/5,Speciality Museums • Historic Sites • Open now •,"4000 Michigan Rd, Indianapolis, IN 46208-4196",More than 3 hours,39.83011,-86.186241,11:00 AM,5:00 PM
3,Lucas Oil Stadium,4.5/5,Sports Complexes • DownTown •,"500 S Capitol Ave, Indianapolis, IN 46225-1117",,39.760101,-86.163888,,
4,Central Canal,4.5/5,Points of Interest & Landmarks • DownTown •,"337 W 11th St, Indianapolis, IN 46202-3001",,39.782034,-86.165996,,


## Step 4: Filtering The Data
- We want places with clear opening and closing times.
- We want to make sure the trip goes smoothly, so we will pick places that are open during our 9:00 AM - 9:00 PM window.

In [8]:
# Keep only the attractions whose closing time is known ('NaN' marks a missing
# value), then renumber the remaining rows from zero: rows were dropped, and
# the next step picks attractions by position.
has_hours = indy_places['closing_time'].ne('NaN')
indy_places = indy_places.loc[has_hours].reset_index(drop=True)

## Step 5: Finding Activities from Filtered Data
- Let's try to fill a schedule from the data provided!

In [9]:
# Show only the columns that matter for building the schedule.
schedule_columns = ['Location Name', 'opening_time', 'closing_time', 'Estimated Time']
indy_places[schedule_columns]

Unnamed: 0,Location Name,opening_time,closing_time,Estimated Time
0,Children's Museum of Indianapolis,10:00 AM,5:00 PM,More than 3 hours
1,Indianapolis Motor Speedway Museum,10:00 AM,4:00 PM,1-2 hours
2,Indianapolis Museum of Art,11:00 AM,5:00 PM,More than 3 hours
3,Victory Field,9:00 AM,5:00 PM,
4,Indianapolis Zoo,9:00 AM,4:00 PM,More than 3 hours
5,Benjamin Harrison Presidential Site,10:00 AM,3:00 PM,1-2 hours
6,The Eiteljorg Museum,10:00 AM,5:00 PM,
7,Eagle Creek Park,7:00 AM,8:00 PM,
8,White River State Park,5:00 AM,11:00 PM,
9,Indiana Convention Center,8:00 AM,5:00 PM,


- The zoo opens at 9 and it should last until lunchtime. Let's use that as our first activity
    - `index = 4`, 9:00 AM - 12:00 PM, 3 hours
- Lunch can be centered around White River State Park. Let's give that an hour.
    - `index = 8`, 12:00 PM - 1:00 PM, 1 hour
- As a fan of history, I think the Harrison presidential site will be a good fit. A tour should last an hour or two and it closes at 3!
    - `index = 5`, 1:00 PM - 3:00 PM, 2 hours
- The Garfield Park Conservatory & Sunken Garden would be a nice intermission. It closes at 5 and should take about an hour.
    - `index = 10`, 3:00 PM - 4:00 PM, 1 hour
- The Indiana Repertory Theatre might add some variety to the trip. It closes at 6.
    - `index = 17`, 4:00 PM - 6:00 PM, 2 hours
- The Fashion Mall at Keystone can conclude our trip. It closes at 9 PM and has a lot of built-in amenities.
    - `index = 18`, 6:00 PM - 9:00 PM, 3 hours

In [10]:
# These are the row positions (in the filtered, re-indexed table) of the
# activities we picked, in visiting order.
# NOTE(review): positions depend on the order of the scraped ranking, so they
# must be re-checked whenever the data is scraped again.
indices = [4, 8, 5, 10, 17, 18]

# Select all chosen rows in one step and number them 0..n-1. This replaces
# filling an empty DataFrame row by row, which is slow and turns every column
# into the generic object dtype.
new_locations = indy_places.iloc[indices].reset_index(drop=True)

In [11]:
# Display the selected attractions to confirm the picks before exporting them.
new_locations

Unnamed: 0,Location Name,Rating,Type,Address,Estimated Time,Lat,Long,opening_time,closing_time
0,Indianapolis Zoo,4.0/5,Zoos • Open now •,"1200 W Washington St, Indianapolis, IN 46222-4500",More than 3 hours,39.76801,-86.180659,9:00 AM,4:00 PM
1,White River State Park,4.5/5,Parks • Gardens • DownTown • Open now •,"7840 W 56th St, Indianapolis, IN 46254-9706",,39.853102,-86.300291,5:00 AM,11:00 PM
2,Benjamin Harrison Presidential Site,4.5/5,Historic Sites • Open now •,"1230 N Delaware St, Indianapolis, IN 46202-2531",1-2 hours,39.784061,-86.15439,10:00 AM,3:00 PM
3,Garfield Park Conservatory & Sunken Garden,4.5/5,Gardens • Open now •,"650 W Washington St, Indianapolis, IN 46204-2725",1-2 hours,39.768692,-86.169612,10:00 AM,5:00 PM
4,Indiana Repertory Theatre,5.0/5,Theaters • DownTown • Open now •,"140 W Washington St, Indianapolis, IN 46204-3465",,39.767445,-86.161028,11:00 AM,6:00 PM
5,The Fashion Mall at Keystone,4.5/5,Shopping Malls • Open now •,"8702 Keystone Xing, Indianapolis, IN 46240-7621",,39.91462,-86.11124,10:00 AM,9:00 PM


## Step 6: Transporting Filtered Location Data to `.csv`
- Great, we now have 6 locations and have already decided on a time.
- Let's create a `.csv` file for the locations we selected.
- Let's create another file for the itinerary.

In [12]:
# Write the selected attractions to places.csv: the header row is included
# (the pandas default) and the row index is left out.
new_locations.to_csv('places.csv', index=False)

In [13]:
# The itinerary starts from the name, address and type of each selected place.
itinerary_columns = ['Location Name', 'Address', 'Type']
itinerary = new_locations[itinerary_columns]

In [14]:
# We've already decided on the times, let's list them out.
# Every range uses the same "<start> - <end>" format so the exported column is
# consistent (one entry used to read '4:00 PM- 6:00 PM').
time_of_day = ['9:00 AM - 12:00 PM',
               '12:00 PM - 1:00 PM',
               '1:00 PM - 3:00 PM',
               '3:00 PM - 4:00 PM',
               '4:00 PM - 6:00 PM',
               '6:00 PM - 9:00 PM']

time_spent = ['3 hours',
              '1 hour',
              '2 hours',
              '1 hour',
              '2 hours',
              '3 hours']

# Now these lists will be added as columns in the DataFrame.
# Start from a copy of the three base columns so this cell can be re-run:
# DataFrame.insert raises ValueError when the column already exists.
itinerary = itinerary[['Location Name', 'Address', 'Type']].copy()
itinerary.insert(0, 'Time of Day', time_of_day)   # first column
itinerary.insert(4, 'Time Spent', time_spent)     # last column

In [15]:
# Display the finished itinerary.
itinerary

Unnamed: 0,Time of Day,Location Name,Address,Type,Time Spent
0,9:00 AM - 12:00 PM,Indianapolis Zoo,"1200 W Washington St, Indianapolis, IN 46222-4500",Zoos • Open now •,3 hours
1,12:00 PM - 1:00 PM,White River State Park,"7840 W 56th St, Indianapolis, IN 46254-9706",Parks • Gardens • DownTown • Open now •,1 hour
2,1:00 PM - 3:00 PM,Benjamin Harrison Presidential Site,"1230 N Delaware St, Indianapolis, IN 46202-2531",Historic Sites • Open now •,2 hours
3,3:00 PM - 4:00 PM,Garfield Park Conservatory & Sunken Garden,"650 W Washington St, Indianapolis, IN 46204-2725",Gardens • Open now •,1 hour
4,4:00 PM- 6:00 PM,Indiana Repertory Theatre,"140 W Washington St, Indianapolis, IN 46204-3465",Theaters • DownTown • Open now •,2 hours
5,6:00 PM - 9:00 PM,The Fashion Mall at Keystone,"8702 Keystone Xing, Indianapolis, IN 46240-7621",Shopping Malls • Open now •,3 hours


In [16]:
# Save the itinerary to its own file, itinerary.csv: the header row is included
# (the pandas default) and the row index is left out.
itinerary.to_csv('itinerary.csv', index=False)