## Web Scraping TripAdvisor & Population Density data
TripAdvisor: https://www.tripadvisor.com/Attractions-g30196-Activities-c  
Population Density: http://zipatlas.com/us/tx/austin/zip-code-comparison/population-density.htm  
   
We want to provide more objective features for the restaurant data, so we scrape these two websites and organize the results by zip code.

In [1]:
import requests                 
from bs4 import BeautifulSoup
import pandas as pd
import json
import re
import numpy as np
from collections import Counter
import csv

### TripAdvisor
These are the attraction categories we focus on from TripAdvisor.com  

Nature & Parks in Austin:
https://www.tripadvisor.com/Attractions-g30196-Activities-c57-Austin_Texas.html  
Museums:
https://www.tripadvisor.com/Attractions-g30196-Activities-c49-Austin_Texas.html  
Shopping:
https://www.tripadvisor.com/Attractions-g30196-Activities-c26-Austin_Texas.html  
Landmarks:
https://www.tripadvisor.com/Attractions-g30196-Activities-c47-Austin_Texas.html  


In [2]:
# Accumulates the URL of every TripAdvisor listing page to scrape;
# get_all_pages() appends to this list in place.
full_pages = []

In [3]:
def get_all_pages(full_pages, index, max_num):
    """Append the listing-page URLs of one attraction category to ``full_pages``.

    One URL is generated for every offset 0, 30, 60, ... strictly below
    ``max_num``; the offset is placed in the ``-oa<offset>-`` URL segment.

    Args:
        full_pages: list that receives the URLs (mutated in place).
        index: category id placed right after ``Activities-c`` in the URL.
        max_num: exclusive upper bound for the generated offsets.
    """
    category_prefix = (
        "https://www.tripadvisor.com/Attractions-g30196-Activities-c" + str(index)
    )
    offsets = np.arange(0, max_num, 30).tolist()
    full_pages.extend(
        category_prefix + "-oa" + str(offset) + "-Austin_Texas.html"
        for offset in offsets
    )

In [4]:
# Museums (category c49): listing pages at offsets 0, 30 and 60
get_all_pages(full_pages,49,63)

In [5]:
# Shopping (category c26): listing pages at offsets 0 through 120
get_all_pages(full_pages,26,150)

In [6]:
# Landmarks (category c47): listing pages at offsets 0 through 90
get_all_pages(full_pages,47,97)

In [7]:
# Nature & Parks (category c57): listing pages at offsets 0, 30 and 60
get_all_pages(full_pages,57,82)
# display every listing-page URL collected so far
full_pages

['https://www.tripadvisor.com/Attractions-g30196-Activities-c49-oa0-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c49-oa30-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c49-oa60-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c26-oa0-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c26-oa30-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c26-oa60-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c26-oa90-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c26-oa120-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c47-oa0-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c47-oa30-Austin_Texas.html',
 'https://www.tripadvisor.com/Attractions-g30196-Activities-c47-oa60-Austin_Texas.html',
 'https://www.tripadvis

In [8]:
# Collect the detail-page link of every attraction shown on the listing pages.
# The zip code is read from those detail pages in the next step.
sub_lst=[]
for link in full_pages:
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would hang the whole crawl; with it the request raises instead.
    response = requests.get(link, headers={'User-Agent': "serana/1.0"}, timeout=30)
    soup = BeautifulSoup(response.text, "html5lib")
    # NOTE(review): this class string looks auto-generated by the site and may
    # change; when it no longer matches, no links are collected for the page.
    anchors = soup.find_all('a', {'class': "FmrIP _R w _Z P0 M0 Gm ddFHE"})
    # hrefs on the page are site-relative, so prepend the host
    sub_lst.extend(
        "https://www.tripadvisor.com" + anchor['href']
        for anchor in anchors
        if anchor.has_attr('href')
    )
len(sub_lst)

391

In [9]:
# Extract the zip code of every attraction from its detail page.
# The 5-digit value that follows "postalCode": inside the page's JSON-LD
# <script> blocks is captured directly by the group in the pattern.
postal_code_pattern = re.compile(r'"postalCode":["]([0-9][0-9][0-9][0-9][0-9])')
zipcode_lst=[]
for u in sub_lst:
    # Without a timeout requests waits indefinitely, so a single stalled
    # connection would hang the whole crawl; with it the request raises instead.
    response = requests.get(u, headers={'User-Agent': "serana/2.0"}, timeout=30)
    soup = BeautifulSoup(response.text, "html5lib")
    ld_json_scripts = soup.find_all('script', {'type':"application/ld+json"})
    # a page contributes one entry per "postalCode" occurrence (possibly none)
    zipcode_lst.extend(postal_code_pattern.findall(str(ld_json_scripts)))

# show the number of attractions in the neighborhood of each zip code
Counter(zipcode_lst)

Counter({'78705': 17,
         '78701': 70,
         '78723': 10,
         '78712': 13,
         '78703': 24,
         '78704': 51,
         '78751': 10,
         '78754': 2,
         '78702': 26,
         '78759': 11,
         '78744': 6,
         '78746': 19,
         '78753': 3,
         '78758': 6,
         '78757': 6,
         '78756': 12,
         '78731': 11,
         '78721': 4,
         '78748': 2,
         '78752': 6,
         '78724': 3,
         '78719': 10,
         '78745': 10,
         '78741': 2,
         '78722': 1,
         '78737': 6,
         '78734': 1,
         '78732': 5,
         '78617': 2,
         '78738': 1,
         '78739': 1,
         '78750': 3,
         '78736': 1,
         '78663': 1,
         '78730': 4,
         '78733': 1,
         '78725': 1,
         '78798': 1,
         '78749': 2,
         '78735': 2,
         '78726': 2,
         '78729': 1,
         '78717': 1})

In [10]:
# plain dict mapping each zip code to its number of scraped attractions
attractions_dic=dict(Counter(zipcode_lst))

In [11]:
# write the number of attractions per zip code into csv, one "zip,count" row each
# newline='' hands line-ending control to the csv module; without it every
# row is followed by an extra blank line on Windows
with open("num_attractions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(attractions_dic.items())

### Population Density

In [2]:
# zipatlas page comparing population density across the Austin zip codes
url = "http://zipatlas.com/us/tx/austin/zip-code-comparison/population-density.htm"

In [3]:
# Download the page once (the timeout keeps a stalled connection from hanging
# the notebook) and parse it. `page` stays available as an alias of `response`.
response = requests.get(url, timeout=30)
page = response
soup = BeautifulSoup(page.content, "html.parser")
# the data table is located by its rules="all" attribute
results = soup.find('table', rules = "all")
if results is None:
    # soup.find returns None when nothing matches; fail here with a clear
    # message instead of an AttributeError in a later cell
    raise ValueError("population density table not found at " + url)

In [4]:
# column titles: the text of every header cell of the table
headers = [cell.text for cell in results.find_all('td', class_ = "report_header")]
headers

['#',
 'Zip Code',
 'Location',
 'City',
 'Population',
 'People / Sq. Mile',
 'National Rank']

In [5]:
# zip codes: the text of every link inside the centre-aligned data cells
zip_code = []
for cell in results.find_all('td', class_ = "report_data", align = "center"):
    zip_code.extend(anchor.text for anchor in cell.find_all('a'))

In [6]:
# population density: the bold text inside the right-aligned data cells
population_density = []
for cell in results.find_all('td', class_ = "report_data", align = "right"):
    population_density.extend(bold.text for bold in cell.find_all('b'))

In [7]:
# city names: the text of every left-aligned data cell
city = [
    cell.text
    for cell in results.find_all('td', class_ = "report_data", align = "left")
]

In [8]:
# get other information
# Each right-aligned data cell is assigned to one of four lists purely from the
# shape of its text. The branch order matters: the first matching rule wins.
index = []
location = []
mark = '#'
rank = []
population = []
for i in results.find_all('td', class_ = "report_data", align = "right"):
    row_data = i.text
    # Skip cells whose text is one of the density values collected earlier.
    # NOTE(review): a non-density cell whose text happens to equal a density
    # string is skipped as well.
    if row_data not in population_density:
        # 2- or 3-character text is taken as the row number.
        # '625' is excluded by hand — presumably a 3-character population
        # value that would otherwise be misfiled; TODO confirm.
        if len(row_data) == 2 or (len(row_data) == 3 and row_data!= '625'):
            index.append(row_data)
        # Exactly 21 characters is taken as the location
        # (presumably a "lat, long" pair of that length — TODO confirm).
        elif len(row_data) == 21:
            location.append(row_data)
        # Text containing '#' is taken as the national rank.
        elif mark in row_data:
            rank.append(row_data)
        # Anything left is the population, unless the same text was already
        # stored in one of the three lists above.
        elif row_data not in index and row_data not in location and row_data not in rank:
            population.append(row_data)

In [9]:
# write the population density table to csv: header row first, then one row
# per zip code assembled from the parallel column lists
columns = (index, zip_code, location, city, population, population_density, rank)
rows = zip(*columns)
with open('population_density.csv','w',newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(headers)
    for record in rows:
        csv_out.writerow(record)