In [1]:
import os
import time
import requests
import sys
import re
from bs4 import BeautifulSoup
import csv
from glob import glob

import pandas as pd
import numpy as np

In [2]:
# Source index page: https://www.walkscore.com/cities-and-neighborhoods/states/
# Fetch it and collect the site-relative link of every state page listed there.

res = requests.get(
    "https://www.walkscore.com/cities-and-neighborhoods/states/",
    timeout=30,  # never hang the notebook on a stalled connection
)
# Fail here rather than parse an error page as if it were the index.
res.raise_for_status()
soup = BeautifulSoup(res.text, features="html.parser")

state_table = soup.find(class_="state-list")
if state_table is None:
    raise RuntimeError('no element with class "state-list" found on the states index page')

# href=True skips anchors that carry no href attribute; those would otherwise
# put None into the list and break the URL concatenation in the download step.
links = [link["href"] for link in state_table.find_all("a", href=True)]

def get_html(links):
    """Download every linked page and save it under ``data/data_html``.

    Args:
        links: iterable of site-relative paths. Each one is appended to
            ``https://www.walkscore.com`` to build the URL and is also used as
            the tail of the output file name, ``data/data_html{link}.html``.

    Raises:
        requests.HTTPError: if a page answers with an error status, so an
            error page is never stored as if it were data.
        requests.Timeout: if the server does not answer within 30 seconds.

    After each download the function sleeps for ten times the measured request
    duration to keep the load on the remote server low.
    """
    # Create the output directory once; exist_ok makes re-running the cell safe.
    os.makedirs('data/data_html', exist_ok=True)

    for l in links:
        url = "https://www.walkscore.com" + l

        # A monotonic clock cannot go backwards, so the measured delay is never
        # negative (time.sleep raises ValueError for a negative argument).
        t0 = time.monotonic()
        response = requests.get(url, timeout=30)
        response_delay = time.monotonic() - t0
        response.raise_for_status()

        with open(f'data/data_html{l}.html', "wb") as output:
            output.write(response.text.encode("utf-8"))

        sys.stdout.flush()

        # wait 10x longer than it took them to respond
        time.sleep(10 * response_delay)
            

# Measure how long the whole download takes (this includes the sleeps that
# get_html inserts between requests).
# perf_counter is the clock intended for timing intervals; unlike time.time it
# is not affected by adjustments of the system clock.
start_time = time.perf_counter()
get_html(links)
end_time = time.perf_counter()
print(f"Time taken {end_time-start_time}")

Time taken 124.08236622810364


In [20]:
def met_data(state):
    """Parse one saved state page into rows of text values.

    Reads ``data/data_html/{state}.html``, collects the text of every ``<td>``
    cell whose class is not ``zipcode`` (with newlines removed) and groups the
    cells, in document order, into rows of five values.

    Args:
        state: name of the saved page without its extension (e.g. ``"AK"``);
            it is also placed first in every returned row.

    Returns:
        A list of six-item lists ``[state, cell_1, ..., cell_5]``. Cell values
        are kept as strings. Trailing cells that do not fill a complete row
        are ignored.
    """
    # we have 5 columns (excluding zipcode)
    columns_per_row = 5

    # The context manager closes the file handle even if reading fails.
    with open(f"data/data_html/{state}.html", "rb") as file_html:
        plain_text = file_html.read()

    soup = BeautifulSoup(plain_text, "lxml")
    cells = [
        re.sub(r'\n+', "", td.get_text())
        for td in soup.find_all("td", class_=lambda x: x != "zipcode")
    ]

    # Emit complete rows only: rounding the row count up would index past the
    # end of the cell list when the cell count is not a multiple of five.
    usable = len(cells) - len(cells) % columns_per_row
    return [
        [state] + cells[start:start + columns_per_row]
        for start in range(0, usable, columns_per_row)
    ]

In [29]:
# Try the parser on a single saved page ("AK") and keep the rows for inspection.
f = met_data("AK")

In [30]:
# Print each parsed row on its own line to check the result by eye.
for row in f:
    print(row)

['AK', 'Anchorage', '28', '21', '52', '291,826']
['AK', 'Fairbanks', '33', '24', '57', '31,535']
['AK', 'Juneau', '22', '18', '35', '31,275']
['AK', 'Badger', '2', '--', '35', '19,482']


In [31]:
# Parse every downloaded state page and store the result as one CSV file per
# state in data/cleaned_data.
states = ["AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL", "IN", "KS", 
          "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", 
          "NY", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY"]

# exist_ok makes the cell safe to re-run without a separate existence check.
os.makedirs("data/cleaned_data", exist_ok=True)
for state in states:
    temp = met_data(state)
    # newline='' is required by the csv module: without it the writer's own
    # "\r\n" line endings are translated again on Windows, giving blank rows.
    # The encoding is left at the platform default, as before.
    with open('data/cleaned_data/cleaned_' + str(state) + '.csv', 'w', newline='') as csvfile:
        wr = csv.writer(csvfile, dialect='excel')
        wr.writerow(['State', 'City', 'Walk Score', 'Transit Score', 'Bike Score', 'Population'])
        wr.writerows(temp)

In [36]:
# Gather the per-state CSV files (sorted by path) and stack them into a single
# DataFrame with a fresh running index.
# NOTE(review): the files are decoded as latin-1 here, while the step that
# writes them does not set an encoding - confirm the two agree for non-ASCII
# city names.
all_files = sorted(glob("data/cleaned_data/cleaned_*.csv"))
per_state_frames = [pd.read_csv(file, encoding='latin-1') for file in all_files]
real_data = pd.concat(per_state_frames, ignore_index=True)

In [37]:
# Persist the combined table; with to_csv's defaults the DataFrame index is
# written as the first, unnamed column.
real_data.to_csv("data/walkability.csv")

In [38]:
# Reload the combined file; index_col=0 turns the first column back into the
# index. The last expression shows the first five rows as a quick check.
df = pd.read_csv("data/walkability.csv", index_col=0)
df.head()

Unnamed: 0,State,City,Walk Score,Transit Score,Bike Score,Population
0,AK,Anchorage,28,21,52,291826
1,AK,Fairbanks,33,24,57,31535
2,AK,Juneau,22,18,35,31275
3,AK,Badger,2,--,35,19482
4,AL,Birmingham (the largest city in Alabama),35,25,31,212237


In [39]:
# Dimensions of the combined table as (rows, columns).
print(df.shape)

(2500, 6)


In [40]:
# Count missing values per column.
# NOTE(review): only real NaN values are counted; the "--" placeholder visible
# in the Transit Score column of the preview is a plain string, so a zero here
# does not mean that every score is present.
df.isnull().sum()

State            0
City             0
Walk Score       0
Transit Score    0
Bike Score       0
Population       0
dtype: int64