In [920]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

In [933]:
def download_wiki(title, timeout=30):
    """Fetch the HTML of an English-Wikipedia article.

    Parameters
    ----------
    title : str
        Article title as it appears after ``/wiki/`` in the URL
        (e.g. ``'List_of_United_States_cities_by_population'``).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block forever on a stalled connection.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never handed to the parser as if it were the article.
    """
    url = 'https://en.wikipedia.org/wiki/' + title
    response = requests.get(url, timeout=timeout)
    # Fail loudly here rather than later with a confusing parsing error.
    response.raise_for_status()
    return response.text

In [942]:
def extract_table(page):
    """Parse the first ``wikitable sortable`` table of an HTML page.

    Rows are collected starting from the first row that contains a ``<th>``
    (treated as the header); every row from there on that contains a ``<td>``
    contributes the text of its ``<td>`` cells, with newlines removed.
    The header text itself is discarded and the fixed names in ``table_cols``
    are used instead.

    Parameters
    ----------
    page : str
        HTML source of the page.

    Returns
    -------
    pandas.DataFrame
        One row per table row, all values as strings, columns named by
        ``table_cols``.

    Raises
    ------
    ValueError
        If the page has no matching table or the table has no header row.
    """
    soup = BeautifulSoup(page, 'html5lib')
    table_elm = soup.find('table', {'class': 'wikitable sortable'})
    if table_elm is None:
        raise ValueError("no <table class='wikitable sortable'> found in page")

    header_found = False
    rows = []
    for tr in table_elm.tbody.find_all('tr'):
        if not header_found and tr.th is not None:
            header_found = True
        # Checked on the same iteration on purpose: a row is data as soon as
        # the header has been seen and it carries at least one <td>.
        if header_found and tr.td is not None:
            rows.append([cell.text.replace('\n', '') for cell in tr.find_all('td')])

    if not header_found:
        raise ValueError('table has no header row (no <th> cells found)')

    # NOTE(review): clean() parses the pop_density__* columns as plain decimal
    # numbers and the land_area__* columns with a '<digits>/' pattern, which
    # suggests these two pairs of names may be swapped relative to the page's
    # actual column order -- confirm against the live table before relying on
    # the names.
    table_cols = ['pop_rank', 'city', 'state', 'pop_estimate', 'census',
                  'pop_delta', 'pop_density__mi', 'pop_density__km2',
                  'land_area__mi', 'land_area__km2', 'location']
    return pd.DataFrame(rows, columns=table_cols)

In [943]:
def clean(df):
    """Convert the raw string columns of the scraped city table to typed values.

    The input frame is left untouched; a cleaned copy is returned, so the
    function can be re-run on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of strings with the columns ``pop_rank``, ``city``, ``state``,
        ``pop_estimate``, ``census``, ``pop_delta``, ``pop_density__mi``,
        ``pop_density__km2``, ``land_area__mi``, ``land_area__km2`` and
        ``location``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` where
        * ``location`` is replaced by string columns ``lat`` and ``long``
          (the two numbers around ``'; '``),
        * ``pop_rank``, ``pop_estimate``, ``census`` and ``land_area__*`` are
          integers,
        * ``pop_delta`` is a fraction (percentage / 100),
        * ``pop_density__*`` are floats,
        * bracketed footnote markers are removed from ``city`` and ``state``.

    Raises
    ------
    ValueError
        From ``astype(int)`` when an integer column contains a value that
        cannot be parsed (including rows where the pattern does not match).
    """
    out = df.copy()

    def _strip_commas(col):
        # Thousands separators; literal match so no regex semantics apply.
        return col.str.replace(',', '', regex=False)

    def _ascii_minus(col):
        # U+2212 (typographic minus) would not be matched by '[+-]' below.
        return col.str.replace('\u2212', '-', regex=False)

    coords = _ascii_minus(out.pop('location')).str.extract(
        r'([-+]?\d{0,3}\.\d{1,4})\; ([-+]?\d{0,3}\.\d{1,4})', expand=True)
    out['lat'] = coords[0]
    out['long'] = coords[1]

    out['pop_rank'] = out['pop_rank'].astype(int)

    # `regex` is passed explicitly everywhere: its default differs between
    # pandas versions, and with regex=False these patterns would be searched
    # for literally and nothing would be cleaned.
    # Each footnote marker is removed on its own instead of greedily deleting
    # everything between the first '[' and the last ']'.
    footnote = r'\[[^\]]+\]'
    for name in ('city', 'state'):
        out[name] = out[name].str.replace(footnote, '', regex=True)

    for name in ('pop_estimate', 'census'):
        out[name] = _strip_commas(out[name]).astype(int)

    delta = _strip_commas(_ascii_minus(out['pop_delta']))
    out['pop_delta'] = (
        delta.str.extract(r'([+-].+)%', expand=False)
        .str.replace('+', '', regex=False)
        .astype(float) / 100
    )

    # NOTE(review): the pop_density__* columns are parsed as bare decimals and
    # the land_area__* columns as '<digits>/', which looks like the two pairs
    # of names may be swapped relative to what the cells hold -- confirm
    # against the source table. Names are kept as-is so callers are unaffected.
    for name in ('pop_density__mi', 'pop_density__km2'):
        out[name] = (
            _strip_commas(out[name])
            .str.extract(r'(\d+\.\d+)', expand=False)
            .astype(float)
        )
    for name in ('land_area__mi', 'land_area__km2'):
        out[name] = (
            _strip_commas(out[name])
            .str.extract(r'(\d+)\/', expand=False)
            .astype(int)
        )
    return out

In [944]:
def population_data():
    """Download, parse and clean the US cities-by-population table.

    Runs the three pipeline stages in order: fetch the Wikipedia article
    'List_of_United_States_cities_by_population', extract its table, and
    return the result of ``clean`` on that table.
    """
    article = 'List_of_United_States_cities_by_population'
    return clean(extract_table(download_wiki(article)))

In [945]:
# Build the cleaned city-population table; this performs a live HTTP request
# to Wikipedia via download_wiki().
pop_df = population_data()