In [1]:
#Libraries
import pandas as pd
import numpy as np
import geopandas as gdp
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup
import json
from pandas.io.json import json_normalize

from datetime import date
import re



In [2]:
def getwikitable(ext):
    response = requests.get(url=ext)
    soup = BeautifulSoup(response.content, 'html.parser')
    tab = soup.find("table")
    return tab

In [3]:
def getchildwiki(base, ext):
    loc = base + ext
    response = requests.get(url=loc)
    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        ctab = soup.find('table', class_='infobox geography vcard')
        cbod = ctab.find('tbody')

        rows = cbod.find_all('tr')

        stuff = []
        heads = []
        for r in rows:
            cols = r.find_all('td')
            cols = [elem.text.strip() for elem in cols]
            stuff.append([elem for elem in cols if elem])

            cols = r.find_all('th')
            cols = [elem.text.strip() for elem in cols]
            heads.append([elem for elem in cols if elem])

        Data = []
        for x in range(0, len(stuff)):
            if len(stuff[x])>0:
                a=stuff[x][0]
            else:
                a="NaN"

            if len(heads[x])>0:
                b=heads[x][0]
            else:
                b="NaN"

            Data.append([b,a])
    except:
        Data = 'NaN'

    return Data

In [4]:
def getchildgeo(base, ext):
    loc = base + ext
    response = requests.get(url=loc)
    soup = BeautifulSoup(response.content, 'html.parser')
    ctab = soup.find('table', class_='infobox geography vcard')

    geo = ctab.find('span', class_='geo')
    geo = geo.string.split("; ")
    return geo

In [5]:
def rawnum(numstr):

    for x in range(0, len(numstr)-1):
        if numstr[x]=="[":
            numstr = numstr[0:x]
            break
            
    for x in range(len(numstr)-1, -1, -1):
        if not numstr[x].isnumeric():
            if numstr[x]==".":
                continue
            else:
                numstr = numstr.replace(numstr[x],"")
    
    return numstr

In [6]:
def getwikiinc(Data):
    inc = 0
    for x in range(0, len(Data)):
        if str(Data[x][0]).find('income') >=0:
            inc = Data[x][1]
            inc = rawnum(inc)
            break
    
    return inc

In [7]:
def parsewikihref(tab):
    lst = []
    for l in tab.find_all("a"):
        urll = l.get("href","")
        lst.append(urll)
    return lst

In [8]:
#Get List of wiki pages to pull from

ext = "https://en.wikipedia.org/wiki/Community_areas_in_Chicago"
tab = getwikitable(ext)
hlst = parsewikihref(tab)
hlst = list(set(hlst))
print(len(hlst))

83


In [9]:
hlst = [x for x in hlst if not x.startswith('#') and not x.startswith('https')]

In [10]:
print(len(hlst))

77


In [11]:
base = "https://en.wikipedia.org"
dset = []
for x in hlst:
    print(x)
    cw = getchildwiki(base, x)
    try:
        geo = getchildgeo(base, x)
    except:
        geo = ['NaN','NaN']
    try:
        inc = getwikiinc(cw)
    except:
        inc = 'NaN'
    
    line = [cw[0][0], inc, geo[0], geo[1]]
    dset.append(line)

dset

/wiki/Washington_Heights,_Chicago
/wiki/Greater_Grand_Crossing,_Chicago
/wiki/East_Side,_Chicago
/wiki/Forest_Glen,_Chicago
/wiki/Ashburn,_Chicago
/wiki/North_Center,_Chicago
/wiki/Montclare,_Chicago
/wiki/Lake_View,_Chicago
/wiki/Avalon_Park,_Chicago
/wiki/Riverdale,_Chicago
/wiki/Near_South_Side,_Chicago
/wiki/Rogers_Park,_Chicago
/wiki/South_Chicago,_Chicago
/wiki/Humboldt_Park,_Chicago
/wiki/West_Elsdon,_Chicago
/wiki/Woodlawn,_Chicago
/wiki/West_Lawn,_Chicago
/wiki/Near_West_Side,_Chicago
/wiki/O%27Hare,_Chicago
/wiki/West_Ridge,_Chicago
/wiki/Grand_Boulevard,_Chicago
/wiki/Douglas,_Chicago
/wiki/Chicago_Lawn,_Chicago
/wiki/West_Pullman,_Chicago
/wiki/West_Englewood,_Chicago
/wiki/Fuller_Park,_Chicago
/wiki/Portage_Park,_Chicago
/wiki/Clearing,_Chicago
/wiki/Lower_West_Side,_Chicago
/wiki/Gage_Park,_Chicago
/wiki/South_Deering,_Chicago
/wiki/Englewood,_Chicago
/wiki/Uptown,_Chicago
/wiki/Garfield_Ridge,_Chicago
/wiki/Armour_Square,_Chicago
/wiki/Archer_Heights,_Chicago
/wiki/Near_

[['Washington Heights', 0, '41.70383', '-87.65367'],
 ['Greater Grand Crossing', '26515', '41.7600', '-87.6100'],
 ['East Side', '43421', '41.7000', '-87.5600'],
 ['Forest Glen', '101559', '41.983', '-87.750'],
 ['Ashburn', '63573', '41.7500', '-87.7100'],
 ['North Center', '115756', '41.950', '-87.6800'],
 ['Montclare', '43015', '41.9300', '-87.800'],
 ['Lakeview', '89276', '41.94350', '-87.65417'],
 ['Avalon Park', '41531', '41.750', '-87.5900'],
 ['Riverdale', '13518', '41.6600', '-87.6100'],
 ['Near South Side', '100720', '41.85778', '-87.62389'],
 ['Rogers Park', '37223', '42.0100', '-87.6700'],
 ['South Chicago', '28504', '41.7400', '-87.550'],
 ['Humboldt Park', 0, '41.8800', '-87.700'],
 ['West Elsdon', '55380', '41.7900', '-87.7200'],
 ['Woodlawn', '26415', '41.7800', '-87.600'],
 ['West Lawn', '50384', '41.7700', '-87.7200'],
 ['Near West Side', '83575', '41.8700', '-87.6700'],
 ["O'Hare", '49295', '42.0000', '-87.9200'],
 ['West Ridge', '52039', '42.000', '-87.6900'],
 ['Gra

In [12]:
df = pd.DataFrame(dset)
df.shape

(77, 4)

In [13]:
df.columns = ['Geology', 'Income', 'Lat', 'Long']

In [15]:
#df.to_csv('ChiCommunityAreas.csv', index=False)