In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import time
import numpy as np

In [2]:
# Driver setup
#
# The chromedriver location can be overridden with the CHROMEDRIVER_PATH
# environment variable so the notebook is not tied to one machine's layout;
# the original macOS path remains the default.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")  # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
# Single browser session that is handed to the Selenium-based scrapers.
driver = webdriver.Chrome(chromedriver)

# Lincoln Park

In [3]:
def get_attr(node, attr):
    """Safely read the text or an attribute of a parsed HTML node.

    Args:
        node: An element exposing ``.text`` and ``.get()`` (for example the
            result of a ``find`` call), or ``None`` when nothing was found.
        attr: Name of the attribute to read. The special value ``'text'``
            returns the node's text instead of an HTML attribute.

    Returns:
        ``None`` when ``node`` is ``None``; ``node.text`` when ``attr`` is
        ``'text'``; otherwise ``node.get(attr)``.
    """
    # Identity check: ``node == None`` would dispatch to the node's own
    # ``__eq__`` and can misreport a real node as missing.
    if node is None:
        return None
    if attr == 'text':
        return node.text
    return node.get(attr)

def scrape_page(source_text):
    """Extract one row of contact details per member on a directory page.

    Args:
        source_text: HTML source of a directory page (the caller passes the
            Selenium ``page_source`` of the Lincoln Park chamber directory).

    Returns:
        A list of ``[name, chamber_profile_url, phone, email, website, fb,
        twitter, insta]`` lists, one per ``<tr class="list member">`` inside
        the ``div.member_directory`` container. Any field whose element is
        missing is ``None``. An empty list is returned when the container
        itself is not present.
    """
    soup = BeautifulSoup(source_text)

    directory = soup.find('div', class_='member_directory')
    if directory is None:
        # Nothing to parse (e.g. the page has not rendered the directory).
        return []

    data = []
    for row in directory.find_all('tr', class_='list member'):
        # Look the title link up once; tolerate rows without a title block.
        title = row.find('div', class_='title')
        title_link = title.find('a') if title is not None else None
        name = get_attr(title_link, 'text')
        chamber_profile_url = get_attr(title_link, 'href')

        info_table = row.find('table', class_='info')
        if info_table is None:
            # Keep the row so the member is not silently dropped.
            data.append([name, chamber_profile_url, None, None, None, None, None, None])
            continue

        # The phone is taken from the first cell of the info table.
        phone = get_attr(info_table.find('td'), 'text')
        email = get_attr(info_table.find('a', href=re.compile("mailto:")), 'href')
        website = get_attr(info_table.find('a', target='_blank'), 'href')
        fb = get_attr(info_table.find('a', class_='facebook'), 'href')
        twitter = get_attr(info_table.find('a', class_='twitter'), 'href')
        insta = get_attr(info_table.find('a', class_='instagram'), 'href')

        data.append([name, chamber_profile_url, phone, email, website, fb, twitter, insta])

    return data

In [44]:
def scrape_lincoln_park(drv):
    """Walk the Lincoln Park chamber directory and scrape every page.

    Args:
        drv: A Selenium WebDriver. It is navigated to the directory URL and
            then advanced page by page via the ``span.right-arrow`` control.

    Returns:
        A list with one entry per visited page; each entry is the list of
        rows returned by ``scrape_page`` for that page.

    Notes:
        Paging stops the first time the "next" control cannot be found or
        clicked. ``KeyboardInterrupt`` is deliberately not treated as "no
        more pages" so the run can be aborted.
        ``find_element_by_css_selector`` is the Selenium 3 API (removed in
        Selenium 4.3); it is kept to match the rest of this notebook.
    """
    drv.get('https://www.lincolnparkchamber.com/directory/')
    # NOTE(review): presumably switches the directory to the list layout
    # that scrape_page expects — confirm against the live page.
    drv.find_element_by_css_selector('button.list-display').click()
    time.sleep(3)

    data = []
    while True:
        data.append(scrape_page(drv.page_source))
        try:
            drv.find_element_by_css_selector('span.right-arrow').click()
        except Exception:
            # No usable "next" control: assume this was the last page.
            break
        # Give the next page time to render before reading page_source.
        time.sleep(3)

    return data

In [45]:
results = scrape_lincoln_park(driver)

In [56]:
# scrape_lincoln_park returns one list of rows per page; merge them into a
# single flat list of rows.
flattened = [row for page in results for row in page]

In [58]:
len(flattened)

485

In [63]:
lincoln_park = pd.DataFrame(flattened, columns=['name', 'chamber_profile_url', 'phone', 'email', 'website', 'fb', 'twitter', 'insta'])


In [71]:
# Not the last expression of the cell, so this result is not displayed.
lincoln_park.phone.value_counts()
# Phone values equal to 'Email' are replaced with NaN in the following cell.
# Strip the 'mailto:' prefix from the scraped email hrefs.
lincoln_park.email = lincoln_park.email.str.replace('mailto:', '')

In [76]:
# A phone value of exactly 'Email' is treated as missing.
# NOTE(review): presumably scrape_page picked up an "Email" label as the first
# <td> text when a listing had no phone number — confirm.
lincoln_park.phone = lincoln_park.phone.replace(to_replace='Email', value=np.nan)

In [78]:
lincoln_park.to_csv('../data/processed/lincoln_park_chamber_scraped_0116.csv')

# Edgewater

In [5]:
def scrape_edgewater_directory(drv):
    """Collect the name and profile URL of every tile in the Edgewater directory.

    Args:
        drv: A Selenium WebDriver; it is navigated to the membership
            directory and scrolled to the bottom five times.

    Returns:
        A list of ``[business_name, profile_page]`` lists, one per
        ``div.article_box`` tile found in the page source.
    """
    drv.get('https://www.edgewater.org/membership-directory/')
    for _ in range(5):
        # Scroll the driver that was passed in, not the notebook-level global.
        drv.execute_script(
            "window.scrollTo(0, document.documentElement.scrollHeight);"  # Alternatively, document.body.scrollHeight
        )
    # NOTE(review): the pause happens once, after all five scrolls. If the
    # page loads tiles lazily, a pause after each scroll may be needed — confirm.
    time.sleep(3)
    soup = BeautifulSoup(drv.page_source)

    data = []
    for tile in soup.find_all('div', class_='article_box'):
        profile_page = tile.find('a').get('href')
        business_name = tile.find('div', class_='desc_wrapper').find('h4').text
        data.append([business_name, profile_page])

    return data

In [6]:
data = scrape_edgewater_directory(driver)

In [27]:
data

[['1134-36 W. Bryn Mawr LLC',
  'https://www.edgewater.org/businesses/1134-36-w-bryn-mawr-llc/'],
 ['11hundred Inc', 'https://www.edgewater.org/businesses/11hundred-inc/'],
 ['1350 W. Devon Ave.',
  'https://www.edgewater.org/businesses/1350-w-devon-ave/'],
 ['Aartwerk Studio LLC',
  'https://www.edgewater.org/businesses/aartwerk-studio-llc/'],
 ['About Face Theatre',
  'https://www.edgewater.org/businesses/about-face-theatre-2/'],
 ['Aegis Pest Management',
  'https://www.edgewater.org/businesses/aegis-pest-management/'],
 ['Alvino & Associates, Ltd.',
  'https://www.edgewater.org/businesses/alvino-associates-ltd/'],
 ['Andersonville Physical Therapy',
  'https://www.edgewater.org/businesses/andersonville-physical-therapy/'],
 ['Animal Accent', 'https://www.edgewater.org/businesses/animal-accent/'],
 ['Anna Held Florist',
  'https://www.edgewater.org/businesses/anna-held-florist/'],
 ['Anytime Fitness Chicago – North',
  'https://www.edgewater.org/businesses/anytime-fitness-chicago-no

In [24]:
def add_edgewater_contact_info(row, drv):
    """Load a business profile page and extract its contact details.

    Args:
        row: A two-item ``[name, url]`` sequence; only the URL is used.
        drv: A Selenium WebDriver used to load the profile page.

    Returns:
        ``[address, email, website, phone, fb, twitter]``. Each entry is
        ``None`` when the corresponding element is missing, and all six are
        ``None`` when the page has no ``div.business`` card.
    """
    _, url = row
    drv.get(url)
    soup = BeautifulSoup(drv.page_source)

    card = soup.find('div', class_='business')
    if card is None:
        # No contact card on this page: report every field as missing
        # instead of failing with an AttributeError.
        return [None] * 6

    address = get_attr(card.find('p', id='address'), 'text')
    email = get_attr(card.find('p', id='email'), 'text')

    # The first link inside the card is taken as the website.
    link = card.find('a')
    website = get_attr(link, 'href')
    fb = get_attr(card.find('a', attrs={'title': 'Facebook'}), 'href')
    twitter = get_attr(card.find('a', attrs={'title': 'Twitter'}), 'href')

    phone = None
    if link is not None:
        # Search the text of the link's parent element for a phone number
        # formatted like "(773) 555-0100".
        desc = get_attr(link.find_parent(), 'text')
        match = re.search(r'\(\d{3}\) \d{3}-\d{4}', desc or '')
        if match:
            phone = match.group(0)

    return [address, email, website, phone, fb, twitter]

In [28]:
# Visit every directory entry and append its six contact fields. A failure on
# one profile is reported and recorded as a row of missing values so the rest
# of the run can continue.
enriched_data = []

for row in data:
    try:
        new_data = add_edgewater_contact_info(row, driver)
    except Exception as exc:
        # `except Exception` (not a bare except) so that interrupting the
        # kernel still stops the loop instead of being swallowed.
        print(f'Failed to scrape {row}: {exc!r}')
        new_data = [None, None, None, None, None, None]
    else:
        print(new_data)
    enriched_data.append(row + new_data)
    # Throttle requests, after failures as well as successes.
    time.sleep(1)

['1134-36 W. Bryn Mawr Ave. Chicago, IL 60660', None, None, None, None, None]
['1210 W Granville\nChicago IL, 60660', 'david@11hundred.com', 'http://11hundred.com', '(213) 804-7600', None, None]
['1350 W. Devon Ave.\nChicago, IL\n60660', None, None, None, None, None]
['1141 W. Granville Ave.\nChicago, IL 60660', None, 'http://www.aartwerk.com/', '(773) 217-6604', 'https://www.facebook.com/AartwerkStudio/', 'https://twitter.com/AartwerkStudio']
['5252 N. Broadway \n(2nd Floor)\nChicago, IL 60640', None, 'http://aboutfacetheatre.com/', '(773) 784-8565', 'https://www.facebook.com/aboutfacetheatre/', 'https://twitter.com/aboutfacechi?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor']
['2141 W. North Ave.\nChicago, IL 60647', 'info@aegispest.com', 'http://aegispest.com/', '(773) 433-3827', 'https://www.facebook.com/aegisbbds/', 'https://www.facebook.com/ScouttheBedBugDog/']
['5820 N. Broadway\nChicago, IL 60660', None, 'http://www.alvinocpa.com', '(773) 561-9119', None, None]
['5414 N

['Edgewater, Chicago, IL', 'egachicago@gmail.com', 'https://edgeglen.com', None, 'https://www.facebook.com/edgeglen/', None]
['5358 N. Ashland Ave., Chicago, IL, 60640', None, 'http://www.edgewaterhistory.org', '(773) 506-4849', 'https://www.facebook.com/edgewaterhistory', None]
['1055 W. Bryn Mawr Ave.\nChicago IL, 60660', 'edgewatermexicancafe@gmail.com', 'http://www.edgewatermexicancafe.com', '(773) 944-0357', 'https://www.facebook.com/edgemexcafe/', None]
['1048 W. Bryn Mawr Ave.\nChicago, IL 60660', 'info@edgewaterplayhouse.com', 'https://edgewaterplayhouse.com/', '(773) 564-9963', None, None]
['1020 Bryn Mawr Ave.\nChicago IL, 60660', 'revcatheyepc@gmail.com', 'http://www.edgewaterpres.org', '(773) 561-4748', 'https://www.facebook.com/edgewater.presbyterian.church/', None]
['1130 W. Thorndale Ave. Chicago, IL, 60660', None, 'http://www.edgewaterworkbench.com/', '(773) 944-0650', 'https://www.facebook.com/edgeworkbench', 'https://twitter.com/edgeworkbench']
['548 W. Webster Ave.\n

['125 S. Wacker Drive, Suite 300 A\nChicago IL, 60606', 'kostopouloslawgroup@gmail.com', 'http://kostlaw.com', '(312) 883-4904', 'https://www.facebook.com/Kostopolous-Law-Group-745248992311530/', 'https://twitter.com/KostAttorneys']
['1208 W. Granville Ave.\nChicago, IL 60660', None, 'http://webcache.googleusercontent.com/search?q=cache:http://www.kwaifahacupuncture.com/', '(872) 216-3455', None, None]
['Lakewood and Balmoral\nChicago, Illinois', None, 'https://lakewoodbalmoral.nextdoor.com/news_feed/', '(312) 480-0781', 'https://www.facebook.com/Lakewood.Balmoral/timeline?ref=page_internal', None]
['6418 N. Greenview Ave., Chicago, IL, 60626', None, 'http://www.leatherarchives.org', '(773) 761-9200', 'https://www.facebook.com/leatherarchives', 'https://twitter.com/leatherarchives or @leatherarchives']
['6056 N. Broadway, Chicago, IL, 60660', None, 'http://www.lickitysplitchicago.com', '(773) 274-0830', 'https://www.facebook.com/LickitySplitChicago', None]
['5940 N. Sheridan Rd.\nChica

['5846 N. Broadway, Chicago, IL, 60660', None, 'http://www.rasdashenchicago.com', '(773) 506-9601', 'https://www.facebook.com/ras.dashen.9', 'https://twitter.com/rasdashenchi']
['6157 N. Clark St., Chicago, IL, 60660', None, 'http://www.raventheatre.com', '(773) 338-2177', 'https://www.facebook.com/RavenTheatre', 'https://twitter.com/raventheatre']
['5215 N. Ravenswood Ste 105, Chicago, IL, 60640', None, 'http://www.rennwellness.com/', '(773) 878-7330', None, None]
['Chicago, IL', None, 'http://www.rebirthphoto.com/', '(312) 291-1981', 'https://www.facebook.com/rebirthphotochicago', 'https://twitter.com/rebirthphoto']
['1044 W. Bryn Mawr Ave.,\nChicago, IL, 60660', None, 'http://www.redtwist.org', '(773) 728-7529', 'https://www.facebook.com/RedtwistTheatre', None]
['C/O Truman College\n1145 W. Wilson Avenue, (Room 1917)\nChicago IL, 60640', None, 'https://www.rescare.com/', '(773) 334-4747', None, None]
['1107 W. Berwyn Ave.\nChicago IL, 60640', None, 'http://facebook.com/rewiredcafes'

['6600 N Lincoln Ave #422\nLincolnwood, IL 60712', None, 'http://www.vladtodea.com/', '(773) 216-8516', 'https://www.facebook.com/real.eastate.vlad/', None]
['3601 W. Devon Ave., Chicago, IL, 60659', None, 'http://www.vranaschioros.com/', '(773) 478-3776', None, None]
['1 Northfield Plaza #300, Northfield, IL, 60093', None, 'http://wastemaster.com', '(773) 858-5642', 'https://www.facebook.com/WasteCostReduction', 'https://twitter.com/wasteservice']
['848 East Grand Ave. - Navy Pier', None, 'javascript:void(0);', None, None, None]
['2300 W Lawrence Avenue Chicago, IL 60625', None, 'https://www.wintrustbank.com/', '(773) 907-8100', None, None]


In [31]:
df = pd.DataFrame(enriched_data, columns=['name', 'chamber_profile_url', 'address', 'email', 'website', 'phone', 'fb', 'twitter'])

In [34]:
df.to_csv('../data/processed/edgewater_chamber_scraped_0116.csv')

# Wicker Park

In [37]:
def scrape_wicker():
    """Scrape all pages of the Wicker Park Bucktown alphabetical member lister.

    Builds the lister URL for each ``pos`` offset 0, 30, ..., 270 and
    concatenates, in order, the rows that ``scrape_wicker_directory_page``
    returns for each of them.

    Returns:
        A single flat list containing the rows of every page.
    """
    page_urls = [
        f'http://www.wickerparkbucktown.com/index.php?src=membership&srctype=membership_lister_alpha&pos={pos},30,296'
        for pos in range(0, 271, 30)
    ]

    collected = []
    for page_url in page_urls:
        collected += scrape_wicker_directory_page(page_url)
    return collected

In [54]:
def scrape_wicker_directory_page(url):
    """Scrape one lister page and the profile of every company linked from it.

    Args:
        url: Absolute URL of a member lister page.

    Returns:
        A list of rows, one per ``<a class="title">`` link on the page. Each
        row is ``[link text, profile href]`` followed by the fields returned
        by ``scrape_wicker_profile`` for that href.
    """
    rows = []
    # A timeout keeps a stalled connection from hanging the whole scrape.
    directory_request = requests.get(url, timeout=30)
    soup = BeautifulSoup(directory_request.text)
    for title_link in soup.find_all('a', class_='title'):
        company_profile_url = title_link.get('href')
        other_fields = scrape_wicker_profile(company_profile_url)
        rows.append([title_link.text, company_profile_url] + other_fields)
        # Pause between profile requests to avoid hammering the site.
        time.sleep(1)
    return rows
    
def scrape_wicker_profile(url):
    """Fetch a member profile page and extract its contact fields.

    Args:
        url: Profile path relative to ``http://www.wickerparkbucktown.com/``
            (the ``href`` taken from a lister page).

    Returns:
        ``[address, contact_name, contact_title, phone, email]`` holding the
        raw text of the matching elements, or ``None`` for any that are
        missing. The contact name is the text of the first ``<h2>``.
    """
    # A timeout keeps a stalled connection from hanging the whole scrape.
    profile_req = requests.get(f'http://www.wickerparkbucktown.com/{url}', timeout=30)
    profile_soup = BeautifulSoup(profile_req.text)

    address = get_attr(profile_soup.find('div', class_='address'), 'text')
    contact_title = get_attr(profile_soup.find('div', class_='jobTitle'), 'text')
    phone = get_attr(profile_soup.find('div', class_='phone'), 'text')
    email = get_attr(profile_soup.find('div', class_='email'), 'text')
    contact_name = get_attr(profile_soup.find('h2'), 'text')

    # The website div was previously read into an unused local; it is not
    # part of the returned row, so that lookup has been dropped.
    return [address, contact_name, contact_title, phone, email]

In [55]:
scrape_wicker_directory_page('http://www.wickerparkbucktown.com/index.php?src=membership&srctype=membership_lister_alpha&pos=0,30,296')

[['1572 N Milwaukee Ave Bldg Corp',
  'membership/members/1572-n-milwaukee-ave-bldg-corp/',
  '\nAddress:\n1570 N Milwaukee Ave Chicago, IL \n',
  'Strauss, Brian ',
  None,
  None,
  "Email: document.write( 'bjs1572' + '@' + 'gmail' + '.' + 'com' );"],
 ['3rd Generation Painting and Remodeling',
  'membership/expired-members/3rd-generation-painting-and-remodeling/',
  '\nAddress:\n4479 Lawn Ave. #52 Western Springs, IL 60558\n',
  'Kazimierski, Andre ',
  'Title: Owner',
  'Phone: (708) 680-6078',
  "Email: document.write( 'contact' + '@' + '3rdgenpainting' + '.' + 'com' );"],
 ['5411 Empanadas',
  'membership/members/5411-empanadas/',
  '\nAddress:\n \n',
  'Rodriguez, Amanda',
  'Title: Manager',
  'Phone: 7737271092',
  "Email: document.write( 'amanda' + '@' + '5411empanadas' + '.' + 'com' );"],
 ['606 Karate & Self-Defense',
  'membership/members/606-karate-self-defense/',
  '\nAddress:\n5467 S. Ridgewood Court Chicago, IL 60615\n',
  'Degnan, Kathleen',
  'Title: Owner',
  'Phone

In [56]:
results = scrape_wicker()

In [57]:
results

[['1572 N Milwaukee Ave Bldg Corp',
  'membership/members/1572-n-milwaukee-ave-bldg-corp/',
  '\nAddress:\n1570 N Milwaukee Ave Chicago, IL \n',
  'Strauss, Brian ',
  None,
  None,
  "Email: document.write( 'bjs1572' + '@' + 'gmail' + '.' + 'com' );"],
 ['3rd Generation Painting and Remodeling',
  'membership/expired-members/3rd-generation-painting-and-remodeling/',
  '\nAddress:\n4479 Lawn Ave. #52 Western Springs, IL 60558\n',
  'Kazimierski, Andre ',
  'Title: Owner',
  'Phone: (708) 680-6078',
  "Email: document.write( 'contact' + '@' + '3rdgenpainting' + '.' + 'com' );"],
 ['5411 Empanadas',
  'membership/members/5411-empanadas/',
  '\nAddress:\n \n',
  'Rodriguez, Amanda',
  'Title: Manager',
  'Phone: 7737271092',
  "Email: document.write( 'amanda' + '@' + '5411empanadas' + '.' + 'com' );"],
 ['606 Karate & Self-Defense',
  'membership/members/606-karate-self-defense/',
  '\nAddress:\n5467 S. Ridgewood Court Chicago, IL 60615\n',
  'Degnan, Kathleen',
  'Title: Owner',
  'Phone

In [58]:
len(results)

296

In [60]:
columns = ['company_name', 'profile_url', 'address', 'contact_name', 'contact_title', 'phone', 'email']

In [92]:
wicker = pd.DataFrame(results, columns = columns)

In [105]:

def clean_wicker_columns(df):
    """Tidy the raw Wicker Park profile fields of ``df`` in place.

    * ``address``, ``contact_title`` and ``phone``: drop the leading
      ``"Label:"`` prefix and surrounding whitespace. Missing or empty values
      become ``None``.
    * ``email``: remove the ``Email: document.write( ... );`` wrapper, the
      quote and ``+`` characters and all whitespace, leaving the plain
      address (e.g. ``bjs1572@gmail.com``).

    Args:
        df: DataFrame with ``address``, ``contact_title``, ``phone`` and
            ``email`` columns. It is modified in place.

    Returns:
        None.
    """
    def strip_label(value):
        # Missing values can arrive as None or NaN depending on the column
        # dtype; neither has .split(), so check for a real string.
        if not isinstance(value, str) or not value:
            return None
        # Split on the first colon only, so a value that itself contains a
        # colon (e.g. "Phone: 555-0100 ext: 12") is not truncated.
        return value.split(':', 1)[-1].strip()

    for column in ('address', 'contact_title', 'phone'):
        df[column] = df[column].map(strip_label)

    # Raw string with `\s` (the original had the typo `/s`, which left the
    # spaces in place). regex=True is explicit because pandas >= 2.0 treats
    # the pattern as a literal string by default.
    df['email'] = df['email'].str.replace(
        r"Email: document\.write\(|\+|'|\);|\s", '', regex=True
    )

In [106]:
clean_wicker_columns(wicker)

In [111]:
wicker.to_csv('../data/processed/wicker_chamber_scraped_0117.csv')