# Scraping stuff in bulk

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
import pandas as pd

In [4]:
df = pd.read_csv('08-classwork/restaurants.tsv', sep='\t')

In [6]:
df.head()

Unnamed: 0,categories,name,price,rating,url
0,Italian,La Piccola Cucina,$$,4.0,https://www.yelp.com/biz/la-piccola-cucina-new...
1,Italian,Pisticci,$$,4.0,https://www.yelp.com/biz/pisticci-new-york?osq...
2,Italian,Max Soha,$$,4.0,https://www.yelp.com/biz/max-soha-new-york?osq...
3,Italian,Arco Cafe,$$,4.5,https://www.yelp.com/biz/arco-cafe-new-york?os...
4,"Pizza, Italian",Mezzogiorno,$$,4.0,https://www.yelp.com/biz/mezzogiorno-new-york-...


## Functions

In [12]:
def get_price_range(yelp_range: str) -> str:
  replacement = {
    '$': 'cheap',
    '$$': 'normal',
    '$$$': 'expensive',
    '$$$$': 'insane'
  }

  return replacement[yelp_range]


In [14]:
df['price_range'] = df['price'].apply(get_price_range)
df.head()

Unnamed: 0,categories,name,price,rating,url,price_range
0,Italian,La Piccola Cucina,$$,4.0,https://www.yelp.com/biz/la-piccola-cucina-new...,normal
1,Italian,Pisticci,$$,4.0,https://www.yelp.com/biz/pisticci-new-york?osq...,normal
2,Italian,Max Soha,$$,4.0,https://www.yelp.com/biz/max-soha-new-york?osq...,normal
3,Italian,Arco Cafe,$$,4.5,https://www.yelp.com/biz/arco-cafe-new-york?os...,normal
4,"Pizza, Italian",Mezzogiorno,$$,4.0,https://www.yelp.com/biz/mezzogiorno-new-york-...,normal


In [15]:
def get_somas_opinion(restaurant):
  if restaurant['price'] == '$' and 'Seafood' not in restaurant['categories']:
    return 'love'
  else:
    return 'hate'


In [19]:
# in order to apply functions to the whole data frame based on its rows, 
# it needs to be run with 'axis=1'
df['somas_opinion'] = df.apply(get_somas_opinion, axis=1)

In [20]:
df.head()

Unnamed: 0,categories,name,price,rating,url,price_range,somas_opinion
0,Italian,La Piccola Cucina,$$,4.0,https://www.yelp.com/biz/la-piccola-cucina-new...,normal,hate
1,Italian,Pisticci,$$,4.0,https://www.yelp.com/biz/pisticci-new-york?osq...,normal,hate
2,Italian,Max Soha,$$,4.0,https://www.yelp.com/biz/max-soha-new-york?osq...,normal,hate
3,Italian,Arco Cafe,$$,4.5,https://www.yelp.com/biz/arco-cafe-new-york?os...,normal,hate
4,"Pizza, Italian",Mezzogiorno,$$,4.0,https://www.yelp.com/biz/mezzogiorno-new-york-...,normal,hate


## scrape stuff

In [None]:
import requests
from bs4 import BeautifulSoup


In [45]:
def get_address_info(row):
  raw_data = requests.get(row['url']).content
  soup = BeautifulSoup(raw_data, 'html.parser')
  raw_address = soup.find(class_='street-address').get_text()
  address = raw_address.strip()
  zipcode = raw_address.strip().split(' ')[-1]
  print(address)
  print(zipcode)
  return pd.Series({
    'zipcode': zipcode,
    'address': address
  })


In [46]:
new_df = df.apply(get_address_info, axis=1).join(df)
new_df

964 Amsterdam AveNew York, NY 10025
10025


964 Amsterdam AveNew York, NY 10025
10025


125 La Salle StNew York, NY 10027
10027


1274 Amsterdam AveNew York, NY 10027
10027


886 Amsterdam AveNew York, NY 10025
10025


2791 BroadwayNew York, NY 10025
10025


1034 Amsterdam AveNew York, NY 10025
10025


53 W 106 StNew York, NY 10025
10025


1600 Amsterdam AveNew York, NY 10031
10031


3143 BroadwayNew York, NY 10027
10027


1415 Second AveNew York, NY 10021
10021


Unnamed: 0,zipcode,address,categories,name,price,rating,url,price_range,somas_opinion
0,10025,"964 Amsterdam AveNew York, NY 10025",Italian,La Piccola Cucina,$$,4.0,https://www.yelp.com/biz/la-piccola-cucina-new...,normal,hate
1,10027,"125 La Salle StNew York, NY 10027",Italian,Pisticci,$$,4.0,https://www.yelp.com/biz/pisticci-new-york?osq...,normal,hate
2,10027,"1274 Amsterdam AveNew York, NY 10027",Italian,Max Soha,$$,4.0,https://www.yelp.com/biz/max-soha-new-york?osq...,normal,hate
3,10025,"886 Amsterdam AveNew York, NY 10025",Italian,Arco Cafe,$$,4.5,https://www.yelp.com/biz/arco-cafe-new-york?os...,normal,hate
4,10025,"2791 BroadwayNew York, NY 10025","Pizza, Italian",Mezzogiorno,$$,4.0,https://www.yelp.com/biz/mezzogiorno-new-york-...,normal,hate
5,10025,"1034 Amsterdam AveNew York, NY 10025",Italian,Tartina,$$,4.0,https://www.yelp.com/biz/tartina-new-york-4?os...,normal,hate
6,10025,"53 W 106 StNew York, NY 10025",Italian,Osteria 106,$$,4.5,https://www.yelp.com/biz/osteria-106-new-york?...,normal,hate
7,10031,"1600 Amsterdam AveNew York, NY 10031","Pizza, Italian, Bars",Fumo Pizza-Bar-Pasta,$$,4.0,https://www.yelp.com/biz/fumo-pizza-bar-pasta-...,normal,hate
8,10027,"3143 BroadwayNew York, NY 10027","Pizza, Italian",Bettolona,$$,4.0,https://www.yelp.com/biz/bettolona-new-york?os...,normal,hate
9,10021,"1415 Second AveNew York, NY 10021","Italian, Pasta Shops",Bigoi Venezia,$,4.5,https://www.yelp.com/biz/bigoi-venezia-new-yor...,cheap,love


In [47]:
new_df.to_csv('08-classwork/restaurants-with-zip.tsv', sep='\t', index=False)

## Trying to scrape with a single row

In [48]:
name = 'Burger & Lobster'
zipcode = '10011'
url = 'http://a816-restaurantinspection.nyc.gov/RestaurantInspection/SearchBrowse.do'

In [56]:
import time

driver = webdriver.Firefox()

In [63]:
def get_inspection_results(row):
  try:
    driver.get(
      'http://a816-restaurantinspection.nyc.gov/RestaurantInspection/SearchBrowse.do')
    restaurant_input = driver.find_element_by_id('searchByNameText')
    zip_input = driver.find_element_by_id('searchByZipCodeText')

    restaurant_input.send_keys(row['name'])
    zip_input.send_keys(row['zipcode'])
    # wait for 1 second
    time.sleep(1)

    button = driver.find_element_by_name('button_display')
    button.click()
    time.sleep(1)
    driver.find_element_by_class_name('resultCell').find_element_by_tag_name(
      'a').click()

    time.sleep(1)
    violationScore = driver.find_element_by_css_selector(
      '#violationScore b').text
    violationDescriptions = driver.find_element_by_css_selector(
      '#violationDesc').text
    return pd.Series({
      'points': violationScore,
      'violations': violationDescriptions
    })
  except:
    return pd.Series({})


In [58]:
violationDescriptions

'Sanitary Violations\n1) Cold food item held above 41º F (smoked fish and reduced oxygen packaged foods above 38 ºF) except during necessary preparation.\n2) Thawing procedures improper.\n3) Non-food contact surface improperly constructed. Unacceptable material used. Non-food contact surface or equipment improperly maintained and/or not properly sealed, raised, spaced or movable to allow accessibility for cleaning on all sides, above and underneath the unit.'

In [59]:
violationScore

'12'

In [64]:
violations_df = new_df.apply(get_inspection_results, axis=1).join(
  new_df)

violations_df

Unnamed: 0,points,violations,zipcode,address,categories,name,price,rating,url,price_range,somas_opinion
0,7.0,Sanitary Violations\n1) Wiping cloths soiled o...,10025,"964 Amsterdam AveNew York, NY 10025",Italian,La Piccola Cucina,$$,4.0,https://www.yelp.com/biz/la-piccola-cucina-new...,normal,hate
1,8.0,Sanitary Violations\n1) Wiping cloths soiled o...,10027,"125 La Salle StNew York, NY 10027",Italian,Pisticci,$$,4.0,https://www.yelp.com/biz/pisticci-new-york?osq...,normal,hate
2,12.0,Sanitary Violations\n1) Live roaches present i...,10027,"1274 Amsterdam AveNew York, NY 10027",Italian,Max Soha,$$,4.0,https://www.yelp.com/biz/max-soha-new-york?osq...,normal,hate
3,12.0,Sanitary Violations\n1) Cold food item held ab...,10025,"886 Amsterdam AveNew York, NY 10025",Italian,Arco Cafe,$$,4.5,https://www.yelp.com/biz/arco-cafe-new-york?os...,normal,hate
4,7.0,Sanitary Violations\n1) Food not protected fro...,10025,"2791 BroadwayNew York, NY 10025","Pizza, Italian",Mezzogiorno,$$,4.0,https://www.yelp.com/biz/mezzogiorno-new-york-...,normal,hate
5,3.0,Sanitary Violations\n1) Food contact surface n...,10025,"1034 Amsterdam AveNew York, NY 10025",Italian,Tartina,$$,4.0,https://www.yelp.com/biz/tartina-new-york-4?os...,normal,hate
6,28.0,Sanitary Violations\n1) Evidence of mice or li...,10025,"53 W 106 StNew York, NY 10025",Italian,Osteria 106,$$,4.5,https://www.yelp.com/biz/osteria-106-new-york?...,normal,hate
7,,,10031,"1600 Amsterdam AveNew York, NY 10031","Pizza, Italian, Bars",Fumo Pizza-Bar-Pasta,$$,4.0,https://www.yelp.com/biz/fumo-pizza-bar-pasta-...,normal,hate
8,12.0,Sanitary Violations\n1) Food not cooled by an ...,10027,"3143 BroadwayNew York, NY 10027","Pizza, Italian",Bettolona,$$,4.0,https://www.yelp.com/biz/bettolona-new-york?os...,normal,hate
9,2.0,Sanitary Violations\n1) Mechanical or natural ...,10021,"1415 Second AveNew York, NY 10021","Italian, Pasta Shops",Bigoi Venezia,$,4.5,https://www.yelp.com/biz/bigoi-venezia-new-yor...,cheap,love


In [65]:
violations_df.to_csv('08-classwork/restaurants-with-violations.tsv',
                     sep='\t', index=False)
