In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import pandas as pd
import ast

In [23]:
class YelpScraper:
    def __init__():
        self.fields = {
            'name': [],
            'rating': [],
            'review_count': [],
            'address': [],
            'barrio': [],
            'price': []
        }
        self.url = f"https://www.yelp.com/search?find_desc=brunch&find_loc=Buenos+Aires%2C+Argentina&start={multiple}"
        self.results = None 
    
    def scrape(self, start, end):
        for multiple in range(start, end, 10):
            page = urlopen(self.url) 
            soup = BeautifulSoup(page, 'html.parser')
            search_results = (
                soup
                .find('div', {'id': 'super-container'})
                .find('div', {'class': 'search-results-content'})
                .find_all('li', {'class': 'regular-search-result'})
            )    
            # iterate through search results
            for result in search_results:
                self._get_name(result)
                self._get_barrio(result)
                self._get_address(result)
                self._get_rating(result)
                self._get_review_count(result)
                self._get_price(result)  
                
            print(f'{multiple+10} results scraped!')
            time.sleep(60)
            
        self.results = pd.DataFrame(self.fields)
    
    def scrub(self, results):
        self.results['barrio'] = self.results['barrio'].apply(self._clean_barrio)
        self.results['address'] = self.results['address'].apply(self._clean_address)
        self.results['rating'] = self.results['rating'].apply(self._clean_rating)
        self.results['review_count'] = self.results['review_count'].apply(self._clean_review_count)
    
    def save(self, location):
        self.results.to_csv(location, index=False)
    
    def _get_name(self, result):
        try:
            name = (result
                    .find('div', {'class': 'main-attributes'})
                    .find('a', {'class': 'biz-name js-analytics-click'})
                    .span
                    .string)
            self.fields['name'].append(name)
            print(name)
        except:
            self.fields['names'].append('')
    
    def _get_barrio(self, result):
        try:
            self.fields['barrio'].append(
                result
                .find('div', {'class': 'secondary-attributes'})
                .span
                .string
            )
        except:
            self.fields['barrio'].append('')
            
    def _get_address(self, result):
        try:
            self.fields['address'].append(
                result
                .find('div', {'class': 'secondary-attributes'})
                .address
                .contents
            )
        except:
            self.fields['address'].append('')
    
    def _get_rating(self, result):
        try:
            self.fields['rating'].append(
                result
                .find('div', {'class': 'main-attributes'})
                .find('div', {'class': 'i-stars'})
                .attrs['title']
            )
        except:
            self.fields['rating'].append('')
    
    def _get_review_count(self, result):
        try:
            self.fields['review_count'].append(
                result
                .find('div', {'class': 'main-attributes'})
                .find('div', {'class': 'biz-rating'})
                .span
                .string
            )
        except AttributeError:
            self.fields['review_count'].append('')    
    
    def _get_price(self, result):
        try:
            self.fields['price'].append(
                result
                .find('div', {'class': 'main-attributes'})
                .find('div', {'class': 'price-category'})
                .find('span', {'class': 'business-attribute'})
                .string
            )
        except AttributeError:
            self.fields['price'].append('')
            
    def _clean_barrio(s):
        s = s.strip()
        if s.lower() == 'phone number':
            return ''
        else:
            return s
        
    def _clean_address(s):
        try:
            address_list = ast.literal_eval(s.replace('<br/>,', ''))
            return ', '.join([x.strip() for x in address_list])
        except:
            return ''
    
    def _clean_rating(s):
        rating = 0
        try:
            for x in s.split(): 
                try:
                    rating = float(x)
                except ValueError:
                    pass
        except AttributeError:
            pass
        return rating
    
    def _clean_review_count(s):
        try:
            for x in s.split(): 
                if x.isdigit():
                    break
            return int(x)
        except AttributeError:
            return 0

In [None]:
ys = YelpScraper()
results = ys.scrape()
results = results.scrub()

In [3]:
df = pd.read_csv('/Users/davidfstevens/brunch_places_500.csv')

In [24]:
df.to_csv('/Users/davidfstevens/brunch_places_500_cleaned.csv')

In [None]:
price:string, review_count:integer, rating:float, barrio:string

In [27]:
df[df['name'] == 'El Figón de Bonilla']

Unnamed: 0,name,price,review_count,rating,barrio,address
303,El Figón de Bonilla,,1,3.0,San Nicolas,"Av. Leandro Alem 673, C1001AAB Buenos Aires, A..."


In [34]:
df.iloc[304]

name                                                NaN
price                                               NaN
review_count                                          0
rating                                                0
barrio                                          Palermo
address         Arce 901, C1426 Buenos Aires, Argentina
Name: 304, dtype: object

In [36]:
df = df[~df.name.isnull()]

In [40]:
df.loc[:,'name'] = df['name'].fillna('')
df.loc[:,'price'] = df['price'].fillna('')
df.loc[:,'review_count'] = df['review_count'].fillna(0)
df.loc[:,'rating'] = df['rating'].fillna(0.0)
df.loc[:,'barrio'] = df['barrio'].fillna('')
df.loc[:,'address'] = df['address'].fillna('')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [43]:
df.to_csv('/Users/davidfstevens/workspace/brunch_finder/lib/seeds/brunch_places.csv')