In [96]:
#webscraping libraries
import requests as rq
from bs4 import BeautifulSoup as bs

# dataframe libraries
import pandas as pd 
import numpy as np

url = 'https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa'
#url = 'https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa?s=120'

In [97]:
# create soup object
page = rq.get(url)
soup = bs(page.content, 'html.parser')

In [98]:
# after inspecting the page, the urls exist with 'h3'
h3s = soup.find_all('h3', class_ = 'result-heading',
                   # attrs=('class': 'result-heading') 
                   )

In [99]:
h3s[:2]

[<h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7311053947" href="https://newyork.craigslist.org/brk/apa/d/brooklyn-deal-alert-williamsburg-bed/7311053947.html" id="postid_7311053947">DEAL ALERT! WILLIAMSBURG 1 BED PLUS OFFICE!!!!</a>
 </h3>,
 <h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7311053701" href="https://newyork.craigslist.org/wch/apa/d/white-plains-beautiful-1br-view-of-the/7311053701.html" id="postid_7311053701">Beautiful 1BR w/ View of The Park!</a>
 </h3>]

In [100]:
len(h3s)

120

In [101]:
# links are in 'href' , with 'a' object
h3s[0].find('a')['href']

'https://newyork.craigslist.org/brk/apa/d/brooklyn-deal-alert-williamsburg-bed/7311053947.html'

In [102]:
#find all links in each item in the list

links = [post.find('a')['href'] for post in h3s]
links[:5]

['https://newyork.craigslist.org/brk/apa/d/brooklyn-deal-alert-williamsburg-bed/7311053947.html',
 'https://newyork.craigslist.org/wch/apa/d/white-plains-beautiful-1br-view-of-the/7311053701.html',
 'https://newyork.craigslist.org/brk/apa/d/brooklyn-immediate-move-infloor-to/7311053591.html',
 'https://newyork.craigslist.org/que/apa/d/woodside-bedroom-apartment-in-woodside/7311053366.html',
 'https://newyork.craigslist.org/brk/apa/d/brooklyn-flex-three-bedroom-railroad/7311052324.html']

In [103]:
len(links)

120

In [104]:
test_url = links[0]
page = rq.get(test_url)
soup = bs(page.content, 'html.parser')

In [105]:
test_url

'https://newyork.craigslist.org/brk/apa/d/brooklyn-deal-alert-williamsburg-bed/7311053947.html'

In [106]:
# getting to the price 

int(soup.find('span', attrs= {'class': 'price'}).contents[0].replace('$', '').replace(',', ''))

2100

In [107]:
# getting location 

soup.find('small').contents[0].strip()\
    .replace('(','')\
    .replace(')','')\
    .replace(' ','')\
    .replace(' ','_')\
    .lower()

'williamsburg'

In [108]:
# bedrooms 

int(soup.find('span', attrs ={'class' : "shared-line-bubble"}).text.split('/')[0][:-3])

2

In [109]:
# bathrooms

int(soup.find('span', attrs ={'class' : "shared-line-bubble"}).contents[2].contents[0][:-2])

1

In [143]:
def info_grabber(url):
    
    info ={}
    
    # converting page to soup object
    page = rq.get(url)
    soup = bs(page.content, 'html.parser')
    
    # getting price
    price = soup.find('span', attrs= {'class': 'price'})
                      
    if price is not None:
        info['price'] = float(price.contents[0].replace('$', '').replace(',', ''))
    else:    
        print(f'Price Error: {url}')
        # put NaN in its place
        info['price'] = np.nan
    
    # getting location
    try: 
        info['location'] = soup.find_all('small')[0].contents[0].strip()\
            .replace('(','')\
            .replace(')','')\
            .replace(' ','')\
            .replace(' ','_')\
            .lower()
        
    except IndexError :

        print(f'Location Error: {url}')
        # put NaN in its place
        info['location'] = np.nan

        
    # getting bedrooms 
    try:
        info['num_beds'] = int(soup.find('span', attrs ={'class' : "shared-line-bubble"}).contents[0].contents[0][:-2])
    
    except :
        print(f'Bedrooms Error: {url}')
        # put NaN in its place
        info['num_beds'] = np.nan
        
        
    # getting bathrooms
    try : 
        info['num_bath'] = float(soup.find('span', attrs ={'class' : "shared-line-bubble"}).contents[2].contents[0][:-2])
    
    except ValueError :
        print(f'Bathroom Error: {url}')
        # put NaN in its place
        info['num_baths'] = np.nan
    
    info['url'] = url
        
    return info

In [144]:
%%time 

apts_info = [info_grabber(link) for link in links]

Price Error: https://newyork.craigslist.org/brx/apa/d/bronx-bedroom-bath-condo-with/7311042079.html
Location Error: https://newyork.craigslist.org/brx/apa/d/bronx-bedroom-bath-condo-with/7311042079.html
Bedrooms Error: https://newyork.craigslist.org/brx/apa/d/bronx-bedroom-bath-condo-with/7311042079.html


AttributeError: 'NoneType' object has no attribute 'contents'

In [63]:
apts_info[:2]

[{'price': 2653.0,
  'location': 'financialdistrict',
  'num_beds': 2,
  'num_bath': 1.0,
  'url': 'https://newyork.craigslist.org/mnh/apa/d/new-york-lux-loft-spanning-838-sf-mos/7311044350.html'},
 {'price': 2575.0,
  'location': 'astoria',
  'num_beds': 1,
  'num_bath': 1.0,
  'url': 'https://newyork.craigslist.org/que/apa/d/astoria-free-month-no-broker-fee-60-sec/7311042623.html'}]

In [65]:
df = pd.DataFrame(apts_info)

In [66]:
df.isna().sum()

price       0
location    1
num_beds    0
num_bath    0
url         0
dtype: int64

In [67]:
df.head()

Unnamed: 0,price,location,num_beds,num_bath,url
0,2653.0,financialdistrict,2,1.0,https://newyork.craigslist.org/mnh/apa/d/new-y...
1,2575.0,astoria,1,1.0,https://newyork.craigslist.org/que/apa/d/astor...
2,2000.0,queens,2,1.0,https://newyork.craigslist.org/que/apa/d/south...
3,750.0,bronx,2,1.0,https://newyork.craigslist.org/brx/apa/d/bronx...
4,500.0,queens,1,1.0,https://newyork.craigslist.org/que/apa/d/astor...


In [94]:
def scrape_all(url):
    
    based_url =  'https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa'
    
    page = rq.get(url)
    soup = bs(page.content, 'html.parser')
    h3s = soup.find_all('h3', class_ = 'result-heading')
    links = [post.find('a')['href'] for post in h3s]
    
    data =  [info_grabber(link) for link in links]
    
    
    return data

In [95]:
scrape_all(url)

AttributeError: 'NoneType' object has no attribute 'contents'

In [84]:
soup.find('span', {'class': 'button next'})

<span class="button next" title="next page"> next &gt; </span>

In [88]:
soup.find('a', {'class': 'button next'})['href']

'/d/apartments-housing-for-rent/search/apa?s=120'