In [91]:
#webscraping libraries
import requests as rq
from bs4 import BeautifulSoup as bs

# dataframe libraries
import pandas as pd 
import numpy as np

url = 'https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa'
#url = 'https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa?s=120'

In [79]:
# create soup object
page = rq.get(url)
soup = bs(page.content, 'html.parser')

In [57]:
# after inspecting the page, the urls exist with 'h3'
h3s = soup.find_all('h3', class_ = 'result-heading',
                   # attrs=('class': 'result-heading') 
                   )

In [58]:
h3s[:2]

[<h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7307969846" href="https://newyork.craigslist.org/que/apa/d/sunnyside-serene-gardens-community/7307969846.html" id="postid_7307969846">Serene Gardens Community</a>
 </h3>,
 <h3 class="result-heading">
 <a class="result-title hdrlnk" data-id="7307968308" href="https://newyork.craigslist.org/que/apa/d/ridgewood-one-bed-avail-in-ridgewood/7307968308.html" id="postid_7307968308">ONE BED AVAIL IN RIDGEWOOD!</a>
 </h3>]

In [59]:
len(h3s)

120

In [60]:
# links are in 'href' , with 'a' object
h3s[0].find('a')['href']

'https://newyork.craigslist.org/que/apa/d/sunnyside-serene-gardens-community/7307969846.html'

In [61]:
#find all links in each item in the list

links = [post.find('a')['href'] for post in h3s]
links[:5]

['https://newyork.craigslist.org/que/apa/d/sunnyside-serene-gardens-community/7307969846.html',
 'https://newyork.craigslist.org/que/apa/d/ridgewood-one-bed-avail-in-ridgewood/7307968308.html',
 'https://newyork.craigslist.org/wch/apa/d/west-harrison-west-harrison-bed-bth/7307966474.html',
 'https://newyork.craigslist.org/brk/apa/d/brooklyn-work-from-home-private-loft/7307965694.html',
 'https://newyork.craigslist.org/brk/apa/d/brooklyn-big-bedroom-close-to-prospect/7307964746.html']

In [62]:
len(links)

120

In [63]:
test_url = links[0]
page = rq.get(test_url)
soup = bs(page.content, 'html.parser')

In [64]:
test_url

'https://newyork.craigslist.org/que/apa/d/sunnyside-serene-gardens-community/7307969846.html'

In [65]:
# getting to the price 

int(soup.find('span', attrs= {'class': 'price'}).contents[0].replace('$', '').replace(',', ''))

1950

In [66]:
# getting location 

soup.find('small').contents[0].strip()\
    .replace('(','')\
    .replace(')','')\
    .replace(' ','')\
    .replace(' ','_')\
    .lower()

'sunnysidegardens,queens,newyork'

In [67]:
# bedrooms 

int(soup.find('span', attrs ={'class' : "shared-line-bubble"}).text.split('/')[0][:-3])

1

In [68]:
# bathrooms

int(soup.find('span', attrs ={'class' : "shared-line-bubble"}).contents[2].contents[0][:-2])

1

In [69]:
def info_grabber(url):
    
    """
    Scrape iformation about apt listed at corresponding URL 
    returns a dict with location, num_bedrooms, num_bath, url
    
    """
    info ={}
    
    # converting page to soup object
    page = rq.get(url)
    soup = bs(page.content, 'html.parser')
    
    # getting price
    price = soup.find('span', attrs= {'class': 'price'})
                      
    if price is not None:
        info['price'] = float(price.contents[0].replace('$', '').replace(',', ''))
    else:    
        print(f'Price Error: {url}')
        # put NaN in its place
        info['price'] = np.nan
    
    # getting location
    try: 
        info['location'] = soup.find_all('small')[0].contents[0].strip()\
            .replace('(','')\
            .replace(')','')\
            .replace(' ','')\
            .replace(' ','_')\
            .lower()
        
    except IndexError :

        print(f'Location Error: {url}')
        # put NaN in its place
        info['location'] = np.nan

        
    # getting bedrooms 
    try:
        info['num_beds'] = int(soup.find('span', attrs ={'class' : "shared-line-bubble"}).contents[0].contents[0][:-2])
    
    except :
        print(f'Bedrooms Error: {url}')
        # put NaN in its place
        info['num_beds'] = np.nan
        # put NaN in its place
        info['num_baths'] = np.nan
        
    # getting bathrooms
    
    bathrooms = soup.find('span', attrs ={'class' : "shared-line-bubble"})
    if bathrooms is not None:
        bath_value = bathrooms.contents[2].contents[0][:-2]
        if len(bath_value) < 3:
            info['num_bath'] = float(bathrooms.contents[2].contents[0][:-2])
        else:
            info['num_baths'] = 1
    else :
        print(f'Bathrooms Error: {url}')
        # put NaN in its place
        info['num_baths'] = np.nan
    
    info['url'] = url
        
    return info

In [102]:
%%time 

apts_info = [info_grabber(link) for link in links]

Location Error: https://newyork.craigslist.org/wch/apa/d/yonkers-bedroom-in-yonkers-serious/7307887699.html
Wall time: 23.2 s


In [103]:
apts_info[:2]

[{'price': 1950.0,
  'location': 'sunnysidegardens,queens,newyork',
  'num_beds': 1,
  'num_bath': 1.0,
  'url': 'https://newyork.craigslist.org/que/apa/d/sunnyside-serene-gardens-community/7307969846.html'},
 {'price': 1750.0,
  'location': 'ridgewood',
  'num_beds': 1,
  'num_bath': 1.0,
  'url': 'https://newyork.craigslist.org/que/apa/d/ridgewood-one-bed-avail-in-ridgewood/7307968308.html'}]

In [18]:
df = pd.DataFrame(apts_info)

In [20]:
df.head()

Unnamed: 0,price,location,num_beds,num_bath,url
0,1800.0,astoria,1,1.0,https://newyork.craigslist.org/brx/apa/d/astor...
1,2150.0,longwoodhistoricdistrictbronx,2,1.0,https://newyork.craigslist.org/brx/apa/d/bronx...
2,180.0,oldmillbasin,1,1.0,https://newyork.craigslist.org/brk/apa/d/brook...
3,800.0,2500johnsonavenue#bronx,1,1.0,https://newyork.craigslist.org/brx/apa/d/bronx...
4,2500.0,arrochar,2,2.5,https://newyork.craigslist.org/stn/apa/d/state...


In [92]:
def scrape_all(url):
    
    """
    scrape all availible pages with coressponding apt info
    """
    
    based_url =  'https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa'
    
    page = rq.get(url)
    soup = bs(page.content, 'html.parser')
    h3s = soup.find_all('h3', class_ = 'result-heading')
    links = [post.find('a')['href'] for post in h3s]
    
    data =  [info_grabber(link) for link in links]
    
    # pager = soup.find('span', {'class': 'button next'})
    # if pager:
    
    next_url = soup.find('a', {'class': 'button next'})['href']
    if len(next_url) > 0:
        next_page = based_url + next_url
        print('Scraping: ', next_page)

        data.extend(scrape_all(next_page))

    return data

In [93]:
all_data = scrape_all(url)

Location Error: https://newyork.craigslist.org/wch/apa/d/irvington-charming-bedroomcottage-in/7311118856.html
Scraping:  https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa/d/apartments-housing-for-rent/search/apa?s=120
Location Error: https://newyork.craigslist.org/wch/apa/d/bronx-nice-bedroom-apartements-available/7311010422.html
Scraping:  https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa/d/apartments-housing-for-rent/search/apa?s=240
Location Error: https://newyork.craigslist.org/mnh/apa/d/new-york-magnificent-3br-duplex-an/7310941320.html
Scraping:  https://newyork.craigslist.org/d/apartments-housing-for-rent/search/apa/d/apartments-housing-for-rent/search/apa?s=360
Bedrooms Error: https://newyork.craigslist.org/brk/apa/d/new-york-large-alcove-studio-no-fee/7310867144.html
Bathrooms Error: https://newyork.craigslist.org/brk/apa/d/new-york-large-alcove-studio-no-fee/7310867144.html
Price Error: https://newyork.craigslist.org/mnh/apa/d/new-y

In [107]:
all_data[:2]

[{'price': 1380.0,
  'location': 'elmhurst',
  'num_beds': 1,
  'num_bath': 1.0,
  'url': 'https://newyork.craigslist.org/que/apa/d/elmhurst-studio-with-big-backyard/7311219299.html'},
 {'price': 2100.0,
  'location': 'astoriaqueens',
  'num_beds': 2,
  'num_bath': 1.0,
  'url': 'https://newyork.craigslist.org/que/apa/d/astoria-2bedroom-1bath-apartment-close/7311217044.html'}]

In [108]:
df_final = pd.DataFrame(all_data)

In [114]:
df_final = df_final.drop(columns=['num_baths'])

In [115]:
df_final.head()

Unnamed: 0,price,location,num_beds,num_bath,url
0,1380.0,elmhurst,1.0,1.0,https://newyork.craigslist.org/que/apa/d/elmhu...
1,2100.0,astoriaqueens,2.0,1.0,https://newyork.craigslist.org/que/apa/d/astor...
2,2600.0,bushwick,3.0,1.0,https://newyork.craigslist.org/brk/apa/d/brook...
3,1500.0,queensvillagepppl,2.0,1.0,https://newyork.craigslist.org/que/apa/d/queen...
4,1800.0,astoria,1.0,1.0,https://newyork.craigslist.org/brx/apa/d/astor...


In [116]:
df_final.to_csv('scrapped_data.csv', )

In [119]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     2997 non-null   float64
 1   location  2964 non-null   object 
 2   num_beds  2999 non-null   float64
 3   num_bath  2846 non-null   float64
 4   url       3000 non-null   object 
dtypes: float64(3), object(2)
memory usage: 117.3+ KB
