In [1]:
from datetime import datetime as dt
from datetime import timedelta
import importlib
import json
import logging
import urllib
import urllib.parse

from lxml import html
import numpy as np
import pandas as pd
import requests
import unicodecsv as csv
#import syslog
#import psycopg2
#import shutil
#import os
#import glob
#import subprocess
#import time
#import sys
#from requests.auth import HTTPProxyAuth
#from __future__ import division




In [66]:
# Search-result pages to crawl, one entry per region.
# For now the scraper is only pointed at the San Francisco Bay Area.
DOMAINS = ['http://sfbay.craigslist.org/search/roo']

# Craigslist timestamps carry no time zone, so the cutoffs below are compared
# against the local time at the listing's location. For example, dt.now() taken
# on a machine in San Francisco will match listings posted 3 hours ago in Boston.
LATEST_TS = dt.now()
EARLIEST_TS = LATEST_TS - timedelta(minutes=30)

# Output directory (must end with a path separator because file names are
# appended by plain string concatenation).
# James's directory:
OUT_DIR = "C:\\Users\\james\\Documents\\Berkeley_Docs\\Spring_17_Courses\\CP290 Data Lab\\scraper_output\\"
# Annie's directory:
#OUT_DIR ='/Users/anniedbr/Desktop/CSV/'

# Output file naming: prefix for saved data, and whether to append a timestamp.
FNAME_BASE = 'data'
FNAME_TS = True

# S3 settings (stored on the scraper instance).
S3_UPLOAD = False
S3_BUCKET = 'scraper2'

class RentalListingScraper(object):
    '''
    Scrapes Craigslist search-result pages and writes one CSV row per listing
    whose timestamp falls between earliest_ts and latest_ts.

    For every listing inside the time window the listing page itself is
    downloaded once and the map coordinates, body text, street address and
    amenity flags are appended to the summary data from the search page.
    '''

    # Seconds to wait for any single HTTP response before giving up.
    REQUEST_TIMEOUT = 30

    # XPath for the href of the "next page" link on a search-result page.
    # NOTE(review): this selector is not derived from anything in this file;
    # confirm it against the current Craigslist markup. If it matches nothing
    # the crawl simply stops after the first page of each region.
    NEXT_PAGE_XPATH = '//a[@title="next page"]/@href'

    # CSV header. Every row written by run() has exactly these columns.
    COLNAMES = ['pid','dt','url','title','price','neighb','sqft',
                'lat','lng','accuracy','body_text','address','private_room',
                'private_bath', 'carport_or_garage', 'washer_unit', 'washer_building']

    def __init__(
            self,
            domains = None,
            earliest_ts = None,
            latest_ts = None,
            out_dir = None,
            fname_base = None,
            fname_ts = None,
            s3_upload = None,
            s3_bucket = None):
        '''
        Store the configuration and point the root logger at a log file.

        Any argument left as None falls back to the matching module-level
        constant (DOMAINS, EARLIEST_TS, LATEST_TS, OUT_DIR, FNAME_BASE,
        FNAME_TS, S3_UPLOAD, S3_BUCKET). The constants are looked up when the
        object is created rather than when the class is defined, so editing
        the configuration and re-creating the scraper is enough.

        out_dir must end with a path separator: file names are appended to it
        by string concatenation. s3_upload and s3_bucket are only stored.
        '''
        self.domains = DOMAINS if domains is None else domains
        self.earliest_ts = EARLIEST_TS if earliest_ts is None else earliest_ts
        self.latest_ts = LATEST_TS if latest_ts is None else latest_ts
        self.out_dir = OUT_DIR if out_dir is None else out_dir
        self.fname_base = FNAME_BASE if fname_base is None else fname_base
        self.fname_ts = FNAME_TS if fname_ts is None else fname_ts
        self.s3_upload = S3_UPLOAD if s3_upload is None else s3_upload
        self.s3_bucket = S3_BUCKET if s3_bucket is None else s3_bucket
        self.ts = dt.now().strftime('%Y%m%d-%H%M%S')  # Use timestamp as file id

        log_fname = self.out_dir + self._fileStem() + '.log'

        # basicConfig() does nothing when the root logger already has handlers
        # (the usual situation inside a notebook kernel), so drop them first.
        # This replaces reloading the whole logging module.
        root_logger = logging.getLogger()
        for handler in list(root_logger.handlers):
            root_logger.removeHandler(handler)
            handler.close()

        logging.basicConfig(filename=log_fname, level=logging.INFO)

    def _fileStem(self):
        '''
        Shared file-name stem for the log and CSV outputs, so both files of
        one run sit next to each other: "<fname_base>-<timestamp>", or
        "<fname_base>-" when fname_ts is false.
        '''
        return self.fname_base + '-' + (self.ts if self.fname_ts else '')

    def _get_str(self, list):
        '''
        The xpath() function returns a list of items that may be empty. Most of the time,
        we want the first of any strings that match the xml query. This helper function
        returns that string, or an empty string if the list is empty.
        '''
        return list[0] if len(list) > 0 else ''

    def _get_int_prefix(self, str, label):
        '''
        Bedrooms and square footage have the format "xx 1br xx 450ft xx". This helper
        function returns the integer written directly in front of `label` in the
        first whitespace-separated token that has one (e.g. 450 for label "ft"),
        or 0 when no token looks like "<digits><label>".
        '''
        for token in str.split():
            if label not in token:
                continue
            # Only the text in front of the label counts; a word such as
            # "loft" contains "ft" but carries no number and is skipped.
            digits = token.partition(label)[0]
            if digits.isdigit():
                return int(digits)

        return 0

    def _toFloat(self, string_value):
        '''
        Convert a numeric string to float; blank or whitespace-only input
        gives NaN.
        '''
        string_value = string_value.strip()
        # np.float was only an alias of the builtin and has been removed from NumPy.
        return float(string_value) if string_value else np.nan

    def _parseListing(self, item):
        '''
        Pull the summary fields out of one search-result row.

        Note that xpath() returns a list with elements of varying types depending on the
        query results: xml objects, strings, etc.

        Returns [pid, dt, url, title, price, neighb, sqft]. The pid, timestamp
        and url are required and raise IndexError when absent; the remaining
        fields fall back to '' (or 0 for sqft).
        '''
        pid = item.xpath('@data-pid')[0]  # post id, always present
        info = item.xpath('p[@class="result-info"]')[0]
        posted = info.xpath('time/@datetime')[0]
        url = info.xpath('a/@href')[0]
        title = self._get_str(info.xpath('a/text()'))

        meta = 'span[@class="result-meta"]'
        price = self._get_str(
            info.xpath(meta + '/span[@class="result-price"]/text()')).strip('$')
        # The neighborhood is wrapped in parentheses on the page.
        neighb = self._get_str(
            info.xpath(meta + '/span[@class="result-hood"]/text()')).strip(' ()')
        housing = self._get_str(
            info.xpath(meta + '/span[@class="housing"]/text()'))
        sqft = self._get_int_prefix(housing, "ft")  # appears as "000ft" or missing

        return [pid, posted, url, title, price, neighb, sqft]

    def _parseAddress(self, tree):
        '''
        Some listings include an address, but we have to parse it out of an encoded
        Google Maps url. Returns '' when the page has no such link.
        '''
        url = self._get_str(tree.xpath('//p[@class="mapaddress"]/small/a/@href'))

        if '?q=loc' not in url:
            # That string precedes an address search
            return ''

        return urllib.parse.unquote_plus(url.split('?q=loc')[1]).strip(' :')

    def _fetchTree(self, url):
        '''
        Download `url` and return the parsed lxml tree. HTTP error statuses
        raise, so an error page is never mistaken for a listing.
        '''
        page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        page.raise_for_status()
        return html.fromstring(page.content)

    def _bodyFromTree(self, tree):
        '''
        Return [body_text] for a parsed listing page: the direct text nodes of
        the posting body, stripped and joined with newlines, or [''] when the
        page has no posting body.
        '''
        sections = tree.xpath('//section[@id="postingbody"]')
        if len(sections) == 0:
            return ['']

        pieces = [text.strip() for text in sections[0].xpath('text()')]
        return ['\n'.join(piece for piece in pieces if piece)]

    def _attrsFromTree(self, tree):
        '''
        Return [private_room, private_bath, carport_or_garage, washer_unit,
        washer_building] as booleans for a parsed listing page. A flag is
        False when its label is not found.
        '''
        # NOTE(review): the query only looks at <b> elements inside the
        # attribute group, as the original code did - confirm that the
        # amenity labels really are rendered in <b> tags.
        nodes = tree.xpath('//div[@class="mapAndAttrs"]/p[@class="attrgroup"]/span/b')
        labels = set((node.text or '').strip().lower() for node in nodes)

        parking = ['carport','attached garage','detached garage']
        washer_list = ['laundry in bldg','laundry on site']

        private_room = 'private room' in labels
        private_bath = 'private bath' in labels
        carport_or_garage = any(name in labels for name in parking)
        washer_unit = 'w/d in unit' in labels
        washer_building = any(name in labels for name in washer_list)

        return [private_room, private_bath, carport_or_garage, washer_unit, washer_building]

    def _latLngFromTree(self, tree):
        '''
        Return [lat, lng, accuracy] for a parsed listing page. The list always
        has three items so CSV columns stay aligned; values are '' when the
        page has no map or an attribute is missing.
        '''
        maps = tree.xpath('//div[@id="map"]')

        # Sometimes there's no location info, and no map on the page
        if len(maps) == 0:
            return ['', '', '']

        map_node = maps[0]
        return [self._get_str(map_node.xpath('@data-latitude')),
                self._get_str(map_node.xpath('@data-longitude')),
                self._get_str(map_node.xpath('@data-accuracy'))]

    def PageBodyText(self, url):
        '''
        Download the listing at `url` and return [body_text] (see
        _bodyFromTree).
        '''
        return self._bodyFromTree(self._fetchTree(url))

    def PageAttributes(self, url):
        '''
        Download the listing at `url` and return the five amenity flags (see
        _attrsFromTree).
        '''
        return self._attrsFromTree(self._fetchTree(url))

    def _scrapeLatLng(self, url):
        '''
        Download the listing at `url` and return [lat, lng, accuracy] (see
        _latLngFromTree).
        '''
        return self._latLngFromTree(self._fetchTree(url))

    def _scrapeRegion(self, domain, writer):
        '''
        Walk the search-result pages that start at `domain` and write one row
        per listing inside the time window. Stops at the first listing older
        than earliest_ts, when a search page cannot be loaded, or when there
        is no further result page.
        '''
        search_url = domain
        visited = set()  # guards against a "next" link that loops back

        while search_url and search_url not in visited:
            visited.add(search_url)
            logging.info(search_url)

            try:
                tree = self._fetchTree(search_url)
            except Exception as e:
                logging.warning("{0}: {1}. Could not load search page".format(type(e).__name__, e))
                return

            listings = tree.xpath('//li[@class="result-row"]')

            for item in listings:
                try:
                    row = self._parseListing(item)
                    item_ts = dt.strptime(row[1], '%Y-%m-%d %H:%M')

                    if (item_ts > self.latest_ts):
                        # Skip this item but continue parsing search results
                        continue

                    if (item_ts < self.earliest_ts):
                        # Everything after this listing is treated as older
                        # too, so this region is finished.
                        logging.info('REACHED TIMESTAMP CUTOFF')
                        return

                    # urljoin copes with relative and absolute hrefs and does
                    # not depend on the search category in the domain url.
                    item_url = urllib.parse.urljoin(domain, row[2])
                    row[2] = item_url
                    logging.info(item_url)

                    # One download per listing feeds every detail column.
                    listing_tree = self._fetchTree(item_url)
                    row += self._latLngFromTree(listing_tree)
                    row += self._bodyFromTree(listing_tree)
                    row.append(self._parseAddress(listing_tree))
                    row += self._attrsFromTree(listing_tree)
                    writer.writerow(row)

                except Exception as e:
                    # Skip listing if there are problems parsing it
                    logging.warning("{0}: {1}. Probably no beds/sqft info".format(type(e).__name__, e))
                    continue

            # Move on to the next result page. Without this step the same
            # page would be fetched again and again.
            next_href = self._get_str(tree.xpath(self.NEXT_PAGE_XPATH))
            if len(listings) == 0 or not next_href:
                logging.info('NO MORE RESULT PAGES')
                return

            search_url = urllib.parse.urljoin(search_url, next_href)

    def run(self):
        '''
        Scrape every domain in self.domains and write the results to
        "<out_dir><fname_base>-<timestamp>.csv". Progress and skipped listings
        are reported through the logging module. Returns None.
        '''
        fname = self.out_dir + self._fileStem() + '.csv'

        # unicodecsv encodes the rows itself, so the file is opened in binary mode.
        with open(fname, 'wb') as f:
            writer = csv.writer(f)
            writer.writerow(self.COLNAMES)

            for domain in self.domains:
                print("beginning new region")
                logging.info('BEGINNING NEW REGION')
                self._scrapeRegion(domain, writer)

        return

In [67]:
# Build a scraper that uses the default configuration constants (DOMAINS,
# EARLIEST_TS, LATEST_TS, OUT_DIR, ...). The constructor also sets up the log file.
scraper = RentalListingScraper()

In [68]:
# Crawl each configured search domain and write the listings to a CSV file in OUT_DIR.
scraper.run()

beginning new region


In [None]:
#TODO: Figure out how to get the number of bedrooms.
#Baths were taken out of the original code because they appear in a different location on shared-housing listings.
#Open question: what is row[2] in item_url = domain.split('/search')[0] + row[2]?
#TODO: Logging is not working.
#TODO: Add the other functions back in.
    
    
    
    
    def PageAttributes(self, session, url, proxy=True):
        '''
        Session-based variant of PageAttributes: download the listing at `url`
        through `session` and report which amenities it advertises.

        session: object with a requests-style get(url, timeout=...) method.
        url:     address of the listing page.
        proxy:   accepted for compatibility; not used by this method.

        Returns [private_room, private_bath, carport_or_garage, washer_unit,
        washer_building] as booleans. A flag is False when its label is not
        found on the page.
        '''
        page = session.get(url, timeout=30)
        tree = html.fromstring(page.content)

        # NOTE(review): only <b> elements inside the attribute group are
        # inspected, as in the original query - confirm that the amenity
        # labels really are rendered in <b> tags.
        nodes = tree.xpath('//div[@class="mapAndAttrs"]/p[@class="attrgroup"]/span/b')

        # Compare on the element text; the elements themselves never equal a string.
        labels = set((node.text or '').strip().lower() for node in nodes)

        parking = ['carport','attached garage','detached garage']
        washer_list = ['laundry in bldg','laundry on site']

        # Every flag gets a value, so the return statement can no longer hit
        # an unassigned name when a label is absent.
        private_room = 'private room' in labels
        private_bath = 'private bath' in labels
        carport_or_garage = any(name in labels for name in parking)
        washer_unit = 'w/d in unit' in labels
        washer_building = any(name in labels for name in washer_list)

        return [private_room, private_bath, carport_or_garage, washer_unit, washer_building]
    