In [None]:
import os
import random
import re
import time
from datetime import datetime
from itertools import cycle

import numpy as np
import pandas as pd
import progressbar # if this isn't installed, use pip install progressbar2
import requests
from bs4 import BeautifulSoup # if this isn't installed, use pip install beautifulsoup4
from selenium import webdriver # if not installed, do pip install selenium

In [None]:
def generateProxies():
    """Collect US-based, HTTPS-capable proxy addresses from us-proxy.org using selenium.

    A Chrome window is opened (incognito, minimized), the proxy table is filtered
    to HTTPS proxies and switched to its larger page size, and every row whose
    anonymity column is not 'transparent' is collected.

    Returns:
        set of str: proxy URLs of the form 'http://<ip>:<port>'. A set is used
        so there are no duplicates.
    """
    IPurl = "https://www.us-proxy.org/" # <-- the robots.txt file for this site allows full access for all user-agents

    # Specify incognito options for Chrome
    option = webdriver.ChromeOptions()
    option.add_argument("--incognito")

    # Create new Chrome instance
    browser = webdriver.Chrome(options=option)

    proxies = set() # using a set ensures there aren't duplicates
    try:
        # Minimize window
        browser.minimize_window()

        # Go to desired website
        browser.get(IPurl)

        # find_elements("xpath", ...) is used instead of find_elements_by_xpath,
        # which newer selenium releases no longer provide.

        # Filter by https only
        https_button = browser.find_elements("xpath", "//*[@id='proxylisttable']/tfoot/tr/th[7]/select/option[3]")[0]
        https_button.click()

        # Set to 80 results
        maxnum_button = browser.find_elements("xpath", "//*[@id='proxylisttable_length']/label/select/option[3]")[0]
        maxnum_button.click()

        # Grab IP's and Ports from the resulting table
        rows = browser.find_elements("xpath", "//*[@id='proxylisttable']/tbody/tr")

        for row in rows:
            fields = row.text.split(' ')

            # Skip placeholder / malformed rows that lack the IP, port and anonymity columns
            if len(fields) < 4:
                continue

            # Transparent proxies are left out so our real IP is not exposed when navigating KSL
            if fields[3].strip().lower() != 'transparent':
                proxies.add(''.join(['http://', ':'.join([fields[0].strip(), fields[1].strip()])]))
    finally:
        # Always end the browser session, even if a lookup or click above failed
        browser.quit()

    return proxies

In [None]:
# Make a function for the scraping done for each search page

# Listing spec title (lower-cased) -> (output field, converter).
# A converter of None stores the scraped text unchanged; converters only run on non-empty values.
_SPEC_COLUMNS = {
    'year': ('year', int),
    'make': ('make', None),
    'model': ('model', None),
    'body': ('body', None),
    'mileage': ('mileage', lambda text: int(text.replace(',', ''))),
    'title type': ('title_type', None),
    # Below this are non-required specs
    'trim': ('trim', None),
    'exterior color': ('ext_color', str.lower),
    'interior color': ('int_color', str.lower),
    'transmission': ('transmission', None),
    'cylinders': ('cylinders', int),
    'fuel type': ('fuel_type', None),
    'number of doors': ('n_doors', int),
    'exterior condition': ('ext_condition', None),
    'interior condition': ('int_condition', None),
    'drive type': ('drive_type', None),
}

# Specs that are recognised but deliberately not saved
_IGNORED_SPECS = ('vin', 'stock number', 'dealer license')

# Per-listing fields in output column order (timestamp, lastpull_ts and link go before these, n_pics after)
_LISTING_FIELDS = ('price', 'year', 'make', 'model', 'body', 'mileage', 'title_type', 'city', 'state', 'seller',
                   'trim', 'ext_color', 'int_color', 'transmission', 'liters', 'cylinders', 'fuel_type',
                   'n_doors', 'ext_condition', 'int_condition', 'drive_type')


def _required_kwarg(kwargs, name, accepted_types, type_label):
    """Return kwargs[name]; raise ValueError if it is missing or TypeError if it has the wrong type."""
    if name not in kwargs:
        raise ValueError(f'{name} is a required input for carscraper().')
    value = kwargs[name]
    if not isinstance(value, accepted_types):
        raise TypeError(f'Expected {type_label} for {name} but got {type(value)}.')
    return value


def _parse_liters(raw):
    """Return the engine size in liters as a float, or None when raw is empty or unreadable."""
    if not raw:
        return None
    try:
        return float(raw)
    except (TypeError, ValueError):
        pass
    # Free-text values such as '3.5L' or 'V6 3.5 L': take the number written directly before the 'L'
    found = re.search(r'(\d+(?:\.\d+)?)\s*L', raw, re.IGNORECASE)
    return float(found.group(1)) if found else None


def _count_listing_photos(lstsoup):
    """Return the photo count shown on a listing page, or 0 when no photo caption is present."""
    caption = lstsoup.select('div.slider-uninitialized > p')
    if caption:
        picstr = caption[0].text.strip()
    else:
        widget = lstsoup.find(id='widgetPhoto')
        if widget is None or widget.p is None:
            return 0
        picstr = widget.p.text.strip()
    found = re.search(r'(\d+)', picstr)
    return int(found.group()) if found else 0


def carscraper(**kwargs):
    '''Scrape one KSL cars search-results page and every listing it links to.

    VARIABLE INPUTS (keyword arguments):
    url: (required str) should be of the form "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
    rooturl: (required str) should be something like "https://cars.ksl.com"
    maxts: (required int or np.int64) the maximum timestamp of the all_cars repository;
        scraping stops at the first listing whose timestamp is not newer than this
    use_proxy: a boolean or binary to indicate if a proxy should be used (default False)
    currproxy: a string indicating the current proxy IP from last function call;
        ignored unless it is a key of proxydict
    proxydict: a dictionary of proxy IPs and associated user-agents to cycle through;
        generated with generateProxies() when missing or not a dict. It is updated in place
        when the pool is refreshed.
    refreshmin: the number of minutes to wait before updating the proxy pool (default 15)

    RETURNS:
    (all_cars, moreresults) without a proxy, or
    (all_cars, moreresults, currproxy, proxydict) with a proxy.
    all_cars is a DataFrame with one row per scraped listing, or an empty list when
    nothing was scraped. moreresults is 1 when the search page links to a further
    page and the end of new data was not reached, else 0.

    RAISES:
    ValueError if url, rooturl or maxts is missing.
    TypeError if url, rooturl, maxts or use_proxy has the wrong type.
    ConnectionError if a proxy is requested but none is available, or every proxy
    fails for the search page.

    ***NOTE: This function is meant to work with a pool of proxy IPs and a various spoofed user-agents'''

    # Need to spoof a user-agent in order to get past crawler block
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

    # the following were pulled manually on 3/12/20 from https://www.whatismybrowser.com/guides/the-latest-user-agent/
    user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.62',
                   'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko']

    # Parse the kwargs (all validation happens before any network access)
    url = _required_kwarg(kwargs, 'url', str, 'string')
    rooturl = _required_kwarg(kwargs, 'rooturl', str, 'string')
    maxts = _required_kwarg(kwargs, 'maxts', (np.int64, int), 'np.int64 or int')

    # default is to NOT use proxy
    use_proxy = kwargs.get('use_proxy', False)
    if not isinstance(use_proxy, (int, bool)):
        raise TypeError(f'Expected int or bool for use_proxy but got {type(use_proxy)}.')

    # Proxy state; stays None when no proxy is used
    proxydict = proxy_pool = currproxy = tstart = refreshmin = None

    if use_proxy:
        # The following inputs are only useful when using a proxy
        if isinstance(kwargs.get('proxydict'), dict):
            proxydict = kwargs['proxydict']
        else:
            if 'proxydict' in kwargs:
                print(f'Expected dict type for proxydict but got {type(kwargs["proxydict"])}. Generating new proxydict...')
            else:
                print('No proxydict found. Generating...')
            proxydict = {i: random.choice(user_agents) for i in generateProxies()}

        if not proxydict:
            raise ConnectionError('No proxies available for carscraper().')

        refreshmin = 15
        if 'refreshmin' not in kwargs:
            print(f'No refreshmin found. Set to default value of {refreshmin}.')
        elif isinstance(kwargs['refreshmin'], (int, float)):
            refreshmin = kwargs['refreshmin']
        else:
            print(f'Expected int or float for refreshmin but got {type(kwargs["refreshmin"])}. Set to default value of {refreshmin}.')

        tstart = time.time() # set a start time to use for refreshing proxy list (if needed)

        # The pool is always created, so a failing proxy can be rotated even when the
        # caller supplied currproxy. If that proxy is known, advance the pool to it.
        proxy_pool = cycle(proxydict)
        currproxy = next(proxy_pool)
        wanted = kwargs.get('currproxy')
        if isinstance(wanted, str) and wanted in proxydict:
            while currproxy != wanted:
                currproxy = next(proxy_pool)

    def fetch(target):
        """Return the response for target, or None when every proxy attempt failed."""
        nonlocal proxy_pool, currproxy, tstart

        if not use_proxy:
            # don't use the proxy
            return requests.get(target, headers={'User-Agent': user_agent})

        attempts = len(proxydict) # limit the total number of attempts to one per proxy. This prevents an endless while loop
        while attempts:
            if (time.time() - tstart) > 60*refreshmin: # check if it's been more than refreshmin minutes since proxy_pool updated
                print('Refreshing proxy pool...')
                newproxies = generateProxies().difference(set(proxydict.keys()))
                tstart = time.time() # restart the clock so the pool is not regenerated on every request from now on

                if newproxies:
                    proxydict.update({i: random.choice(user_agents) for i in newproxies})
                    proxy_pool = cycle(proxydict)
                    currproxy = next(proxy_pool)
                    print('Proxy pool updated!')

            try:
                response = requests.get(target, proxies={"http": currproxy, "https": currproxy}, headers={'User-Agent': proxydict[currproxy]}, timeout=20)
            except Exception: # not a bare except, so Ctrl-C still interrupts the scrape
                prevproxy = currproxy
                currproxy = next(proxy_pool)
                print(f'Proxy error for {prevproxy}! Next up is {currproxy}')
                attempts -= 1
                print(f'Attempts remaining: {attempts}')
            else:
                print(f'Proxy success for {currproxy}')
                print()
                return response
        return None

    resp = fetch(url)
    if resp is None:
        raise ConnectionError(f'All proxies failed for {url}.')

    # No parser is named here, so bs4 chooses among the parsers that are installed
    pgsoup = BeautifulSoup(resp.content)

    # Check if there are additional pages of results
    if pgsoup.find("a", {"title" : "Go forward 1 page"}):
        moreresults = 1
    else:
        moreresults = 0

    links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
    tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps

    # Loop through links and scrape data for each new listing
    car_frames = [] # one single-row DataFrame per listing, combined once after the loop
    with progressbar.ProgressBar(max_value=len(links)) as bar:
        for idx, link in enumerate(links):

            # Strip the "?ad_cid=[number]" from the end of the link; it's not needed to load the page properly
            href = link['href']
            suffix = re.search(r'(\?ad_cid=.+)', href)
            currlink = href[:suffix.start()] if suffix else href

            # Generate full link for the current listing
            fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

            resp = fetch(fulllink)
            if resp is None:
                # Skip rather than re-parse the previous response and record wrong data
                print(f'All proxies failed for {fulllink}. Skipping...')
                bar.update(idx)
                continue

            lstsoup = BeautifulSoup(resp.content)

            # Check if link is still good (i.e. listing is still active)
            if lstsoup.title.text.strip().lower() == 'not found':
                print('Bad link. Skipping...')
                bar.update(idx)
                continue

            # Get timestamp
            tstamp = int(re.search(r'(\d+)', tstamps[idx].text).group(0))

            # Check if timestamp is newer than maxts
            if tstamp <= maxts:
                print('************ Found end of new data ************')
                moreresults = 0
                break

            # All fields start as None for every listing
            car = dict.fromkeys(_LISTING_FIELDS)

            # Get listing price
            car['price'] = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

            # Get seller's location
            location_links = lstsoup.select('h2.location > a')
            if location_links:
                city, state = location_links[0].text.strip().split(',')
                car['city'] = city.strip()
                car['state'] = state.strip()

            # Get seller type (dealer or owner)
            sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
            if 'Dealer' in sellerstr:
                car['seller'] = 'Dealer'
            elif 'Owner' in sellerstr:
                car['seller'] = 'Owner'

            # Get number of photos
            n_pics = _count_listing_photos(lstsoup)

            # Get table of car specs
            specs = lstsoup.select('ul.listing-specifications')

            for li in specs[0].find_all('li'):
                lititle = li.select('span.title')[0].text.strip().strip(':')
                livalue = li.select('span.value')[0].text.strip().strip(':')

                if livalue.lower() == 'not specified':
                    livalue = None

                spec = lititle.lower()
                if spec in _SPEC_COLUMNS:
                    field, convert = _SPEC_COLUMNS[spec]
                    car[field] = convert(livalue) if (convert and livalue) else livalue
                elif spec == 'liters':
                    car['liters'] = _parse_liters(livalue)
                    if livalue and car['liters'] is None:
                        # A value was given but could not be read; show where it came from
                        print(url)
                        print('****')
                        print(link)
                elif spec in _IGNORED_SPECS:
                    pass # Don't want to save these
                else:
                    print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

            record = {"timestamp": tstamp, "lastpull_ts": int(time.time()), "link": fulllink}
            record.update(car)
            record["n_pics"] = n_pics
            car_frames.append(pd.DataFrame({column: [value] for column, value in record.items()}))

            bar.update(idx)

    if car_frames: # make sure that some data was actually scraped
        all_cars = pd.concat(car_frames, ignore_index=True).fillna(value=np.nan)
    else:
        all_cars = []

    if use_proxy:
        return all_cars, moreresults, currproxy, proxydict
    else:
        return all_cars, moreresults

In [None]:
# Regenerate repository from scratch

# determine whether or not to use proxy IPs. Can use boolean or int for this
use_proxy = False

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# Also note that this url does NOT have a page number associated with it. This is added in the while loop below
lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

# Give up after this many pages in a row fail, so an outage cannot keep the loop running forever
max_failures_in_a_row = 3

count = 0
page_frames = [] # one DataFrame per scraped page, combined once after the loop
moreresults = 1
failures_in_a_row = 0
proxy_state = {} # currproxy / proxydict handed from one page to the next; empty until the first proxy call returns
while moreresults:
    url = lurl + str(count)
    curr_cars = [] # reset so a failed page can never re-append the previous page's listings
    try:
        if use_proxy:
            try:
                curr_cars, moreresults, currproxy, proxydict = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=use_proxy, refreshmin=15, **proxy_state)
            except Exception as err:
                # Retry once without the stored proxy state so carscraper() builds a fresh pool
                print(f'Scrape of {url} failed ({err!r}). Retrying with a fresh proxy pool...')
                curr_cars, moreresults, currproxy, proxydict = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=use_proxy, refreshmin=15)
            proxy_state = {'currproxy': currproxy, 'proxydict': proxydict}
        else:
            curr_cars, moreresults = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=False)
        failures_in_a_row = 0
    except Exception as err:
        failures_in_a_row += 1
        print(f'Failed to scrape {url}: {err!r}')
        if failures_in_a_row >= max_failures_in_a_row:
            print('Too many consecutive failures. Stopping.')
            break

    count += 1
    print(f'More results? {moreresults}')
    print(f'Next page is {lurl + str(count)}')
    if isinstance(curr_cars, pd.DataFrame): # make sure real data was returned
        page_frames.append(curr_cars)
    else:
        print('No car data found!')

# all_cars stays an empty list when nothing at all was scraped
all_cars = pd.concat(page_frames, ignore_index=True) if page_frames else []
all_cars

In [None]:
# Save dataframe to csv

if isinstance(all_cars, pd.DataFrame):
    os.makedirs('data', exist_ok=True) # to_csv does not create a missing output folder
    all_cars.to_csv('data/all_cars.csv', index=False)
else:
    # all_cars is not a DataFrame when no listings were scraped; leave any existing csv untouched
    print('No car data found! Nothing was saved.')