## To Do:
* Carve up carscraper() function into subfunctions where possible:
    * Checking for validity of \*\*kwargs could be a separate function
    * Checking for lititle and livalue type could be a separate function
* Get page views (after certain time period?)
* Get number of times listing has been favorited?
* Keep a database or dictionary of IP:user-agent combos to avoid using different user-agents for the same IP?
* Figure out how to use API for getting user-agents (to avoid IP ban from user-agent website)

In [1]:
from bs4 import BeautifulSoup # if this isn't installed, use pip install beautifulsoup4
import requests
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time
import progressbar # if this isn't installed, use pip install progressbar2
import random
from selenium import webdriver # if not installed, do pip install selenium
from itertools import cycle
# import json

Make sure chromedriver.exe has been added to the `PATH` before running the generateProxies() function. You can do so using [this guide](https://zwbetz.com/download-chromedriver-binary-and-add-to-your-path-for-automated-functional-testing/). The driver itself is located in the repository at `~/automodeals/selenium/chromedriver.exe`

**Importantly**, make sure that the chromedriver version used (e.g. 80) is the same as the full Chrome version you have installed (e.g. 80).

Chromedriver can be downloaded [here](https://sites.google.com/a/chromium.org/chromedriver/downloads)

Chrome version can be found [here](https://www.whatismybrowser.com/detect/what-version-of-chrome-do-i-have)

In [2]:
def generateProxies():
    # Get list of US-based proxy IPs and ports using selenium

    IPurl = "https://www.us-proxy.org/" # <-- the robots.txt file for this site allows full access for all user-agents

    # Specify incognito options for Chrome
    option = webdriver.ChromeOptions()
    option.add_argument("--incognito")

    # Create new Chrome instance
    browser = webdriver.Chrome(options=option)

    # Minimize window
    browser.minimize_window()

    # Go to desired website
    IPurl = "https://www.us-proxy.org/" # <-- the robots.txt file for this site allows full access for all user-agents
    browser.get(IPurl)

    # Filter by https only
    https_button = browser.find_elements_by_xpath("//*[@id='proxylisttable']/tfoot/tr/th[7]/select/option[3]")[0]
    https_button.click()

    # Set to 80 results
    maxnum_button = browser.find_elements_by_xpath("//*[@id='proxylisttable_length']/label/select/option[3]")[0]
    maxnum_button.click()

    # Grab IP's and Ports from the resulting table
    rows = browser.find_elements_by_xpath("//*[@id='proxylisttable']/tbody/tr")

    proxies = set() # using a set ensures there aren't duplicates
    for row in rows:
        row = row.text.split(' ')

        if row[3].strip().lower() != 'transparent': # don't want to include our real proxy when navigating KSL
            proxies.add(''.join(['http://', ':'.join([row[0].strip(), row[1].strip()])]))

    # Close browser when done
    browser.close()

    return proxies

In [4]:
# Make a function for the scraping done for each search page

# def carscraper(url, rooturl, maxts, **kwargs):
def carscraper(**kwargs):
    '''VARIABLE INPUTS:
    url: should be of the form "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
    rooturl: should be something like "https://cars.ksl.com"
    maxts: the maximum timestamp of the all_cars repository
    use_proxy: a boolean or binary to indicate if a proxy should be used
    curr_proxy: a string indicating the current proxy IP from last function call
    proxydict: a dictionary of proxy IPs and associated user-agents to cycle through
    refreshmin: the number of minutes to wait before updating the proxy pool
    
    ***NOTE: This function is meant to work with a pool of proxy IPs and a various spoofed user-agents'''
    
    # Need to spoof a user-agent in order to get past crawler block
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
    
    # the following were pulled manually on 3/12/20 from https://www.whatismybrowser.com/guides/the-latest-user-agent/
    user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.62',
                   'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko']    
    
    
    # Parse the kwargs
    
    
    if 'url' in kwargs.keys():
        if isinstance(kwargs['url'],str):
            url = kwargs['url']
        else:
            raise TypeError(f'Expected string for url but got {type(kwargs["url"])}.')
    else:
        raise ValueError('url is a required input for carscraper().')
        
    if 'rooturl' in kwargs.keys():
        if isinstance(kwargs['rooturl'],str):
            rooturl = kwargs['rooturl']
        else:
            raise TypeError(f'Expected string for rooturl but got {type(kwargs["rooturl"])}.')
    else:
        raise ValueError('rooturl is a required input for carscraper().')
        
    if 'maxts' in kwargs.keys():
        if isinstance(kwargs['maxts'],np.int64) or isinstance(kwargs['maxts'],int):
            maxts = kwargs['maxts']
        else:
            raise TypeError(f'Expected np.int64 or int for maxts but got {type(kwargs["maxts"])}.')
    else:
        raise ValueError('maxts is a required input for carscraper().')
        
    if 'use_proxy' in kwargs.keys():
        if isinstance(kwargs['use_proxy'],int) or isinstance(kwargs['use_proxy'],bool):
            use_proxy = kwargs['use_proxy']
        else:
            raise TypeError(f'Expected int or bool for use_proxy but got {type(kwargs["use_proxy"])}.')
    else:
        # default is to NOT use proxy
        use_proxy = False
        
    if use_proxy:
        # The following inputs are only useful when using a proxy
        
        if 'proxydict' in kwargs.keys():
            if isinstance(kwargs['proxydict'],dict):
                proxydict = kwargs['proxydict']
            else:
                print(f'Expected dict type for proxydict but got {type(kwargs["proxydict"])}. Generating new proxydict...')
                newproxies = generateProxies()
                proxydict = {i:random.choice(user_agents) for i in newproxies}
        else:
            print('No proxydict found. Generating...')
            newproxies = generateProxies()
            proxydict = {i:random.choice(user_agents) for i in newproxies}

        if 'refreshmin' in kwargs.keys():
            if isinstance(kwargs['refreshmin'],int) or isinstance(kwargs['refreshmin'],float):
                refreshmin = kwargs['refreshmin']
            else:
                refreshmin = 15
                print(f'Expected int or float for refreshmin but got {type(kwargs["refreshmin"])}. Set to default value of {refreshmin}.')
        else:
            refreshmin = 15
            print(f'No refreshmin found. Set to default value of {refreshmin}.')
        
    
    
    if use_proxy:
        tstart = time.time() # set a start time to use for refreshing proxy list (if needed)    

        if 'currproxy' in kwargs.keys():
            if isinstance(kwargs['currproxy'],str):
                currproxy = kwargs['currproxy']
            else:
                proxy_pool = cycle(proxydict) # make a pool of proxies 
                currproxy = next(proxy_pool) # grab the next proxy in cycle
        else:
            proxy_pool = cycle(proxydict) # make a pool of proxies 
            currproxy = next(proxy_pool) # grab the next proxy in cycle                


        attempts = len(proxydict) # for now, limit the total number of attempts to one per proxy. This will prevent endless while loop
        chkproxy = 1
        while chkproxy and attempts:
            if (time.time() - tstart) > 60*refreshmin: # check if it's been more than refreshmin minutes since proxy_pool updated
                print('Refreshing proxy pool...')

                currproxies = set(proxydict.keys())
                newproxies = generateProxies()
                newproxies = newproxies.difference(currproxies)

                if newproxies:
                    newdict = {i:random.choice(user_agents) for i in newproxies}
                    proxydict.update(newdict)
                    proxy_pool = cycle(proxydict)
                    currproxy = next(proxy_pool)
                    print('Proxy pool updated!')

            try:
                resp = requests.get(url,proxies={"http":currproxy, "https":currproxy},headers={'User-Agent': proxydict[currproxy]}, timeout=20)
                print(f'Proxy success for {currproxy}')
                print()
                chkproxy = 0
                attempts += 1
            except:
                prevproxy = currproxy
                currproxy = next(proxy_pool)
                print(f'Proxy error for {prevproxy}! Next up is {currproxy}')
                attempts -= 1
                print(f'Attempts remaining: {attempts}')
                
    else:
        # don't use the proxy
        resp = requests.get(url, headers = {'User-Agent': user_agent})
        
    html = resp.content
    pgsoup = BeautifulSoup(html)
    
    # Check if there are additional pages of results
    if pgsoup.find("a", {"title" : "Go forward 1 page"}):
        moreresults = 1
    else:
        moreresults = 0
    
    links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
    tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps

    # Loop through links and scrape data for each new listing
    all_cars = []
    with progressbar.ProgressBar(max_value=len(links)) as bar:
        for idx, link in enumerate(links): # *** only load first x results for now to avoid ban before implementing spoofing

            # Reset all fields to None before next loop
            price=year=make=model=body=mileage=title_type=city=state=seller=None
            trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=n_doors=ext_condition=int_condition=drive_type=None

            # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
            # Regular expressions should come in handy here

            cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
            currlink = link['href'][:cutidx]

            # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
            # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

            # Generate full link for the current listing
            fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

            if use_proxy:
                attempts = len(proxydict) # for now, limit the total number of attempts to one per proxy. This will prevent endless while loop
                chkproxy = 1
                while chkproxy and attempts:
                    if (time.time() - tstart) > 60*refreshmin: # check if it's been more than refreshmin minutes since proxy_pool updated
                        print('Refreshing proxy pool...')

                        currproxies = set(proxydict.keys())
                        newproxies = generateProxies()
                        newproxies = newproxies.difference(currproxies)

                        if newproxies:
                            newdict = {i:random.choice(user_agents) for i in newproxies}
                            proxydict.update(newdict)
                            proxy_pool = cycle(proxydict)
                            currproxy = next(proxy_pool)
                            print('Proxy pool updated!')

                    try:
                        resp = requests.get(fulllink,proxies={"http":currproxy, "https":currproxy},headers={'User-Agent': proxydict[currproxy]}, timeout=20)
                        print(f'Proxy success for {currproxy}')
                        print()
                        chkproxy = 0
                        attempts += 1
                    except:
                        prevproxy = currproxy
                        currproxy = next(proxy_pool)
                        print(f'Proxy error for {prevproxy}! Next up is {currproxy}')
                        attempts -= 1
                        print(f'Attempts remaining: {attempts}')
                        
            else:
                # don't use the proxy
                resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
            
            
            lsthtml = resp.content
            lstsoup = BeautifulSoup(lsthtml)
            
            # Check if link is still good (i.e. listing is still active)
            if lstsoup.title.text.strip().lower() == 'not found':
                print('Bad link. Skipping...')
                bar.update(idx)
            else:

                # Get timestamp
                tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

                # Check if timestamp is newer than maxts
                if tstamp <= maxts:
                    print('************ Found end of new data ************')
#                     print(f'var type of all_cars is: {type(all_cars)}')
                    moreresults = 0
                    break
#                 else:
#                     print(f'New car found: {idx} in link {fulllink}')

                # Get listing price
                price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

                # Get seller's location
                if lstsoup.select('h2.location > a'):
                    location = lstsoup.select('h2.location > a')[0].text.strip()
                    city, state = location.split(',')
                    city = city.strip()
                    state = state.strip()

                # Get seller type (dealer or owner)
                sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
                if re.search('(Dealer)', sellerstr):
                    seller = 'Dealer'
                elif re.search('(Owner)', sellerstr):
                    seller = 'Owner'
                    
                # Get number of photos
                if lstsoup.select('div.slider-uninitialized > p'):
                    picstr = lstsoup.select('div.slider-uninitialized > p')[0].text.strip()
                    n_pics = int(re.search('(\d+)',picstr).group())
                else:
                    if lstsoup.find(id='widgetPhoto').p:
                        picstr = lstsoup.find(id='widgetPhoto').p.text.strip()
                        n_pics = int(re.search('(\d+)',picstr).group())
                    else:
                        n_pics = 0

                # Get table of car specs
                specs = lstsoup.select('ul.listing-specifications')

                for li in specs[0].find_all('li'):
                    lititle = li.select('span.title')[0].text.strip().strip(':')
                    livalue = li.select('span.value')[0].text.strip().strip(':')

                    if livalue.lower() == 'not specified':
                        livalue = None

                    # Now a bunch of if-else statements to determine which column to add data to
                    # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
                    if lititle.lower() == 'year':
                        if livalue:
                            year = int(livalue)
                        else:
                            year = livalue
                    elif lititle.lower() == 'make':
                        make = livalue
                    elif lititle.lower() == 'model':
                        model = livalue
                    elif lititle.lower() == 'body':
                        body = livalue
                    elif lititle.lower() == 'mileage':
                        if livalue:
                            mileage = int(livalue.replace(',',''))
                        else:
                            mileage = livalue
                    elif lititle.lower() == 'title type':
                        title_type = livalue

                    # Below this are non-required specs    
                    elif lititle.lower() == 'trim':
                        trim = livalue
                    elif lititle.lower() == 'exterior color':
                        if livalue:
                            ext_color = livalue.lower()
                        else:
                            ext_color = livalue
                    elif lititle.lower() == 'interior color':
                        if livalue:
                            int_color = livalue.lower()
                        else:
                            int_color = livalue
                    elif lititle.lower() == 'transmission':
                        transmission = livalue
                    elif lititle.lower() == 'liters':
                        try:
                            liters = float(livalue)
                        except:
                            if livalue:
                                str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
                                if re.search('^(\D+)',str1):
                                    idxend = re.search('^(\D+)',str1).end()
                                    livalue = str1[idxend:-1]
                                    if re.search('(\D+)',livalue): # check if still other pollutants
                                        idxend = re.search('(\D+)',livalue).end()
                                        livalue = livalue[idxend:]
                                else:
                                    livalue = str1[:-1]
                                try:
                                    livalue = float(livalue)
                                except:
                                    print(url)
                                    print('****')
                                    print(link)
                            else:
                                liters = livalue
                    elif lititle.lower() == 'cylinders':
                        if livalue:
                            cylinders = int(livalue)
                        else:
                            cylinders = livalue
                    elif lititle.lower() == 'fuel type':
                        fuel_type = livalue
                    elif lititle.lower() == 'number of doors':
                        if livalue:
                            n_doors = int(livalue)
                        else:
                            n_doors = livalue
                    elif lititle.lower() == 'exterior condition':
                        ext_condition = livalue
                    elif lititle.lower() == 'interior condition':
                        int_condition = livalue
                    elif lititle.lower() == 'drive type':
                        drive_type = livalue
                    elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
                        None # Don't want to save these
                    else:
                        None
                        print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

                curr_car = pd.DataFrame({"timestamp":[tstamp],
                                         "price":[price],
                                         "year":[year],
                                         "make":[make],
                                         "model":[model],
                                         "body":[body],
                                         "mileage":[mileage],
                                         "title_type":[title_type],
                                         "city":[city],
                                         "state":[state],
                                         "seller":[seller],
                                         "trim":[trim],
                                         "ext_color":[ext_color],
                                         "int_color":[int_color],
                                         "transmission":[transmission],
                                         "liters":[liters],
                                         "cylinders":[cylinders],
                                         "fuel_type":[fuel_type],
                                         "n_doors":[n_doors],
                                         "ext_condition":[ext_condition],
                                         "int_condition":[int_condition],
                                         "drive_type":[drive_type],
                                         "n_pics":[n_pics]})
                try:
                    all_cars = pd.concat([curr_car, all_cars])
                except:
                    all_cars = curr_car

                bar.update(idx)

    if type(all_cars) is pd.core.frame.DataFrame: # make sure that some data was actually scraped
        all_cars = all_cars.reset_index()
        del all_cars['index']
        all_cars.fillna(value=pd.np.nan, inplace=True)
    if use_proxy:
        return all_cars, moreresults, currproxy, proxydict
    else:
        return all_cars, moreresults

In [4]:
# Try a multi-page test using carscraper function
import sys

# determine whether or not to use proxy IPs. Can use boolean or int for this
use_proxy = False

# set cap for number of search pages to load (i.e. pages with up to 96 listings)
maxpg = 2

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# Also note that this url does NOT have a page number associated with it. This is added in the while loop below
lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

count = 0
all_cars = []
while count < maxpg:
    url = lurl + str(count)
    try:
        if use_proxy:
            curr_cars, moreresults, currproxy, proxydict = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=use_proxy, currproxy=currproxy, refreshmin = 15, proxydict = proxydict)
        else:
            curr_cars, moreresults = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=False)
        
    except:
        if use_proxy:
            curr_cars, moreresults, currproxy, proxydict = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=use_proxy, refreshmin = 15)
        else:
            print('Unexpected error:', sys.exc_info())
            pass
        
    
    count += 1    
#     print(f'More results? {moreresults}')
    if type(curr_cars) is pd.core.frame.DataFrame: # make sure real data was returned
        try:
            all_cars = pd.concat([curr_cars, all_cars], ignore_index=True)
        except:
            all_cars = curr_cars
    else:
        print('No car data found!')
    
all_cars

100% (96 of 96) |########################| Elapsed Time: 0:01:46 Time:  0:01:46
100% (96 of 96) |########################| Elapsed Time: 0:02:01 Time:  0:02:01


Unnamed: 0,timestamp,price,year,make,model,body,mileage,title_type,city,state,...,int_color,transmission,liters,cylinders,fuel_type,n_doors,ext_condition,int_condition,drive_type,n_pics
0,1584384008,21990,2019,Nissan,Frontier,Truck,17667,Clean Title,Tooele,UT,...,gray,Automatic,4.0,6.0,Gasoline,4.0,Excellent,Excellent,4-Wheel Drive,23
1,1584384026,11790,2018,Hyundai,Elantra,Sedan,37506,Clean Title,Tooele,UT,...,gray,Automatic,2.0,4.0,Gasoline,4.0,Excellent,Excellent,FWD,24
2,1584384045,11990,2014,Volkswagen,Jetta,Sedan,36429,Clean Title,Tooele,UT,...,black,Automatic,,4.0,Diesel,4.0,Excellent,Excellent,FWD,22
3,1584384058,19500,2016,Chevrolet,Express Cargo Van,Van,68000,Clean Title,Bountiful,UT,...,gray,,,8.0,Flex Fuel,,Excellent,Excellent,RWD,33
4,1584384059,14999,2011,Ford,F-150,Truck,103293,Clean Title,American Fork,UT,...,black cloth interior,Automatic,,6.0,Gasoline,,,,4-Wheel Drive,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,1584388175,32995,2016,Ford,F-150,Truck,102306,Clean Title,Idaho Falls,ID,...,black,Automatic,,6.0,Gasoline,,,,4-Wheel Drive,28
188,1584388214,3150,2005,Pontiac,Grand Prix,Sedan,132807,,Taylorsville,UT,...,black,Automatic,,6.0,Gasoline,4.0,Good,Good,FWD,7
189,1584388230,21995,2015,BMW,3 Series,Hatchback,22637,Clean Title,American Fork,UT,...,black,Automatic,,4.0,Gasoline,4.0,,,AWD,30
190,1584388396,12499,2013,Ford,Edge,Sport Utility,72818,,Salt Lake City,UT,...,charcoal black,Automatic,,4.0,Gasoline,4.0,,,FWD,19


In [None]:
# Save dataframe to csv

all_cars.to_csv('data/all_cars.csv', index=False)

In [None]:
# Load dataframe
all_cars = pd.read_csv('data/all_cars.csv')

# get most recent timestamp from the dataframe
rep_ts = all_cars['timestamp'].max()

In [None]:
# Now scrape for more cars and check for timestamp

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

count = 0
newer_cars = []
moreresults = 1
while moreresults:
    url = lurl + str(count)
    
    try:
        if use_proxy:
            curr_cars, moreresults, currproxy, proxydict = carscraper(url=url, rooturl=rooturl, maxts=rep_ts, use_proxy=use_proxy, currproxy=currproxy, refreshmin = 15, proxydict = proxydict)
        else:
            curr_cars, moreresults = carscraper(url=url, rooturl=rooturl, maxts=rep_ts, use_proxy=use_proxy)
    except:
        if use_proxy:
            curr_cars, moreresults, currproxy, proxydict = carscraper(url=url, rooturl=rooturl, maxts=0, use_proxy=use_proxy, refreshmin = 15)
        else:
            pass
    
    count += 1    
#     print(f'More results? {moreresults}')
    if type(curr_cars) is pd.core.frame.DataFrame: # make sure real data was returned
        try:
            newer_cars = pd.concat([curr_cars, newer_cars], ignore_index=True)
        except:
            newer_cars = curr_cars
    else:
        print('No newer car data found!')
    
# add newer_cars
if type(newer_cars) is pd.core.frame.DataFrame:
    all_cars_UPDATED = pd.concat([newer_cars, all_cars], ignore_index=True)

In [None]:
# Save updated dataframe to csv

all_cars_UPDATED.to_csv('data/all_cars_UPDATED.csv', index=False) # in later iterations, remove the "_UPDATED" part of filename

In [None]:
# Load updated dataframe
all_cars_UPDATED = pd.read_csv('data/all_cars_UPDATED.csv') # in later iterations, remove the "_UPDATED" part of filename
all_cars_UPDATED

### Sandbox code before implementing in main loop:

In [47]:
# Michael - working on updating favorites and views
cars_csv = pd.read_csv('data/all_cars.csv')

# temporarily make up columns for csv
cars_csv['lastpull'] = cars_csv.timestamp.max()+100
cars_csv['views'] = np.random.randint(1000,size=len(cars_csv.index))
cars_csv['favorites'] = np.random.randint(100,size=len(cars_csv.index))
cars_csv['url'] = 'https://cars.ksl.com/listing/6301654'

# find ads more than x days old (time.time() is in seconds)
curr_time = time.time()
min_days = 0.1
min_dt = min_days*60*60*24 # time in seconds for use with datetime
old_ads = cars_csv['timestamp'] < (curr_time - min_dt)

# find ads that haven't been pulled for more than x days
min_last_pull = 0.07
min_last_pull_dt = min_last_pull*60*60*24 # time in seconds for use with datetime
no_recent_update = cars_csv['lastpull'] < (curr_time - min_last_pull_dt)

cars_need_update = cars_csv[old_ads & no_recent_update]


cars_need_update

# iterate through pulling new information from each ad
last_pull = []
views = []
favorites = []
for _, ad in cars_need_update.iterrows():
    ad_response = requests.get(ad['url'])
#     print(ad)
resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
            
            
            lsthtml = resp.content
            lstsoup = BeautifulSoup(lsthtml)
            
            # Check if link is still good (i.e. listing is still active)
            if lstsoup.title.text.strip().lower() == 'not found':
                print('Bad link. Skipping...')
                bar.update(idx)
            else:

                # Get timestamp
                tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

                # Check if timestamp is newer than maxts
                if tstamp <= maxts:
                    print('************ Found end of new data ************')
#                     print(f'var type of all_cars is: {type(all_cars)}')
                    moreresults = 0
                    break
#                 else:
#                     print(f'New car found: {idx} in link {fulllink}')

                # Get listing price
                price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

                # Get seller's location
                if lstsoup.select('h2.location > a'):
                    location = lstsoup.select('h2.location > a')[0].text.strip()
                    city, state = location.split(',')
                    city = city.strip()
                    state = state.strip()

                # Get seller type (dealer or owner)
                sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
                if re.search('(Dealer)', sellerstr):
                    seller = 'Dealer'
                elif re.search('(Owner)', sellerstr):
                    seller = 'Owner'
                    
                # Get number of photos
                if lstsoup.select('div.slider-uninitialized > p'):
                    picstr = lstsoup.select('div.slider-uninitialized > p')[0].text.strip()
                    n_pics = int(re.search('(\d+)',picstr).group())
                else:
                    if lstsoup.find(id='widgetPhoto').p:
                        picstr = lstsoup.find(id='widgetPhoto').p.text.strip()
                        n_pics = int(re.search('(\d+)',picstr).group())
                    else:
                        n_pics = 0

                # Get table of car specs
                specs = lstsoup.select('ul.listing-specifications')

                for li in specs[0].find_all('li'):
                    lititle = li.select('span.title')[0].text.strip().strip(':')
                    livalue = li.select('span.value')[0].text.strip().strip(':')

                    if livalue.lower() == 'not specified':
                        livalue = None

                    # Now a bunch of if-else statements to determine which column to add data to
                    # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
                    if lititle.lower() == 'year':
                        if livalue:
                            year = int(livalue)
                        else:
                            year = livalue
                    elif lititle.lower() == 'make':
                        make = livalue
                    elif lititle.lower() == 'model':
                        model = livalue
                    elif lititle.lower() == 'body':
                        body = livalue
                    elif lititle.lower() == 'mileage':
                        if livalue:
                            mileage = int(livalue.replace(',',''))
                        else:
                            mileage = livalue
                    elif lititle.lower() == 'title type':
                        title_type = livalue

                    # Below this are non-required specs    
                    elif lititle.lower() == 'trim':
                        trim = livalue
                    elif lititle.lower() == 'exterior color':
                        if livalue:
                            ext_color = livalue.lower()
                        else:
                            ext_color = livalue
                    elif lititle.lower() == 'interior color':
                        if livalue:
                            int_color = livalue.lower()
                        else:
                            int_color = livalue
                    elif lititle.lower() == 'transmission':
                        transmission = livalue
                    elif lititle.lower() == 'liters':
                        try:
                            liters = float(livalue)
                        except:
                            if livalue:
                                str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
                                if re.search('^(\D+)',str1):
                                    idxend = re.search('^(\D+)',str1).end()
                                    livalue = str1[idxend:-1]
                                    if re.search('(\D+)',livalue): # check if still other pollutants
                                        idxend = re.search('(\D+)',livalue).end()
                                        livalue = livalue[idxend:]
                                else:
                                    livalue = str1[:-1]
                                try:
                                    livalue = float(livalue)
                                except:
                                    print(url)
                                    print('****')
                                    print(link)
                            else:
                                liters = livalue
                    elif lititle.lower() == 'cylinders':
                        if livalue:
                            cylinders = int(livalue)
                        else:
                            cylinders = livalue
                    elif lititle.lower() == 'fuel type':
                        fuel_type = livalue
                    elif lititle.lower() == 'number of doors':
                        if livalue:
                            n_doors = int(livalue)
                        else:
                            n_doors = livalue
                    elif lititle.lower() == 'exterior condition':
                        ext_condition = livalue
                    elif lititle.lower() == 'interior condition':
                        int_condition = livalue
                    elif lititle.lower() == 'drive type':
                        drive_type = livalue
                    elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
                        None # Don't want to save these
                    else:
                        None
                        print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

                curr_car = pd.DataFrame({"timestamp":[tstamp],
                                         "price":[price],
                                         "year":[year],
                                         "make":[make],
                                         "model":[model],
                                         "body":[body],
                                         "mileage":[mileage],
                                         "title_type":[title_type],
                                         "city":[city],
                                         "state":[state],
                                         "seller":[seller],
                                         "trim":[trim],
                                         "ext_color":[ext_color],
                                         "int_color":[int_color],
                                         "transmission":[transmission],
                                         "liters":[liters],
                                         "cylinders":[cylinders],
                                         "fuel_type":[fuel_type],
                                         "n_doors":[n_doors],
                                         "ext_condition":[ext_condition],
                                         "int_condition":[int_condition],
                                         "drive_type":[drive_type],
                                         "n_pics":[n_pics]})

timestamp                                  1584381281
price                                            9990
year                                             2013
make                                           Nissan
model                                            Juke
body                                        Hatchback
mileage                                         58000
title_type                                Clean Title
city                                   Salt Lake City
state                                              UT
seller                                         Dealer
trim                                               SV
ext_color                                        gray
int_color                                       black
transmission                               Automanual
liters                                            NaN
cylinders                                           4
fuel_type                                    Gasoline
n_doors                     

In [None]:
def update_listing(cars_csv):
    

In [None]:
# Try a multi-page test using carscraper function

# set cap for number of search pages to load (i.e. pages with up to 96 listings)
maxpg = 2

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# Also note that this url does NOT have a page number associated with it. This is added in the while loop below
lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

count = 0
all_cars = []
while count < maxpg:
    url = lurl + str(count)
#     curr_cars, moreresults = carscraper(url, rooturl, 0)
    
#     curr_cars, moreresults = carscraperproxy(url, rooturl, 0)
    try:
        curr_cars, moreresults, proxydict = carscraperproxy(url, rooturl, 0, refreshmin = 15, proxydict = proxydict)
    except:
        curr_cars, moreresults, proxydict = carscraperproxy(url, rooturl, 0, refreshmin = 15)
    
    count += 1    
#     print(f'More results? {moreresults}')
    if type(curr_cars) is pd.core.frame.DataFrame: # make sure real data was returned
        try:
            all_cars = pd.concat([curr_cars, all_cars], ignore_index=True)
        except:
            all_cars = curr_cars
    else:
        print('No car data found!')
    
all_cars

In [None]:
# Dummy code to figure out how to find which proxies are new when refreshing list
set1 = set(['a','b','c','d','e'])
set2 = set(['b','c','e','f','h'])
# set2 = set(['a','b','c','d','e'])
newset = set2.difference(set1) # return elements in set2 that aren't in set1
if newset:
    print(newset)
else:
    print('No new elements in set 2')

In [None]:
# Finding a way to try out a new proxy without moving on to the next href until success

url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
# url = "https://httpbin.org/ip"

# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# the following were pulled manually on 3/12/20 from https://www.whatismybrowser.com/guides/the-latest-user-agent/
user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
               'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/74.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/74.0',
               'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.62',
               'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko']

if not proxies:
    proxies = generateProxies()

# make a dictionary of proxies and user-agents using dictionary comprehension
proxydict = {i:random.choice(user_agents) for i in proxies}

proxy_pool = cycle(proxydict)
currproxy = next(proxy_pool)

attempts = len(proxies) # for now, limit the total number of attempts to one per proxy. This will prevent endless while loop
for i in range(5):
    chkproxy = 1
    while chkproxy and attempts:
        try:
            resp = requests.get(url,proxies={"http":currproxy, "https":currproxy},headers={'User-Agent': proxydict[currproxy]}, timeout=20)
            html = resp.content
            print()
            if url == "https://httpbin.org/ip":
                print(html)
            else:
                pgsoup = BeautifulSoup(html)
                links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
                print(f'Number of links found: {len(links)}')
            chkproxy = 0
        except:
            prevproxy = currproxy
            currproxy = next(proxy_pool)
            print(f'Proxy error for {prevproxy}! Next up is {currproxy}')
            attempts -= 1
            print(f'Attempts remaining: {attempts}')

In [None]:
set(proxydict.keys())

In [None]:
# Experiment with proxy and user-agent combos

url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
# url = "https://httpbin.org/ip"

# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

# the following were pulled manually on 3/12/20 from https://www.whatismybrowser.com/guides/the-latest-user-agent/
user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
               'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/74.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/74.0',
               'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.62',
               'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko']

proxies = generateProxies()

# make a dictionary of proxies and user-agents using dictionary comprehension
proxydict = {i:random.choice(user_agents) for i in proxies}

proxy_pool = cycle(proxydict)
for i in range(30):
    currproxy = next(proxy_pool)
    try:
        resp = requests.get(url,proxies={"http":currproxy, "https":currproxy},headers={'User-Agent': proxydict[currproxy]}, timeout=15)
        html = resp.content
        print()
        if url == "https://httpbin.org/ip":
            print(html)
        else:
            pgsoup = BeautifulSoup(html)
            links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
            print(f'Number of links found: {len(links)}')
        print(f'Success! proxy used: {currproxy}')
        print()
    except:
        print(f'Proxy error! proxy used: {currproxy}')

In [None]:
# Figuring out number of photos on listing page
# url = 'https://cars.ksl.com/listing/6269343' # 19 photos (seller)
# url = 'https://cars.ksl.com/listing/6101875' # 26 photos (dealer)
# url = 'https://cars.ksl.com/listing/6304284' # no photos (dealer)
url = 'https://cars.ksl.com/listing/6302069' # 1 photo (seller)

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

resp = requests.get(url, headers = {'User-Agent': user_agent})
lsthtml = resp.content
lstsoup = BeautifulSoup(lsthtml)
    
if lstsoup.select('div.slider-uninitialized > p'):
    picstr = lstsoup.select('div.slider-uninitialized > p')[0].text.strip()
    n_pics = int(re.search('(\d+)',picstr).group())
else:
    if lstsoup.find(id='widgetPhoto').p:
        picstr = lstsoup.find(id='widgetPhoto').p.text.strip()
        n_pics = int(re.search('(\d+)',picstr).group())
    else:
        n_pics = 0
        
print(f'Number of photos found: {n_pics}')
print(type(n_pics))

In [None]:
# Playing around with determining if car listing is still good (or if it's been removed)
testurl = "https://cars.ksl.com/listing/9999999"

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

resp = requests.get(testurl, headers = {'User-Agent': user_agent})
lsthtml = resp.content
lstsoup = BeautifulSoup(lsthtml)

if lstsoup.title.text.strip().lower() == 'not found':
    print('bad link')

In [None]:
# Playing around with checking time to see when to refresh proxy list
tstart = time.time()
time.sleep(5) # time in seconds
tend = time.time() - tstart
tend

In [None]:
# Playing around with timestamps for use when checking for new data

print(datetime.fromtimestamp(all_cars['timestamp'][0]).isoformat())
print(datetime.now())
currtime = time.time()
print(currtime)
print(datetime.fromtimestamp(currtime).isoformat())

### Below this is old code, some of which is no longer useful. Proceed with caution!

In [None]:
#######################################################################################
### DEPRECATED CODE: All functionality built into new optional proxy-based function ###
#######################################################################################

# Make a function for the scraping done for each search page

def carscraper(url, rooturl, maxts):
    '''INPUTS:
    url should be of the form "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
    rooturl should be something like "https://cars.ksl.com"
    maxts is the maximum timestamp of the all_cars repository
    
    ***NOTE: This function is meant to work with original IP address (as opposed to proxy) and a single spoofed user-agent'''
    
    # Need to spoof a user-agent in order to get past crawler block
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

    resp = requests.get(url, headers = {'User-Agent': user_agent})
    html = resp.content
    pgsoup = BeautifulSoup(html)
    
    # Check if there are additional pages of results
    if pgsoup.find("a", {"title" : "Go forward 1 page"}):
        moreresults = 1
    else:
        moreresults = 0
    
    links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
    tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps

    # Loop through links and scrape data for each new listing
    all_cars = []
    with progressbar.ProgressBar(max_value=len(links)) as bar:
        for idx, link in enumerate(links): # *** only load first x results for now to avoid ban before implementing spoofing

            # Reset all fields to None before next loop
            price=year=make=model=body=mileage=title_type=city=state=seller=None
            trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=n_doors=ext_condition=int_condition=drive_type=None

            # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
            # Regular expressions should come in handy here

            cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
            currlink = link['href'][:cutidx]

            # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
            # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

            # Open listing link and pull html from it
            fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

            resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
            lsthtml = resp.content
            lstsoup = BeautifulSoup(lsthtml)
            
            # Check if link is still good (i.e. listing is still active)
            if lstsoup.title.text.strip().lower() == 'not found':
                print('Bad link. Skipping...')
                bar.update(idx)
            else:

                # Get timestamp
                tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

                # Check if timestamp is newer than maxts
                if tstamp <= maxts:
                    print('************ Found end of new data ************')
#                     print(f'var type of all_cars is: {type(all_cars)}')
                    moreresults = 0
                    break
#                 else:
#                     print(f'New car found: {idx} in link {fulllink}')

                # Get listing price
                price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

                # Get seller's location
                if lstsoup.select('h2.location > a'):
                    location = lstsoup.select('h2.location > a')[0].text.strip()
                    city, state = location.split(',')
                    city = city.strip()
                    state = state.strip()

                # Get seller type (dealer or owner)
                sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
                if re.search('(Dealer)', sellerstr):
                    seller = 'Dealer'
                elif re.search('(Owner)', sellerstr):
                    seller = 'Owner'
                    
                # Get number of photos
                if lstsoup.select('div.slider-uninitialized > p'):
                    picstr = lstsoup.select('div.slider-uninitialized > p')[0].text.strip()
                    n_pics = int(re.search('(\d+)',picstr).group())
                else:
                    if lstsoup.find(id='widgetPhoto').p:
                        picstr = lstsoup.find(id='widgetPhoto').p.text.strip()
                        n_pics = int(re.search('(\d+)',picstr).group())
                    else:
                        n_pics = 0

                # Get table of car specs
                specs = lstsoup.select('ul.listing-specifications')

                for li in specs[0].find_all('li'):
                    lititle = li.select('span.title')[0].text.strip().strip(':')
                    livalue = li.select('span.value')[0].text.strip().strip(':')

                    if livalue.lower() == 'not specified':
                        livalue = None

                    # Now a bunch of if-else statements to determine which column to add data to
                    # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
                    if lititle.lower() == 'year':
                        if livalue:
                            year = int(livalue)
                        else:
                            year = livalue
                    elif lititle.lower() == 'make':
                        make = livalue
                    elif lititle.lower() == 'model':
                        model = livalue
                    elif lititle.lower() == 'body':
                        body = livalue
                    elif lititle.lower() == 'mileage':
                        if livalue:
                            mileage = int(livalue.replace(',',''))
                        else:
                            mileage = livalue
                    elif lititle.lower() == 'title type':
                        title_type = livalue

                    # Below this are non-required specs    
                    elif lititle.lower() == 'trim':
                        trim = livalue
                    elif lititle.lower() == 'exterior color':
                        if livalue:
                            ext_color = livalue.lower()
                        else:
                            ext_color = livalue
                    elif lititle.lower() == 'interior color':
                        if livalue:
                            int_color = livalue.lower()
                        else:
                            int_color = livalue
                    elif lititle.lower() == 'transmission':
                        transmission = livalue
                    elif lititle.lower() == 'liters':
                        try:
                            liters = float(livalue)
                        except:
                            if livalue:
                                str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
                                if re.search('^(\D+)',str1):
                                    idxend = re.search('^(\D+)',str1).end()
                                    livalue = str1[idxend:-1]
                                    if re.search('(\D+)',livalue): # check if still other pollutants
                                        idxend = re.search('(\D+)',livalue).end()
                                        livalue = livalue[idxend:]
                                else:
                                    livalue = str1[:-1]
                                try:
                                    livalue = float(livalue)
                                except:
                                    print(url)
                                    print('****')
                                    print(link)
                            else:
                                liters = livalue
                    elif lititle.lower() == 'cylinders':
                        if livalue:
                            cylinders = int(livalue)
                        else:
                            cylinders = livalue
                    elif lititle.lower() == 'fuel type':
                        fuel_type = livalue
                    elif lititle.lower() == 'number of doors':
                        if livalue:
                            n_doors = int(livalue)
                        else:
                            n_doors = livalue
                    elif lititle.lower() == 'exterior condition':
                        ext_condition = livalue
                    elif lititle.lower() == 'interior condition':
                        int_condition = livalue
                    elif lititle.lower() == 'drive type':
                        drive_type = livalue
                    elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
                        None # Don't want to save these
                    else:
                        None
                        print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

                curr_car = pd.DataFrame({"timestamp":[tstamp],
                                         "price":[price],
                                         "year":[year],
                                         "make":[make],
                                         "model":[model],
                                         "body":[body],
                                         "mileage":[mileage],
                                         "title_type":[title_type],
                                         "city":[city],
                                         "state":[state],
                                         "seller":[seller],
                                         "trim":[trim],
                                         "ext_color":[ext_color],
                                         "int_color":[int_color],
                                         "transmission":[transmission],
                                         "liters":[liters],
                                         "cylinders":[cylinders],
                                         "fuel_type":[fuel_type],
                                         "n_doors":[n_doors],
                                         "ext_condition":[ext_condition],
                                         "int_condition":[int_condition],
                                         "drive_type":[drive_type],
                                         "n_pics":[n_pics]})
                try:
                    all_cars = pd.concat([curr_car, all_cars])
                except:
                    all_cars = curr_car

                bar.update(idx)

    if type(all_cars) is pd.core.frame.DataFrame: # make sure that some data was actually scraped
        all_cars = all_cars.reset_index()
        del all_cars['index']
        all_cars.fillna(value=pd.np.nan, inplace=True)
    return all_cars, moreresults

In [None]:
# Experiment with using proxy IPs from above

# url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
url = "https://httpbin.org/ip"

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

for idx, proxy in enumerate(proxies):
#     user_agent = UAlist[idx]
    try:
        resp = requests.get(url,proxies={"http":proxy, "https":proxy},headers={'User-Agent': user_agent}, timeout=10)
        print(resp.content)
        print(f'Success! proxy used: {proxy}')
    except:
        print(f'Proxy error! proxy used: {proxy}')

In [None]:
####################################################
######### MIGHT GET BY WITH JUST PROXY IPs #########
####################################################

### Revisit this in the future if it becomes an issue
### For time being, just use a handful of user-agents

# Get list of user-agents

### Need to use API at some point rather than crawl/scrape since you can get 500 user-agents for free per month...and my IP got banned
### username: automodeals
### pw: kslclass123
### API documentation: https://developers.whatismybrowser.com/api/docs/v2/

API_key = '5ecab60888f7aebfbc4aad5850de52fa'

UAurl = "https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/"

resp = requests.get(UAurl)
UAhtml = resp.content
UAsoup = BeautifulSoup(UAhtml)

UAlist = []
matches = UAsoup.select("table.table-useragents td.useragent")
for match in matches[:len(proxies)]: # only get as many user-agents are there are proxies. Dangerous to use more than one user-agent per IP
    UAlist.append(match.find('a').text.strip())
random.shuffle(UAlist)
UAlist

In [None]:
# Use user-agents API (example from https://github.com/whatismybrowser/api-v2-sample-code/blob/master/sample-code/python-3.6/user_agent_parse.py)

API_key = '5ecab60888f7aebfbc4aad5850de52fa'

headers = {'X-API-KEY': API_key}
# UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_database_dump_url"
# UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_database_search"

# The code below works for POSTing data, but we want to GET data

UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_parse"

post_data = {
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3282.167 Safari/537.36",
}

result = requests.post(UAurl, data=json.dumps(post_data), headers=headers)
result # if result is 200, then success!
result.json()

In [None]:
# Example of working live html parser without crawler block and user-agent spoof

# url = "http://www.python.org"
# resp = requests.get(url)
# html = resp.content
# print(html)

In [None]:
#################################################
####### DEPRECATED AS OF MARCH 12, 2020 #########
#################################################

# TCH: Many functionalities implemented in the carscraper function have not been copied over to this cell block


### Working example for a SINGLE KSL search results page

# maxresults = 20 # Set max number of listings to parse (per search results page)

# # Define root url for KSL cars
# rooturl = "https://cars.ksl.com"

# # Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# # This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"

# # Need to spoof a user-agent in order to get past crawler block
# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

# # Note: The above user_agent might need to be rotated (along with IP) to avoid IP ban
# # Example found on https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

# all_cars = []

# # Open live page (as opposed to downloaded)
# resp = requests.get(url, headers = {'User-Agent': user_agent})
# html = resp.content
# pgsoup = BeautifulSoup(html)
# lastpg = int(pgsoup.find(attrs={"title": "Go to last page"}).text.strip()) # Note that this is 1 more than number from href for this page
# # print(f'Total number of search results pages: {lastpg}')
# # print()

# links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
# # print(f'Total number of links found on current page: {len(links)}')
# tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps
# # print(f'Total number of timestamps found on current page: {len(tstamps)}')

# # print()

# # for tstamp in tstamps:
# #     print(int(re.search('(\d+)',tstamp.text).group(0))) # <-- This is WORKING code to extract timestamp for each listing from search page
# # print()

# print(f'Limiting Subsequent Listing Results to {maxresults}')

# # Loop through links and scrape data for each new listing
# with progressbar.ProgressBar(max_value=maxresults) as bar:
#     for idx, link in enumerate(links[:maxresults]): # *** only load first x results for now to avoid ban before implementing spoofing

#         # Reset all fields to None before next loop
#         price=year=make=model=body=mileage=title_type=city=state=seller=None
#         trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=n_doors=ext_condition=int_condition=drive_type=None
        
#         # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
#         # Regular expressions should come in handy here

#         cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
#         currlink = link['href'][:cutidx]

#         # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
#         # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

#         # Open listing link and pull html from it
#         fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

#         resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
#         lsthtml = resp.content
#         lstsoup = BeautifulSoup(lsthtml)

#         # Get listing price
#         price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

#         # Get seller's location
#         location = lstsoup.select('h2.location > a')[0].text.strip()
#         city, state = location.split(',')
#         city = city.strip()
#         state = state.strip()

#         # Get seller type (dealer or owner)
#         sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
#         if re.search('(Dealer)', sellerstr):
#             seller = 'Dealer'
#         elif re.search('(Owner)', sellerstr):
#             seller = 'Owner'

#         # Get timestamp
#         tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

#         # Get table of car specs
#         specs = lstsoup.select('ul.listing-specifications')

#         for li in specs[0].find_all('li'):
#             lititle = li.select('span.title')[0].text.strip().strip(':')
#             livalue = li.select('span.value')[0].text.strip().strip(':')
            
#             if livalue.lower() == 'not specified':
#                 livalue = None

#             # Now a bunch of if-else statements to determine which column to add data to
#             # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
#             if lititle.lower() == 'year':
#                 if livalue:
#                     year = int(livalue)
#                 else:
#                     year = livalue
#             elif lititle.lower() == 'make':
#                 make = livalue
#             elif lititle.lower() == 'model':
#                 model = livalue
#             elif lititle.lower() == 'body':
#                 body = livalue
#             elif lititle.lower() == 'mileage':
#                 if livalue:
#                     mileage = int(livalue.replace(',',''))
#                 else:
#                     mileage = livalue
#             elif lititle.lower() == 'title type':
#                 title_type = livalue
                
#             # Below this are non-required specs    
#             elif lititle.lower() == 'trim':
#                 trim = livalue
#             elif lititle.lower() == 'exterior color':
#                 if livalue:
#                     ext_color = livalue.lower()
#                 else:
#                     ext_color = livalue
#             elif lititle.lower() == 'interior color':
#                 if livalue:
#                     int_color = livalue.lower()
#                 else:
#                     int_color = livalue
#             elif lititle.lower() == 'transmission':
#                 transmission = livalue
#             elif lititle.lower() == 'liters':
#                 try:
#                     liters = float(livalue)
#                 except:
#                     if livalue:
#                         str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
#                         if re.search('^(\D+)',str1):
#                             idxend = re.search('^(\D+)',str1).end()
#                             livalue = str1[idxend:-1]
#                         else:
#                             livalue = str1[:-1]
#                         livalue = float(livalue)
#                     else:
#                         liters = livalue
#             elif lititle.lower() == 'cylinders':
#                 if livalue:
#                     cylinders = int(livalue)
#                 else:
#                     cylinders = livalue
#             elif lititle.lower() == 'fuel type':
#                 fuel_type = livalue
#             elif lititle.lower() == 'number of doors':
#                 if livalue:
#                     n_doors = int(livalue)
#                 else:
#                     n_doors = livalue
#             elif lititle.lower() == 'exterior condition':
#                 ext_condition = livalue
#             elif lititle.lower() == 'interior condition':
#                 int_condition = livalue
#             elif lititle.lower() == 'drive type':
#                 drive_type = livalue
#             elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
#                 None # Don't want to save these
#             else:
#                 None
#                 print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

#         curr_car = pd.DataFrame({"timestamp":[tstamp],
#                                  "price":[price],
#                                  "year":[year],
#                                  "make":[make],
#                                  "model":[model],
#                                  "body":[body],
#                                  "mileage":[mileage],
#                                  "title_type":[title_type],
#                                  "city":[city],
#                                  "state":[state],
#                                  "seller":[seller],
#                                  "trim":[trim],
#                                  "ext_color":[ext_color],
#                                  "int_color":[int_color],
#                                  "transmission":[transmission],
#                                  "liters":[liters],
#                                  "cylinders":[cylinders],
#                                  "fuel_type":[fuel_type],
#                                  "n_doors":[n_doors],
#                                  "ext_condition":[ext_condition],
#                                  "int_condition":[int_condition],
#                                  "drive_type":[drive_type]})
#         try:
#             all_cars = pd.concat([all_cars, curr_car])
#         except:
#             all_cars = curr_car

#         bar.update(idx)
        
# all_cars = all_cars.reset_index()
# del all_cars['index']
# all_cars.fillna(value=pd.np.nan, inplace=True)
# all_cars

In [None]:
#########################################################
######## USE SELENIUM INSTEAD TO GET BIGGER LIST ########
#########################################################

# # Get list of proxy IPs

# IPurl = "https://free-proxy-list.net"

# resp = requests.get(IPurl)
# IPhtml = resp.content
# IPsoup = BeautifulSoup(IPhtml)

# proxies = []
# for tr in IPsoup.find(id='proxylisttable').find('tbody').find_all('tr'):
#     tds = tr.find_all('td')
#     if (tds[2].text.strip() == 'US') & (tds[6].text.strip() == 'yes') & (tds[4].text.strip() != 'transparent'):
#         proxies.append(''.join(['http://', ':'.join([tds[0].text.strip(), tds[1].text.strip()])])) # grab the IP addresses matching the above criteria
# random.shuffle(proxies)
# proxies