## To Do:
* Rotate through user-agents and IPs
* Keep a database or dictionary of IP:user-agent combos to avoid using different user-agents for the same IP?
* Consider expanding IP list to include limited countries outside US
* Figure out how to use API for getting user-agents (to avoid IP ban)
* Make sure newer cars are added at top of repository such that timestamps are _roughly_ in order
* Get page views (after certain time period?)
* Get number of times listing has been favorited?
* Limit results to just those with a picture?
* Get count of number of pictures?
* Handle cases when no new cars are found
* What do we do when there's a broken link?

In [1]:
from bs4 import BeautifulSoup # if this isn't installed, use pip install beautifulsoup4
import requests
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time
import progressbar # if this isn't installed, use pip install progressbar2
import random
import json

In [None]:
# Get list of proxy IPs

IPurl = "https://free-proxy-list.net"

resp = requests.get(IPurl)
IPhtml = resp.content
IPsoup = BeautifulSoup(IPhtml)

proxies = []
for tr in IPsoup.find(id='proxylisttable').find('tbody').find_all('tr'):
    tds = tr.find_all('td')
    if (tds[2].text.strip() == 'US') & (tds[6].text.strip() == 'yes') & (tds[4].text.strip() != 'transparent'):
        proxies.append(''.join(['http://', ':'.join([tds[0].text.strip(), tds[1].text.strip()])])) # grab the IP addresses matching the above criteria
random.shuffle(proxies)
proxies

In [None]:
# Get list of US-based proxy IPs

IPurl = "https://www.us-proxy.org/" # <-- the robots.txt file for this site allows full access for all user-agents

resp = requests.get(IPurl)
IPhtml = resp.content
IPsoup = BeautifulSoup(IPhtml)

proxies = []


In [None]:
# Get list of user-agents

### Need to use API at some point rather than crawl/scrape since you can get 500 user-agents for free per month...and my IP got banned
### username: automodeals
### pw: kslclass123
### API documentation: https://developers.whatismybrowser.com/api/docs/v2/

API_key = '5ecab60888f7aebfbc4aad5850de52fa'

UAurl = "https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/"

resp = requests.get(UAurl)
UAhtml = resp.content
UAsoup = BeautifulSoup(UAhtml)

UAlist = []
matches = UAsoup.select("table.table-useragents td.useragent")
for match in matches[:len(proxies)]: # only get as many user-agents are there are proxies. Dangerous to use more than one user-agent per IP
    UAlist.append(match.find('a').text.strip())
random.shuffle(UAlist)
UAlist

In [None]:
# Use user-agents API (example from https://github.com/whatismybrowser/api-v2-sample-code/blob/master/sample-code/python-3.6/user_agent_parse.py)

API_key = '5ecab60888f7aebfbc4aad5850de52fa'

headers = {'X-API-KEY': API_key}
# UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_database_dump_url"
# UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_database_search"

# The code below works for POSTing data, but we want to GET data

UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_parse"

post_data = {
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3282.167 Safari/537.36",
}

result = requests.post(UAurl, data=json.dumps(post_data), headers=headers)
result # if result is 200, then success!
result.json()

In [None]:
# Experiment with using proxy IPs from above

# url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
url = "https://httpbin.org/ip"

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

for idx, proxy in enumerate(proxies):
#     user_agent = UAlist[idx]
    try:
        resp = requests.get(url,proxies={"http":proxy, "https":proxy},headers={'User-Agent': user_agent})
#         print(resp.content)
        print(f'Success! proxy used: {proxy}')
    except:
        print('Proxy error')

In [2]:
# Make a function for the scraping done for each search page
def carscraper(url, rooturl, maxts):
    # url should be of the form "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
    # rooturl should be something like "https://cars.ksl.com"
    # maxts is the maximum timestamp of the all_cars repository
    
    # Need to spoof a user-agent in order to get past crawler block
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

    resp = requests.get(url, headers = {'User-Agent': user_agent})
    html = resp.content
    pgsoup = BeautifulSoup(html)
    
    # Check if there are additional pages of results
    if pgsoup.find("a", {"title" : "Go forward 1 page"}):
        moreresults = 1
    else:
        moreresults = 0
    
    links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
    tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps

    # Loop through links and scrape data for each new listing
    all_cars = []
    with progressbar.ProgressBar(max_value=len(links)) as bar:
        for idx, link in enumerate(links): # *** only load first x results for now to avoid ban before implementing spoofing

            # Reset all fields to None before next loop
            price=year=make=model=body=mileage=title_type=city=state=seller=None
            trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=num_doors=ext_condition=int_condition=drive_type=None

            # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
            # Regular expressions should come in handy here

            cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
            currlink = link['href'][:cutidx]

            # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
            # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

            # Open listing link and pull html from it
            fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

            resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
            lsthtml = resp.content
            lstsoup = BeautifulSoup(lsthtml)
            
            # Check if link is still good (i.e. listing is still active)
            if lstsoup.title.text.strip().lower() == 'not found':
                print('Bad link. Skipping...')
                bar.update(idx)
            else:

                # Get listing price
                price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

                # Get seller's location
                location = lstsoup.select('h2.location > a')[0].text.strip()
                city, state = location.split(',')
                city = city.strip()
                state = state.strip()

                # Get seller type (dealer or owner)
                sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
                if re.search('(Dealer)', sellerstr):
                    seller = 'Dealer'
                elif re.search('(Owner)', sellerstr):
                    seller = 'Owner'

                # Get timestamp
                tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

                # Check if timestamp is newer than maxts
                if tstamp <= maxts:
                    print('************ Found end of new data ************')
#                     print(f'var type of all_cars is: {type(all_cars)}')
                    moreresults = 0
                    break
#                 else:
#                     print(f'New car found: {idx} in link {fulllink}')

                # Get table of car specs
                specs = lstsoup.select('ul.listing-specifications')

                for li in specs[0].find_all('li'):
                    lititle = li.select('span.title')[0].text.strip().strip(':')
                    livalue = li.select('span.value')[0].text.strip().strip(':')

                    if livalue.lower() == 'not specified':
                        livalue = None

                    # Now a bunch of if-else statements to determine which column to add data to
                    # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
                    if lititle.lower() == 'year':
                        if livalue:
                            year = int(livalue)
                        else:
                            year = livalue
                    elif lititle.lower() == 'make':
                        make = livalue
                    elif lititle.lower() == 'model':
                        model = livalue
                    elif lititle.lower() == 'body':
                        body = livalue
                    elif lititle.lower() == 'mileage':
                        if livalue:
                            mileage = int(livalue.replace(',',''))
                        else:
                            mileage = livalue
                    elif lititle.lower() == 'title type':
                        title_type = livalue

                    # Below this are non-required specs    
                    elif lititle.lower() == 'trim':
                        trim = livalue
                    elif lititle.lower() == 'exterior color':
                        if livalue:
                            ext_color = livalue.lower()
                        else:
                            ext_color = livalue
                    elif lititle.lower() == 'interior color':
                        if livalue:
                            int_color = livalue.lower()
                        else:
                            int_color = livalue
                    elif lititle.lower() == 'transmission':
                        transmission = livalue
                    elif lititle.lower() == 'liters':
                        try:
                            liters = float(livalue)
                        except:
                            if livalue:
                                str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
                                if re.search('^(\D+)',str1):
                                    idxend = re.search('^(\D+)',str1).end()
                                    livalue = str1[idxend:-1]
                                else:
                                    livalue = str1[:-1]
                                try:
                                    livalue = float(livalue)
                                except:
                                    print(url)
                                    print('****')
                                    print(link)
                            else:
                                liters = livalue
                    elif lititle.lower() == 'cylinders':
                        if livalue:
                            cylinders = int(livalue)
                        else:
                            cylinders = livalue
                    elif lititle.lower() == 'fuel type':
                        fuel_type = livalue
                    elif lititle.lower() == 'number of doors':
                        if livalue:
                            num_doors = int(livalue)
                        else:
                            num_doors = livalue
                    elif lititle.lower() == 'exterior condition':
                        ext_condition = livalue
                    elif lititle.lower() == 'interior condition':
                        int_condition = livalue
                    elif lititle.lower() == 'drive type':
                        drive_type = livalue
                    elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
                        None # Don't want to save these
                    else:
                        None
                        print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

                curr_car = pd.DataFrame({"timestamp":[tstamp],
                                         "price":[price],
                                         "year":[year],
                                         "make":[make],
                                         "model":[model],
                                         "body":[body],
                                         "mileage":[mileage],
                                         "title_type":[title_type],
                                         "city":[city],
                                         "state":[state],
                                         "seller":[seller],
                                         "trim":[trim],
                                         "ext_color":[ext_color],
                                         "int_color":[int_color],
                                         "transmission":[transmission],
                                         "liters":[liters],
                                         "cylinders":[cylinders],
                                         "fuel_type":[fuel_type],
                                         "num_doors":[num_doors],
                                         "ext_condition":[ext_condition],
                                         "int_condition":[int_condition],
                                         "drive_type":[drive_type]})
                try:
                    all_cars = pd.concat([all_cars, curr_car])
                except:
                    all_cars = curr_car

                bar.update(idx)

    if type(all_cars) is pd.core.frame.DataFrame: # make sure that some data was actually scraped
        all_cars = all_cars.reset_index()
        del all_cars['index']
        all_cars.fillna(value=pd.np.nan, inplace=True)
    return all_cars, moreresults

In [3]:
# Try a multi-page test using carscraper function

# set cap for number of search pages to load (i.e. pages with up to 96 listings)
maxpg = 2

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# Also note that this url does NOT have a page number associated with it. This is added in the while loop below
lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

count = 0
all_cars = []
while count < maxpg:
    url = lurl + str(count)
    curr_cars, moreresults = carscraper(url, rooturl, 0)
    count += 1    
#     print(f'More results? {moreresults}')
    if type(curr_cars) is pd.core.frame.DataFrame: # make sure real data was returned
        try:
            all_cars = pd.concat([all_cars, curr_cars], ignore_index=True)
        except:
            all_cars = curr_cars
    else:
        print('No car data found!')
    
all_cars

N/A% (0 of 96) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

New car found: 0 in link https://cars.ksl.com/listing/6304315


  1% (1 of 96) |                         | Elapsed Time: 0:00:00 ETA:   0:01:06

New car found: 1 in link https://cars.ksl.com/listing/6187638


  2% (2 of 96) |                         | Elapsed Time: 0:00:02 ETA:   0:01:51

New car found: 2 in link https://cars.ksl.com/listing/6304249


  3% (3 of 96) |                         | Elapsed Time: 0:00:05 ETA:   0:04:14

New car found: 3 in link https://cars.ksl.com/listing/6172464


  4% (4 of 96) |#                        | Elapsed Time: 0:00:05 ETA:   0:02:36

New car found: 4 in link https://cars.ksl.com/listing/6304306


  5% (5 of 96) |#                        | Elapsed Time: 0:00:07 ETA:   0:01:55

New car found: 5 in link https://cars.ksl.com/listing/6172462


  6% (6 of 96) |#                        | Elapsed Time: 0:00:08 ETA:   0:02:01

New car found: 6 in link https://cars.ksl.com/listing/6187634


  7% (7 of 96) |#                        | Elapsed Time: 0:00:10 ETA:   0:01:51

New car found: 7 in link https://cars.ksl.com/listing/6285417


  8% (8 of 96) |##                       | Elapsed Time: 0:00:10 ETA:   0:01:43

New car found: 8 in link https://cars.ksl.com/listing/6119936


  9% (9 of 96) |##                       | Elapsed Time: 0:00:11 ETA:   0:01:27

New car found: 9 in link https://cars.ksl.com/listing/6304302


 10% (10 of 96) |##                      | Elapsed Time: 0:00:12 ETA:   0:01:01

New car found: 10 in link https://cars.ksl.com/listing/6172442


 11% (11 of 96) |##                      | Elapsed Time: 0:00:13 ETA:   0:01:07

New car found: 11 in link https://cars.ksl.com/listing/6119935


 12% (12 of 96) |###                     | Elapsed Time: 0:00:13 ETA:   0:01:07

New car found: 12 in link https://cars.ksl.com/listing/6304300


 13% (13 of 96) |###                     | Elapsed Time: 0:00:16 ETA:   0:03:58

New car found: 13 in link https://cars.ksl.com/listing/6285415


 14% (14 of 96) |###                     | Elapsed Time: 0:00:17 ETA:   0:02:24

New car found: 14 in link https://cars.ksl.com/listing/6285354


 15% (15 of 96) |###                     | Elapsed Time: 0:00:18 ETA:   0:01:53

New car found: 15 in link https://cars.ksl.com/listing/6187627


 16% (16 of 96) |####                    | Elapsed Time: 0:00:19 ETA:   0:01:35

New car found: 16 in link https://cars.ksl.com/listing/6157787


 17% (17 of 96) |####                    | Elapsed Time: 0:00:21 ETA:   0:02:25

New car found: 17 in link https://cars.ksl.com/listing/6172435


 18% (18 of 96) |####                    | Elapsed Time: 0:00:22 ETA:   0:01:52

New car found: 18 in link https://cars.ksl.com/listing/6295947


 19% (19 of 96) |####                    | Elapsed Time: 0:00:23 ETA:   0:01:30

New car found: 19 in link https://cars.ksl.com/listing/6216182


 20% (20 of 96) |#####                   | Elapsed Time: 0:00:24 ETA:   0:00:56

New car found: 20 in link https://cars.ksl.com/listing/6154855


 21% (21 of 96) |#####                   | Elapsed Time: 0:00:25 ETA:   0:01:30

New car found: 21 in link https://cars.ksl.com/listing/6219733


 22% (22 of 96) |#####                   | Elapsed Time: 0:00:26 ETA:   0:01:28

New car found: 22 in link https://cars.ksl.com/listing/6304299


 23% (23 of 96) |#####                   | Elapsed Time: 0:00:29 ETA:   0:03:22

New car found: 23 in link https://cars.ksl.com/listing/6232253


 25% (24 of 96) |######                  | Elapsed Time: 0:00:30 ETA:   0:02:10

New car found: 24 in link https://cars.ksl.com/listing/6172833


 26% (25 of 96) |######                  | Elapsed Time: 0:00:30 ETA:   0:01:44

New car found: 25 in link https://cars.ksl.com/listing/6172832


 27% (26 of 96) |######                  | Elapsed Time: 0:00:31 ETA:   0:00:54

New car found: 26 in link https://cars.ksl.com/listing/6218402


 28% (27 of 96) |######                  | Elapsed Time: 0:00:32 ETA:   0:00:46

New car found: 27 in link https://cars.ksl.com/listing/6304296


 29% (28 of 96) |#######                 | Elapsed Time: 0:00:33 ETA:   0:01:19

New car found: 28 in link https://cars.ksl.com/listing/6304295


 30% (29 of 96) |#######                 | Elapsed Time: 0:00:35 ETA:   0:01:59

New car found: 29 in link https://cars.ksl.com/listing/6187618


 31% (30 of 96) |#######                 | Elapsed Time: 0:00:37 ETA:   0:01:59

New car found: 30 in link https://cars.ksl.com/listing/6267965


 32% (31 of 96) |#######                 | Elapsed Time: 0:00:38 ETA:   0:01:22

New car found: 31 in link https://cars.ksl.com/listing/6267964


 33% (32 of 96) |########                | Elapsed Time: 0:00:39 ETA:   0:01:17

New car found: 32 in link https://cars.ksl.com/listing/6267962


 34% (33 of 96) |########                | Elapsed Time: 0:00:40 ETA:   0:01:11

New car found: 33 in link https://cars.ksl.com/listing/6304294


 35% (34 of 96) |########                | Elapsed Time: 0:00:42 ETA:   0:01:15

New car found: 34 in link https://cars.ksl.com/listing/6267963


 36% (35 of 96) |########                | Elapsed Time: 0:00:43 ETA:   0:01:19

New car found: 35 in link https://cars.ksl.com/listing/6267960


 37% (36 of 96) |#########               | Elapsed Time: 0:00:43 ETA:   0:01:10

New car found: 36 in link https://cars.ksl.com/listing/6119298


 38% (37 of 96) |#########               | Elapsed Time: 0:00:46 ETA:   0:02:41

New car found: 37 in link https://cars.ksl.com/listing/6157756


 39% (38 of 96) |#########               | Elapsed Time: 0:00:47 ETA:   0:01:43

New car found: 38 in link https://cars.ksl.com/listing/6172419


 40% (39 of 96) |#########               | Elapsed Time: 0:00:48 ETA:   0:01:19

New car found: 39 in link https://cars.ksl.com/listing/6304293


 41% (40 of 96) |##########              | Elapsed Time: 0:00:49 ETA:   0:01:06

New car found: 40 in link https://cars.ksl.com/listing/6218399


 42% (41 of 96) |##########              | Elapsed Time: 0:00:51 ETA:   0:01:35

New car found: 41 in link https://cars.ksl.com/listing/6187612


 43% (42 of 96) |##########              | Elapsed Time: 0:00:52 ETA:   0:01:05

New car found: 42 in link https://cars.ksl.com/listing/6157755


 44% (43 of 96) |##########              | Elapsed Time: 0:00:53 ETA:   0:00:54

New car found: 43 in link https://cars.ksl.com/listing/6187609


 45% (44 of 96) |###########             | Elapsed Time: 0:00:53 ETA:   0:00:38

New car found: 44 in link https://cars.ksl.com/listing/6172416


 46% (45 of 96) |###########             | Elapsed Time: 0:00:54 ETA:   0:00:36

New car found: 45 in link https://cars.ksl.com/listing/6304289


 47% (46 of 96) |###########             | Elapsed Time: 0:00:56 ETA:   0:00:57

New car found: 46 in link https://cars.ksl.com/listing/6208472


 48% (47 of 96) |###########             | Elapsed Time: 0:00:58 ETA:   0:01:27

New car found: 47 in link https://cars.ksl.com/listing/6157754


 50% (48 of 96) |############            | Elapsed Time: 0:00:58 ETA:   0:01:00

New car found: 48 in link https://cars.ksl.com/listing/6187607


 51% (49 of 96) |############            | Elapsed Time: 0:00:59 ETA:   0:00:49

New car found: 49 in link https://cars.ksl.com/listing/6172413


 52% (50 of 96) |############            | Elapsed Time: 0:01:00 ETA:   0:00:32

New car found: 50 in link https://cars.ksl.com/listing/6187606


 53% (51 of 96) |############            | Elapsed Time: 0:01:00 ETA:   0:00:30

New car found: 51 in link https://cars.ksl.com/listing/6172412


 54% (52 of 96) |#############           | Elapsed Time: 0:01:01 ETA:   0:00:30

New car found: 52 in link https://cars.ksl.com/listing/6119295


 55% (53 of 96) |#############           | Elapsed Time: 0:01:03 ETA:   0:00:49

New car found: 53 in link https://cars.ksl.com/listing/6208471


 56% (54 of 96) |#############           | Elapsed Time: 0:01:04 ETA:   0:01:14

New car found: 54 in link https://cars.ksl.com/listing/6088715


 57% (55 of 96) |#############           | Elapsed Time: 0:01:05 ETA:   0:00:56

New car found: 55 in link https://cars.ksl.com/listing/6285084


 58% (56 of 96) |##############          | Elapsed Time: 0:01:06 ETA:   0:00:46

New car found: 56 in link https://cars.ksl.com/listing/6157753


 59% (57 of 96) |##############          | Elapsed Time: 0:01:07 ETA:   0:00:29

New car found: 57 in link https://cars.ksl.com/listing/6266615


 60% (58 of 96) |##############          | Elapsed Time: 0:01:08 ETA:   0:00:45

New car found: 58 in link https://cars.ksl.com/listing/6304287


 61% (59 of 96) |##############          | Elapsed Time: 0:01:09 ETA:   0:00:43

New car found: 59 in link https://cars.ksl.com/listing/6267955


 62% (60 of 96) |###############         | Elapsed Time: 0:01:10 ETA:   0:00:36

New car found: 60 in link https://cars.ksl.com/listing/6267953


 63% (61 of 96) |###############         | Elapsed Time: 0:01:10 ETA:   0:00:23

New car found: 61 in link https://cars.ksl.com/listing/6304284


 64% (62 of 96) |###############         | Elapsed Time: 0:01:11 ETA:   0:00:23

New car found: 62 in link https://cars.ksl.com/listing/6267954


 65% (63 of 96) |###############         | Elapsed Time: 0:01:13 ETA:   0:00:41

New car found: 63 in link https://cars.ksl.com/listing/6088714


 66% (64 of 96) |################        | Elapsed Time: 0:01:14 ETA:   0:00:40

New car found: 64 in link https://cars.ksl.com/listing/6103830


 67% (65 of 96) |################        | Elapsed Time: 0:01:15 ETA:   0:00:37

New car found: 65 in link https://cars.ksl.com/listing/6267951


 68% (66 of 96) |################        | Elapsed Time: 0:01:16 ETA:   0:00:36

New car found: 66 in link https://cars.ksl.com/listing/6172407


 69% (67 of 96) |################        | Elapsed Time: 0:01:17 ETA:   0:00:30

New car found: 67 in link https://cars.ksl.com/listing/6187597


 70% (68 of 96) |#################       | Elapsed Time: 0:01:19 ETA:   0:00:34

New car found: 68 in link https://cars.ksl.com/listing/6203437


 71% (69 of 96) |#################       | Elapsed Time: 0:01:19 ETA:   0:00:32

New car found: 69 in link https://cars.ksl.com/listing/6157750


 72% (70 of 96) |#################       | Elapsed Time: 0:01:20 ETA:   0:00:27

New car found: 70 in link https://cars.ksl.com/listing/6119293


 73% (71 of 96) |#################       | Elapsed Time: 0:01:21 ETA:   0:00:19

New car found: 71 in link https://cars.ksl.com/listing/6187591


 75% (72 of 96) |##################      | Elapsed Time: 0:01:22 ETA:   0:00:18

New car found: 72 in link https://cars.ksl.com/listing/6285388


 76% (73 of 96) |##################      | Elapsed Time: 0:01:22 ETA:   0:00:17

New car found: 73 in link https://cars.ksl.com/listing/6157744


 77% (74 of 96) |##################      | Elapsed Time: 0:01:24 ETA:   0:00:27

New car found: 74 in link https://cars.ksl.com/listing/6172401


 78% (75 of 96) |##################      | Elapsed Time: 0:01:26 ETA:   0:00:36

New car found: 75 in link https://cars.ksl.com/listing/6187587


 79% (76 of 96) |###################     | Elapsed Time: 0:01:27 ETA:   0:00:24

New car found: 76 in link https://cars.ksl.com/listing/6187584


 80% (77 of 96) |###################     | Elapsed Time: 0:01:28 ETA:   0:00:21

New car found: 77 in link https://cars.ksl.com/listing/6304279


 81% (78 of 96) |###################     | Elapsed Time: 0:01:29 ETA:   0:00:19

New car found: 78 in link https://cars.ksl.com/listing/6304278


 82% (79 of 96) |###################     | Elapsed Time: 0:01:30 ETA:   0:00:19

New car found: 79 in link https://cars.ksl.com/listing/6304277


 83% (80 of 96) |####################    | Elapsed Time: 0:01:33 ETA:   0:00:35

New car found: 80 in link https://cars.ksl.com/listing/5361979


 84% (81 of 96) |####################    | Elapsed Time: 0:01:34 ETA:   0:00:29

New car found: 81 in link https://cars.ksl.com/listing/6172394


 85% (82 of 96) |####################    | Elapsed Time: 0:01:35 ETA:   0:00:16

New car found: 82 in link https://cars.ksl.com/listing/6187582


 86% (83 of 96) |####################    | Elapsed Time: 0:01:36 ETA:   0:00:13

New car found: 83 in link https://cars.ksl.com/listing/6172392


 87% (84 of 96) |#####################   | Elapsed Time: 0:01:37 ETA:   0:00:14

New car found: 84 in link https://cars.ksl.com/listing/6119907


 88% (85 of 96) |#####################   | Elapsed Time: 0:01:40 ETA:   0:00:24

New car found: 85 in link https://cars.ksl.com/listing/6285383


 89% (86 of 96) |#####################   | Elapsed Time: 0:01:42 ETA:   0:00:20

New car found: 86 in link https://cars.ksl.com/listing/6285382


 90% (87 of 96) |#####################   | Elapsed Time: 0:01:42 ETA:   0:00:11

New car found: 87 in link https://cars.ksl.com/listing/6304276


 91% (88 of 96) |######################  | Elapsed Time: 0:01:43 ETA:   0:00:08

New car found: 88 in link https://cars.ksl.com/listing/6119906


 92% (89 of 96) |######################  | Elapsed Time: 0:01:45 ETA:   0:00:08

New car found: 89 in link https://cars.ksl.com/listing/6304275


 93% (90 of 96) |######################  | Elapsed Time: 0:01:47 ETA:   0:00:10

New car found: 90 in link https://cars.ksl.com/listing/6203427


 94% (91 of 96) |######################  | Elapsed Time: 0:01:47 ETA:   0:00:06

New car found: 91 in link https://cars.ksl.com/listing/6119905


 95% (92 of 96) |####################### | Elapsed Time: 0:01:48 ETA:   0:00:04

New car found: 92 in link https://cars.ksl.com/listing/6187577


 96% (93 of 96) |####################### | Elapsed Time: 0:01:49 ETA:   0:00:02

New car found: 93 in link https://cars.ksl.com/listing/6304274


 97% (94 of 96) |####################### | Elapsed Time: 0:01:51 ETA:   0:00:05

New car found: 94 in link https://cars.ksl.com/listing/6304273


100% (96 of 96) |########################| Elapsed Time: 0:01:54 Time:  0:01:54


New car found: 95 in link https://cars.ksl.com/listing/6119903
More results? 1


N/A% (0 of 96) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

New car found: 0 in link https://cars.ksl.com/listing/6172394


  1% (1 of 96) |                         | Elapsed Time: 0:00:02 ETA:   0:03:15

New car found: 1 in link https://cars.ksl.com/listing/6187582


  2% (2 of 96) |                         | Elapsed Time: 0:00:02 ETA:   0:02:07

New car found: 2 in link https://cars.ksl.com/listing/6172392


  3% (3 of 96) |                         | Elapsed Time: 0:00:03 ETA:   0:01:38

New car found: 3 in link https://cars.ksl.com/listing/6119907


  4% (4 of 96) |#                        | Elapsed Time: 0:00:04 ETA:   0:01:03

New car found: 4 in link https://cars.ksl.com/listing/6285383


  5% (5 of 96) |#                        | Elapsed Time: 0:00:05 ETA:   0:01:59

New car found: 5 in link https://cars.ksl.com/listing/6285382


  6% (6 of 96) |#                        | Elapsed Time: 0:00:06 ETA:   0:01:44

New car found: 6 in link https://cars.ksl.com/listing/6304276


  7% (7 of 96) |#                        | Elapsed Time: 0:00:07 ETA:   0:01:29

New car found: 7 in link https://cars.ksl.com/listing/6119906


  8% (8 of 96) |##                       | Elapsed Time: 0:00:07 ETA:   0:01:20

New car found: 8 in link https://cars.ksl.com/listing/6304275


  9% (9 of 96) |##                       | Elapsed Time: 0:00:08 ETA:   0:00:59

New car found: 9 in link https://cars.ksl.com/listing/6203427


 10% (10 of 96) |##                      | Elapsed Time: 0:00:09 ETA:   0:00:57

New car found: 10 in link https://cars.ksl.com/listing/6119905


 11% (11 of 96) |##                      | Elapsed Time: 0:00:09 ETA:   0:00:58

New car found: 11 in link https://cars.ksl.com/listing/6187577


 12% (12 of 96) |###                     | Elapsed Time: 0:00:10 ETA:   0:00:57

New car found: 12 in link https://cars.ksl.com/listing/6304274


 13% (13 of 96) |###                     | Elapsed Time: 0:00:11 ETA:   0:01:02

New car found: 13 in link https://cars.ksl.com/listing/6304273


 14% (14 of 96) |###                     | Elapsed Time: 0:00:12 ETA:   0:01:06

New car found: 14 in link https://cars.ksl.com/listing/6119903


 15% (15 of 96) |###                     | Elapsed Time: 0:00:13 ETA:   0:01:10

New car found: 15 in link https://cars.ksl.com/listing/6187572


 16% (16 of 96) |####                    | Elapsed Time: 0:00:14 ETA:   0:01:47

New car found: 16 in link https://cars.ksl.com/listing/6172381


 17% (17 of 96) |####                    | Elapsed Time: 0:00:15 ETA:   0:01:40

New car found: 17 in link https://cars.ksl.com/listing/6304272


 18% (18 of 96) |####                    | Elapsed Time: 0:00:19 ETA:   0:04:53

New car found: 18 in link https://cars.ksl.com/listing/6187567


 19% (19 of 96) |####                    | Elapsed Time: 0:00:20 ETA:   0:02:50

New car found: 19 in link https://cars.ksl.com/listing/6304271


 20% (20 of 96) |#####                   | Elapsed Time: 0:00:23 ETA:   0:03:47

New car found: 20 in link https://cars.ksl.com/listing/5757686


 21% (21 of 96) |#####                   | Elapsed Time: 0:00:24 ETA:   0:02:56

New car found: 21 in link https://cars.ksl.com/listing/6304270


 22% (22 of 96) |#####                   | Elapsed Time: 0:00:25 ETA:   0:01:29

New car found: 22 in link https://cars.ksl.com/listing/6172378


 23% (23 of 96) |#####                   | Elapsed Time: 0:00:26 ETA:   0:01:18

New car found: 23 in link https://cars.ksl.com/listing/6172374


 25% (24 of 96) |######                  | Elapsed Time: 0:00:26 ETA:   0:00:52

New car found: 24 in link https://cars.ksl.com/listing/6172373


 26% (25 of 96) |######                  | Elapsed Time: 0:00:27 ETA:   0:00:54

New car found: 25 in link https://cars.ksl.com/listing/6103819


 27% (26 of 96) |######                  | Elapsed Time: 0:00:28 ETA:   0:00:54

New car found: 26 in link https://cars.ksl.com/listing/6187559


 28% (27 of 96) |######                  | Elapsed Time: 0:00:29 ETA:   0:00:56

New car found: 27 in link https://cars.ksl.com/listing/6233302


 29% (28 of 96) |#######                 | Elapsed Time: 0:00:30 ETA:   0:00:52

New car found: 28 in link https://cars.ksl.com/listing/6171091


 30% (29 of 96) |#######                 | Elapsed Time: 0:00:31 ETA:   0:01:21

New car found: 29 in link https://cars.ksl.com/listing/6201265


 31% (30 of 96) |#######                 | Elapsed Time: 0:00:33 ETA:   0:01:57

New car found: 30 in link https://cars.ksl.com/listing/6201391


 32% (31 of 96) |#######                 | Elapsed Time: 0:00:36 ETA:   0:03:10

New car found: 31 in link https://cars.ksl.com/listing/6003346


 33% (32 of 96) |########                | Elapsed Time: 0:00:37 ETA:   0:01:57

New car found: 32 in link https://cars.ksl.com/listing/6137551


 34% (33 of 96) |########                | Elapsed Time: 0:00:39 ETA:   0:01:20

New car found: 33 in link https://cars.ksl.com/listing/6218386


 35% (34 of 96) |########                | Elapsed Time: 0:00:40 ETA:   0:01:37

New car found: 34 in link https://cars.ksl.com/listing/6137521


 36% (35 of 96) |########                | Elapsed Time: 0:00:41 ETA:   0:01:15

New car found: 35 in link https://cars.ksl.com/listing/6304268


 37% (36 of 96) |#########               | Elapsed Time: 0:00:41 ETA:   0:00:53

New car found: 36 in link https://cars.ksl.com/listing/6218385


 38% (37 of 96) |#########               | Elapsed Time: 0:00:42 ETA:   0:00:41

New car found: 37 in link https://cars.ksl.com/listing/6187552


 39% (38 of 96) |#########               | Elapsed Time: 0:00:44 ETA:   0:01:11

New car found: 38 in link https://cars.ksl.com/listing/6119895


 40% (39 of 96) |#########               | Elapsed Time: 0:00:46 ETA:   0:01:39

New car found: 39 in link https://cars.ksl.com/listing/6241098


 41% (40 of 96) |##########              | Elapsed Time: 0:00:46 ETA:   0:01:08

New car found: 40 in link https://cars.ksl.com/listing/6304267


 42% (41 of 96) |##########              | Elapsed Time: 0:00:47 ETA:   0:00:58

New car found: 41 in link https://cars.ksl.com/listing/6143391


 43% (42 of 96) |##########              | Elapsed Time: 0:00:48 ETA:   0:00:38

New car found: 42 in link https://cars.ksl.com/listing/6119894


 44% (43 of 96) |##########              | Elapsed Time: 0:00:50 ETA:   0:01:09

New car found: 43 in link https://cars.ksl.com/listing/6119893


 45% (44 of 96) |###########             | Elapsed Time: 0:00:50 ETA:   0:01:09

New car found: 44 in link https://cars.ksl.com/listing/6203416


 46% (45 of 96) |###########             | Elapsed Time: 0:00:51 ETA:   0:00:57

New car found: 45 in link https://cars.ksl.com/listing/6304265


 47% (46 of 96) |###########             | Elapsed Time: 0:00:52 ETA:   0:00:40

New car found: 46 in link https://cars.ksl.com/listing/6187547


 48% (47 of 96) |###########             | Elapsed Time: 0:00:54 ETA:   0:01:05

New car found: 47 in link https://cars.ksl.com/listing/6304264


 50% (48 of 96) |############            | Elapsed Time: 0:00:56 ETA:   0:01:23

New car found: 48 in link https://cars.ksl.com/listing/6304263


 51% (49 of 96) |############            | Elapsed Time: 0:00:57 ETA:   0:01:09

New car found: 49 in link https://cars.ksl.com/listing/6059446


 52% (50 of 96) |############            | Elapsed Time: 0:00:58 ETA:   0:00:57

New car found: 50 in link https://cars.ksl.com/listing/6266523


 53% (51 of 96) |############            | Elapsed Time: 0:00:58 ETA:   0:00:41

New car found: 51 in link https://cars.ksl.com/listing/6304262


 54% (52 of 96) |#############           | Elapsed Time: 0:01:00 ETA:   0:01:28

New car found: 52 in link https://cars.ksl.com/listing/6187532


 55% (53 of 96) |#############           | Elapsed Time: 0:01:03 ETA:   0:01:49

New car found: 53 in link https://cars.ksl.com/listing/5806269


 56% (54 of 96) |#############           | Elapsed Time: 0:01:05 ETA:   0:01:27

New car found: 54 in link https://cars.ksl.com/listing/6304259


 57% (55 of 96) |#############           | Elapsed Time: 0:01:05 ETA:   0:00:51

New car found: 55 in link https://cars.ksl.com/listing/6157612


 58% (56 of 96) |##############          | Elapsed Time: 0:01:06 ETA:   0:00:43

New car found: 56 in link https://cars.ksl.com/listing/6203398


 59% (57 of 96) |##############          | Elapsed Time: 0:01:08 ETA:   0:01:20

New car found: 57 in link https://cars.ksl.com/listing/6172823


 60% (58 of 96) |##############          | Elapsed Time: 0:01:09 ETA:   0:00:53

New car found: 58 in link https://cars.ksl.com/listing/6040062


 61% (59 of 96) |##############          | Elapsed Time: 0:01:10 ETA:   0:00:45

New car found: 59 in link https://cars.ksl.com/listing/6187518


 62% (60 of 96) |###############         | Elapsed Time: 0:01:11 ETA:   0:00:28

New car found: 60 in link https://cars.ksl.com/listing/6187529


 63% (61 of 96) |###############         | Elapsed Time: 0:01:13 ETA:   0:00:45

New car found: 61 in link https://cars.ksl.com/listing/6201714


 64% (62 of 96) |###############         | Elapsed Time: 0:01:13 ETA:   0:00:45

New car found: 62 in link https://cars.ksl.com/listing/6203271


 65% (63 of 96) |###############         | Elapsed Time: 0:01:15 ETA:   0:00:43

New car found: 63 in link https://cars.ksl.com/listing/6203396


 66% (64 of 96) |################        | Elapsed Time: 0:01:20 ETA:   0:02:20

New car found: 64 in link https://cars.ksl.com/listing/6187528


 67% (65 of 96) |################        | Elapsed Time: 0:01:21 ETA:   0:01:36

New car found: 65 in link https://cars.ksl.com/listing/6187527


 68% (66 of 96) |################        | Elapsed Time: 0:01:23 ETA:   0:00:52

New car found: 66 in link https://cars.ksl.com/listing/6304258


 69% (67 of 96) |################        | Elapsed Time: 0:01:25 ETA:   0:00:58

New car found: 67 in link https://cars.ksl.com/listing/6187526


 70% (68 of 96) |#################       | Elapsed Time: 0:01:28 ETA:   0:01:23

New car found: 68 in link https://cars.ksl.com/listing/6187517


 71% (69 of 96) |#################       | Elapsed Time: 0:01:30 ETA:   0:01:03

New car found: 69 in link https://cars.ksl.com/listing/6304256


 72% (70 of 96) |#################       | Elapsed Time: 0:01:32 ETA:   0:00:46

New car found: 70 in link https://cars.ksl.com/listing/6187514


 73% (71 of 96) |#################       | Elapsed Time: 0:01:34 ETA:   0:00:52

New car found: 71 in link https://cars.ksl.com/listing/6172817


 75% (72 of 96) |##################      | Elapsed Time: 0:01:35 ETA:   0:00:46

New car found: 72 in link https://cars.ksl.com/listing/5806259


 76% (73 of 96) |##################      | Elapsed Time: 0:01:36 ETA:   0:00:27

New car found: 73 in link https://cars.ksl.com/listing/6304254


 77% (74 of 96) |##################      | Elapsed Time: 0:01:37 ETA:   0:00:24

New car found: 74 in link https://cars.ksl.com/listing/6172812


 78% (75 of 96) |##################      | Elapsed Time: 0:01:38 ETA:   0:00:15

New car found: 75 in link https://cars.ksl.com/listing/6266639


 79% (76 of 96) |###################     | Elapsed Time: 0:01:38 ETA:   0:00:14

New car found: 76 in link https://cars.ksl.com/listing/6260726


 80% (77 of 96) |###################     | Elapsed Time: 0:01:39 ETA:   0:00:14

New car found: 77 in link https://cars.ksl.com/listing/6087719


 81% (78 of 96) |###################     | Elapsed Time: 0:01:41 ETA:   0:00:19

New car found: 78 in link https://cars.ksl.com/listing/6187502


 82% (79 of 96) |###################     | Elapsed Time: 0:01:41 ETA:   0:00:17

New car found: 79 in link https://cars.ksl.com/listing/6225561


 83% (80 of 96) |####################    | Elapsed Time: 0:01:43 ETA:   0:00:19

New car found: 80 in link https://cars.ksl.com/listing/6217345


 84% (81 of 96) |####################    | Elapsed Time: 0:01:44 ETA:   0:00:19

New car found: 81 in link https://cars.ksl.com/listing/6119871


 85% (82 of 96) |####################    | Elapsed Time: 0:01:45 ETA:   0:00:16

New car found: 82 in link https://cars.ksl.com/listing/6187490


 86% (83 of 96) |####################    | Elapsed Time: 0:01:46 ETA:   0:00:11

New car found: 83 in link https://cars.ksl.com/listing/6156605


 87% (84 of 96) |#####################   | Elapsed Time: 0:01:47 ETA:   0:00:10

New car found: 84 in link https://cars.ksl.com/listing/6119870


 88% (85 of 96) |#####################   | Elapsed Time: 0:01:47 ETA:   0:00:09

New car found: 85 in link https://cars.ksl.com/listing/6285376


 89% (86 of 96) |#####################   | Elapsed Time: 0:01:48 ETA:   0:00:08

New car found: 86 in link https://cars.ksl.com/listing/6187484


 90% (87 of 96) |#####################   | Elapsed Time: 0:01:49 ETA:   0:00:06

New car found: 87 in link https://cars.ksl.com/listing/6304246


 91% (88 of 96) |######################  | Elapsed Time: 0:01:51 ETA:   0:00:09

New car found: 88 in link https://cars.ksl.com/listing/6267934


 92% (89 of 96) |######################  | Elapsed Time: 0:01:51 ETA:   0:00:08

New car found: 89 in link https://cars.ksl.com/listing/6260121


 93% (90 of 96) |######################  | Elapsed Time: 0:01:52 ETA:   0:00:06

New car found: 90 in link https://cars.ksl.com/listing/6304245


 94% (91 of 96) |######################  | Elapsed Time: 0:01:53 ETA:   0:00:03

New car found: 91 in link https://cars.ksl.com/listing/6267932


 95% (92 of 96) |####################### | Elapsed Time: 0:01:57 ETA:   0:00:15

New car found: 92 in link https://cars.ksl.com/listing/6267931


 96% (93 of 96) |####################### | Elapsed Time: 0:01:57 ETA:   0:00:06

New car found: 93 in link https://cars.ksl.com/listing/6267929


 97% (94 of 96) |####################### | Elapsed Time: 0:01:58 ETA:   0:00:03

New car found: 94 in link https://cars.ksl.com/listing/6230949


100% (96 of 96) |########################| Elapsed Time: 0:02:00 Time:  0:02:00


New car found: 95 in link https://cars.ksl.com/listing/6267927
More results? 1


Unnamed: 0,timestamp,price,year,make,model,body,mileage,title_type,city,state,...,ext_color,int_color,transmission,liters,cylinders,fuel_type,num_doors,ext_condition,int_condition,drive_type
0,1584032705,3750,2012,Nissan,Altima,Sedan,145000.0,Clean Title,Salt Lake City,UT,...,gray,gray,Automatic,,4.0,Gasoline,4.0,Very Good,Very Good,FWD
1,1584032516,34142,2017,Toyota,Tundra,Truck,31658.0,,Sandy,UT,...,white,,Automatic,,8.0,Gasoline,4.0,,,4-Wheel Drive
2,1584032445,14523,2016,Volkswagen,Tiguan,Sport Utility,52651.0,,Salt Lake City,UT,...,black,,Automatic,,4.0,Gasoline,,,,AWD
3,1584032443,8995,2017,Volkswagen,Jetta,Sedan,82987.0,Clean Title,Draper,UT,...,red,titan black/palladium gray,Automatic,,4.0,Gasoline,4.0,,,
4,1584032423,19955,2019,Volkswagen,Jetta,Sedan,12390.0,,Layton,UT,...,pure white,,Automatic,,4.0,Gasoline,4.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,1584030225,17989,2017,Toyota,RAV4,Sport Utility,36432.0,Clean Title,North Salt Lake,UT,...,blue,gray,Automatic,,4.0,Gasoline,,,,4-Wheel Drive
188,1584030215,28589,2017,Toyota,Highlander,Sport Utility,34668.0,Clean Title,North Salt Lake,UT,...,black,gray,Automatic,,6.0,Gasoline,,,,AWD
189,1584030207,19275,2019,Hyundai,Tucson,Sport Utility,19227.0,Clean Title,North Salt Lake,UT,...,gray,black,Automatic,,4.0,Gasoline,,,,AWD
190,1584030205,13995,2013,Nissan,Titan,Truck,95000.0,Clean Title,Salt Lake City,UT,...,black,tan,Automatic,,8.0,Flex Fuel,4.0,Excellent,Excellent,4-Wheel Drive


In [4]:
# Save dataframe to csv

all_cars.to_csv('data/all_cars.csv', index=False)

In [5]:
# Load dataframe
all_cars = pd.read_csv('data/all_cars.csv')

# get most recent timestamp from the dataframe
rep_ts = all_cars['timestamp'].max()

In [8]:
# Now scrape for more cars and check for timestamp

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

count = 0
newer_cars = []
moreresults = 1
while moreresults:
    url = lurl + str(count)
    curr_cars, moreresults = carscraper(url, rooturl, rep_ts)
    count += 1    
#     print(f'More results? {moreresults}')
    if type(curr_cars) is pd.core.frame.DataFrame: # make sure real data was returned
        try:
            newer_cars = pd.concat([newer_cars, curr_cars], ignore_index=True)
        except:
            newer_cars = curr_cars
    else:
        print('No newer car data found!')
    
# add newer_cars
if type(newer_cars) is pd.core.frame.DataFrame:
    all_cars_UPDATED = pd.concat([newer_cars, all_cars], ignore_index=True)

N/A% (0 of 96) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--

New car found: 0 in link https://cars.ksl.com/listing/6304322


  1% (1 of 96) |                         | Elapsed Time: 0:00:00 ETA:   0:01:32

New car found: 1 in link https://cars.ksl.com/listing/6187672


  2% (2 of 96) |                         | Elapsed Time: 0:00:03 ETA:   0:03:29

New car found: 2 in link https://cars.ksl.com/listing/6003461


  3% (3 of 96) |                         | Elapsed Time: 0:00:04 ETA:   0:02:28

New car found: 3 in link https://cars.ksl.com/listing/6225513


  4% (4 of 96) |#                        | Elapsed Time: 0:00:05 ETA:   0:01:39

New car found: 4 in link https://cars.ksl.com/listing/6187668


  5% (5 of 96) |#                        | Elapsed Time: 0:00:06 ETA:   0:01:41

New car found: 5 in link https://cars.ksl.com/listing/5790398


  6% (6 of 96) |#                        | Elapsed Time: 0:00:08 ETA:   0:02:02

New car found: 6 in link https://cars.ksl.com/listing/6269401


  7% (7 of 96) |#                        | Elapsed Time: 0:00:09 ETA:   0:02:31

New car found: 7 in link https://cars.ksl.com/listing/6164120


  8% (8 of 96) |##                       | Elapsed Time: 0:00:11 ETA:   0:03:06

New car found: 8 in link https://cars.ksl.com/listing/6003460


  9% (9 of 96) |##                       | Elapsed Time: 0:00:12 ETA:   0:02:02

New car found: 9 in link https://cars.ksl.com/listing/6230637


 10% (10 of 96) |##                      | Elapsed Time: 0:00:15 ETA:   0:03:50

New car found: 10 in link https://cars.ksl.com/listing/6003454


 11% (11 of 96) |##                      | Elapsed Time: 0:00:16 ETA:   0:02:27

New car found: 11 in link https://cars.ksl.com/listing/6059583


 12% (12 of 96) |###                     | Elapsed Time: 0:00:16 ETA:   0:01:58

New car found: 12 in link https://cars.ksl.com/listing/6003452


 13% (13 of 96) |###                     | Elapsed Time: 0:00:18 ETA:   0:01:49

New car found: 13 in link https://cars.ksl.com/listing/6304320


 14% (14 of 96) |###                     | Elapsed Time: 0:00:20 ETA:   0:02:31

New car found: 14 in link https://cars.ksl.com/listing/6203559


 15% (15 of 96) |###                     | Elapsed Time: 0:00:21 ETA:   0:01:42

New car found: 15 in link https://cars.ksl.com/listing/6203558


 16% (16 of 96) |####                    | Elapsed Time: 0:00:22 ETA:   0:01:29

New car found: 16 in link https://cars.ksl.com/listing/6003451


 17% (17 of 96) |####                    | Elapsed Time: 0:00:23 ETA:   0:01:04

New car found: 17 in link https://cars.ksl.com/listing/6003449


 18% (18 of 96) |####                    | Elapsed Time: 0:00:25 ETA:   0:03:36

New car found: 18 in link https://cars.ksl.com/listing/6187664


 19% (19 of 96) |####                    | Elapsed Time: 0:00:27 ETA:   0:02:41

New car found: 19 in link https://cars.ksl.com/listing/6003446


 20% (20 of 96) |#####                   | Elapsed Time: 0:00:29 ETA:   0:02:31

New car found: 20 in link https://cars.ksl.com/listing/6203450


 21% (21 of 96) |#####                   | Elapsed Time: 0:00:31 ETA:   0:02:20

New car found: 21 in link https://cars.ksl.com/listing/6003444


 22% (22 of 96) |#####                   | Elapsed Time: 0:00:33 ETA:   0:02:20

New car found: 22 in link https://cars.ksl.com/listing/6187660


 23% (23 of 96) |#####                   | Elapsed Time: 0:00:34 ETA:   0:01:43

New car found: 23 in link https://cars.ksl.com/listing/6164186


 25% (24 of 96) |######                  | Elapsed Time: 0:00:35 ETA:   0:01:26

New car found: 24 in link https://cars.ksl.com/listing/6119349


 26% (25 of 96) |######                  | Elapsed Time: 0:00:36 ETA:   0:01:28

New car found: 25 in link https://cars.ksl.com/listing/6222626


 27% (26 of 96) |######                  | Elapsed Time: 0:00:37 ETA:   0:01:28

New car found: 26 in link https://cars.ksl.com/listing/5773461


 28% (27 of 96) |######                  | Elapsed Time: 0:00:39 ETA:   0:01:28

New car found: 27 in link https://cars.ksl.com/listing/6222663


 29% (28 of 96) |#######                 | Elapsed Time: 0:00:41 ETA:   0:02:02

New car found: 28 in link https://cars.ksl.com/listing/6143414


100% (96 of 96) |########################| Elapsed Time: 0:00:43 Time:  0:00:43


************ Found end of new data ************
var type of all_cars is: <class 'pandas.core.frame.DataFrame'>
More results? 0


In [9]:
# Save updated dataframe to csv

all_cars_UPDATED.to_csv('data/all_cars_UPDATED.csv', index=False) # in later iterations, remove the "_UPDATED" part of filename

In [10]:
# Load updated dataframe
all_cars_UPDATED = pd.read_csv('data/all_cars_UPDATED.csv') # in later iterations, remove the "_UPDATED" part of filename
all_cars_UPDATED

Unnamed: 0,timestamp,price,year,make,model,body,mileage,title_type,city,state,...,ext_color,int_color,transmission,liters,cylinders,fuel_type,num_doors,ext_condition,int_condition,drive_type
0,1584032993,23566,2018,Ford,Edge,Sport Utility,36285.0,Clean Title,Salt Lake City,UT,...,maroon,,Automatic,,6.0,Gasoline,4.0,,,AWD
1,1584032980,30737,2019,Ford,F-150,Truck,14012.0,,Layton,UT,...,white,medium earth gray,Automatic,,8.0,Flex Fuel,4.0,,,4-Wheel Drive
2,1584032980,41436,2018,Jeep,Wrangler Unlimited,Convertible,25.0,,West Valley City,UT,...,xtreme purple pearlcoat,black,Automatic,,6.0,Gasoline,4.0,,,4-Wheel Drive
3,1584032969,15491,2008,GMC,Yukon,Sport Utility,111188.0,Clean Title,St. George,UT,...,summit white,,Automatic,,8.0,Flex Fuel,,,,4-Wheel Drive
4,1584032969,31560,2019,Ford,F-150,Truck,11654.0,,Layton,UT,...,white,,Automatic,,8.0,Flex Fuel,4.0,,,4-Wheel Drive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
216,1584030225,17989,2017,Toyota,RAV4,Sport Utility,36432.0,Clean Title,North Salt Lake,UT,...,blue,gray,Automatic,,4.0,Gasoline,,,,4-Wheel Drive
217,1584030215,28589,2017,Toyota,Highlander,Sport Utility,34668.0,Clean Title,North Salt Lake,UT,...,black,gray,Automatic,,6.0,Gasoline,,,,AWD
218,1584030207,19275,2019,Hyundai,Tucson,Sport Utility,19227.0,Clean Title,North Salt Lake,UT,...,gray,black,Automatic,,4.0,Gasoline,,,,AWD
219,1584030205,13995,2013,Nissan,Titan,Truck,95000.0,Clean Title,Salt Lake City,UT,...,black,tan,Automatic,,8.0,Flex Fuel,4.0,Excellent,Excellent,4-Wheel Drive


In [None]:
# Now try full thing while looping through proxies and associated user-agents



In [None]:
# Playing around with determining if car listing is still good (or if it's been removed)
testurl = "https://cars.ksl.com/listing/9999999"

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

resp = requests.get(testurl, headers = {'User-Agent': user_agent})
lsthtml = resp.content
lstsoup = BeautifulSoup(lsthtml)

if lstsoup.title.text.strip().lower() == 'not found':
    print('bad link')

In [None]:
# Playing around with timestamps for use when checking for new data

print(datetime.fromtimestamp(all_cars['timestamp'][0]).isoformat())
print(datetime.now())
currtime = time.time()
print(currtime)
print(datetime.fromtimestamp(currtime).isoformat())

In [None]:
# Example of working live html parser without crawler block and user-agent spoof

# url = "http://www.python.org"
# resp = requests.get(url)
# html = resp.content
# print(html)

In [None]:
#################################################
####### DEPRECATED AS OF MARCH 12, 2020 #########
#################################################

# TCH: Many functionalities implemented in the carscraper function have not been copied over to this cell block


### Working example for a SINGLE KSL search results page

# maxresults = 20 # Set max number of listings to parse (per search results page)

# # Define root url for KSL cars
# rooturl = "https://cars.ksl.com"

# # Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# # This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"

# # Need to spoof a user-agent in order to get past crawler block
# user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

# # Note: The above user_agent might need to be rotated (along with IP) to avoid IP ban
# # Example found on https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

# all_cars = []

# # Open live page (as opposed to downloaded)
# resp = requests.get(url, headers = {'User-Agent': user_agent})
# html = resp.content
# pgsoup = BeautifulSoup(html)
# lastpg = int(pgsoup.find(attrs={"title": "Go to last page"}).text.strip()) # Note that this is 1 more than number from href for this page
# # print(f'Total number of search results pages: {lastpg}')
# # print()

# links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
# # print(f'Total number of links found on current page: {len(links)}')
# tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps
# # print(f'Total number of timestamps found on current page: {len(tstamps)}')

# # print()

# # for tstamp in tstamps:
# #     print(int(re.search('(\d+)',tstamp.text).group(0))) # <-- This is WORKING code to extract timestamp for each listing from search page
# # print()

# print(f'Limiting Subsequent Listing Results to {maxresults}')

# # Loop through links and scrape data for each new listing
# with progressbar.ProgressBar(max_value=maxresults) as bar:
#     for idx, link in enumerate(links[:maxresults]): # *** only load first x results for now to avoid ban before implementing spoofing

#         # Reset all fields to None before next loop
#         price=year=make=model=body=mileage=title_type=city=state=seller=None
#         trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=num_doors=ext_condition=int_condition=drive_type=None
        
#         # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
#         # Regular expressions should come in handy here

#         cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
#         currlink = link['href'][:cutidx]

#         # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
#         # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

#         # Open listing link and pull html from it
#         fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

#         resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
#         lsthtml = resp.content
#         lstsoup = BeautifulSoup(lsthtml)

#         # Get listing price
#         price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

#         # Get seller's location
#         location = lstsoup.select('h2.location > a')[0].text.strip()
#         city, state = location.split(',')
#         city = city.strip()
#         state = state.strip()

#         # Get seller type (dealer or owner)
#         sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
#         if re.search('(Dealer)', sellerstr):
#             seller = 'Dealer'
#         elif re.search('(Owner)', sellerstr):
#             seller = 'Owner'

#         # Get timestamp
#         tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

#         # Get table of car specs
#         specs = lstsoup.select('ul.listing-specifications')

#         for li in specs[0].find_all('li'):
#             lititle = li.select('span.title')[0].text.strip().strip(':')
#             livalue = li.select('span.value')[0].text.strip().strip(':')
            
#             if livalue.lower() == 'not specified':
#                 livalue = None

#             # Now a bunch of if-else statements to determine which column to add data to
#             # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
#             if lititle.lower() == 'year':
#                 if livalue:
#                     year = int(livalue)
#                 else:
#                     year = livalue
#             elif lititle.lower() == 'make':
#                 make = livalue
#             elif lititle.lower() == 'model':
#                 model = livalue
#             elif lititle.lower() == 'body':
#                 body = livalue
#             elif lititle.lower() == 'mileage':
#                 if livalue:
#                     mileage = int(livalue.replace(',',''))
#                 else:
#                     mileage = livalue
#             elif lititle.lower() == 'title type':
#                 title_type = livalue
                
#             # Below this are non-required specs    
#             elif lititle.lower() == 'trim':
#                 trim = livalue
#             elif lititle.lower() == 'exterior color':
#                 if livalue:
#                     ext_color = livalue.lower()
#                 else:
#                     ext_color = livalue
#             elif lititle.lower() == 'interior color':
#                 if livalue:
#                     int_color = livalue.lower()
#                 else:
#                     int_color = livalue
#             elif lititle.lower() == 'transmission':
#                 transmission = livalue
#             elif lititle.lower() == 'liters':
#                 try:
#                     liters = float(livalue)
#                 except:
#                     if livalue:
#                         str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
#                         if re.search('^(\D+)',str1):
#                             idxend = re.search('^(\D+)',str1).end()
#                             livalue = str1[idxend:-1]
#                         else:
#                             livalue = str1[:-1]
#                         livalue = float(livalue)
#                     else:
#                         liters = livalue
#             elif lititle.lower() == 'cylinders':
#                 if livalue:
#                     cylinders = int(livalue)
#                 else:
#                     cylinders = livalue
#             elif lititle.lower() == 'fuel type':
#                 fuel_type = livalue
#             elif lititle.lower() == 'number of doors':
#                 if livalue:
#                     num_doors = int(livalue)
#                 else:
#                     num_doors = livalue
#             elif lititle.lower() == 'exterior condition':
#                 ext_condition = livalue
#             elif lititle.lower() == 'interior condition':
#                 int_condition = livalue
#             elif lititle.lower() == 'drive type':
#                 drive_type = livalue
#             elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
#                 None # Don't want to save these
#             else:
#                 None
#                 print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

#         curr_car = pd.DataFrame({"timestamp":[tstamp],
#                                  "price":[price],
#                                  "year":[year],
#                                  "make":[make],
#                                  "model":[model],
#                                  "body":[body],
#                                  "mileage":[mileage],
#                                  "title_type":[title_type],
#                                  "city":[city],
#                                  "state":[state],
#                                  "seller":[seller],
#                                  "trim":[trim],
#                                  "ext_color":[ext_color],
#                                  "int_color":[int_color],
#                                  "transmission":[transmission],
#                                  "liters":[liters],
#                                  "cylinders":[cylinders],
#                                  "fuel_type":[fuel_type],
#                                  "num_doors":[num_doors],
#                                  "ext_condition":[ext_condition],
#                                  "int_condition":[int_condition],
#                                  "drive_type":[drive_type]})
#         try:
#             all_cars = pd.concat([all_cars, curr_car])
#         except:
#             all_cars = curr_car

#         bar.update(idx)
        
# all_cars = all_cars.reset_index()
# del all_cars['index']
# all_cars.fillna(value=pd.np.nan, inplace=True)
# all_cars