## To Do:
* Rotate through user-agents and IPs
* Keep a database or dictionary of IP:user-agent combos to avoid using different user-agents for the same IP?
* Consider expanding IP list to include limited countries outside US
* Figure out how to use API for getting user-agents (to avoid IP ban)
* Stop crawling when current listing's timestamp <= max(timestamp) from loaded dataframe from file
* Get page views (after certain time period?)
* Get number of times listing has been favorited?
* Limit results to just those with a picture?
* Get count of number of pictures?
* Save dataframe to csv

In [1]:
from bs4 import BeautifulSoup # if this isn't installed, use pip install beautifulsoup4
import requests
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time
import progressbar # if this isn't installed, use pip install progressbar2
import random
import json

In [None]:
# Get list of proxy IPs

IPurl = "https://free-proxy-list.net"

resp = requests.get(IPurl)
IPhtml = resp.content
IPsoup = BeautifulSoup(IPhtml)

proxies = []
for tr in IPsoup.find(id='proxylisttable').find('tbody').find_all('tr'):
    tds = tr.find_all('td')
    if (tds[2].text.strip() == 'US') & (tds[6].text.strip() == 'yes') & (tds[4].text.strip() != 'transparent'):
        proxies.append(''.join(['http://', ':'.join([tds[0].text.strip(), tds[1].text.strip()])])) # grab the IP addresses matching the above criteria
random.shuffle(proxies)
proxies

In [None]:
# Get list of user-agents

### Need to use API at some point rather than crawl/scrape since you can get 500 user-agents for free per month...and my IP got banned
### username: automodeals
### pw: kslclass123
### API documentation: https://developers.whatismybrowser.com/api/docs/v2/

API_key = '5ecab60888f7aebfbc4aad5850de52fa'

UAurl = "https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/"

resp = requests.get(UAurl)
UAhtml = resp.content
UAsoup = BeautifulSoup(UAhtml)

UAlist = []
matches = UAsoup.select("table.table-useragents td.useragent")
for match in matches[:len(proxies)]: # only get as many user-agents are there are proxies. Dangerous to use more than one user-agent per IP
    UAlist.append(match.find('a').text.strip())
random.shuffle(UAlist)
UAlist

In [None]:
# Use user-agents API (example from https://github.com/whatismybrowser/api-v2-sample-code/blob/master/sample-code/python-3.6/user_agent_parse.py)

API_key = '5ecab60888f7aebfbc4aad5850de52fa'

headers = {'X-API-KEY': API_key}
# UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_database_dump_url"
# UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_database_search"

# The code below works for POSTing data, but we want to GET data

UAurl = "https://api.whatismybrowser.com/api/v2/user_agent_parse"

post_data = {
    "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3282.167 Safari/537.36",
}

result = requests.post(UAurl, data=json.dumps(post_data), headers=headers)
result # if result is 200, then success!
result.json()

In [None]:
# Experiment with using proxy IPs from above

# url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
url = "https://httpbin.org/ip"

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

for idx, proxy in enumerate(proxies):
#     user_agent = UAlist[idx]
    try:
        resp = requests.get(url,proxies={"http":proxy, "https":proxy},headers={'User-Agent': user_agent})
#         print(resp.content)
        print(f'Success! proxy used: {proxy}')
    except:
        print('Proxy error')

In [None]:
### Working example for a SINGLE KSL search results page

maxresults = 20 # Set max number of listings to parse (per search results page)

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"

# Need to spoof a user-agent in order to get past crawler block
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

# Note: The above user_agent might need to be rotated (along with IP) to avoid IP ban
# Example found on https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

all_cars = []

# Open live page (as opposed to downloaded)
resp = requests.get(url, headers = {'User-Agent': user_agent})
html = resp.content
pgsoup = BeautifulSoup(html)
lastpg = int(pgsoup.find(attrs={"title": "Go to last page"}).text.strip()) # Note that this is 1 more than number from href for this page
# print(f'Total number of search results pages: {lastpg}')
# print()

links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
# print(f'Total number of links found on current page: {len(links)}')
tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps
# print(f'Total number of timestamps found on current page: {len(tstamps)}')

# print()

# for tstamp in tstamps:
#     print(int(re.search('(\d+)',tstamp.text).group(0))) # <-- This is WORKING code to extract timestamp for each listing from search page
# print()

print(f'Limiting Subsequent Listing Results to {maxresults}')

# Loop through links and scrape data for each new listing
with progressbar.ProgressBar(max_value=maxresults) as bar:
    for idx, link in enumerate(links[:maxresults]): # *** only load first x results for now to avoid ban before implementing spoofing

        # Reset all fields to None before next loop
        price=year=make=model=body=mileage=title_type=city=state=seller=None
        trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=num_doors=ext_condition=int_condition=drive_type=None
        
        # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
        # Regular expressions should come in handy here

        cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
        currlink = link['href'][:cutidx]

        # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
        # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

        # Open listing link and pull html from it
        fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

        resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
        lsthtml = resp.content
        lstsoup = BeautifulSoup(lsthtml)

        # Get listing price
        price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

        # Get seller's location
        location = lstsoup.select('h2.location > a')[0].text.strip()
        city, state = location.split(',')
        city = city.strip()
        state = state.strip()

        # Get seller type (dealer or owner)
        sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
        if re.search('(Dealer)', sellerstr):
            seller = 'Dealer'
        elif re.search('(Owner)', sellerstr):
            seller = 'Owner'

        # Get timestamp
        tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

        # Get table of car specs
        specs = lstsoup.select('ul.listing-specifications')

        for li in specs[0].find_all('li'):
            lititle = li.select('span.title')[0].text.strip().strip(':')
            livalue = li.select('span.value')[0].text.strip().strip(':')
            
            if livalue.lower() == 'not specified':
                livalue = None

            # Now a bunch of if-else statements to determine which column to add data to
            # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
            if lititle.lower() == 'year':
                if livalue:
                    year = int(livalue)
                else:
                    year = livalue
            elif lititle.lower() == 'make':
                make = livalue
            elif lititle.lower() == 'model':
                model = livalue
            elif lititle.lower() == 'body':
                body = livalue
            elif lititle.lower() == 'mileage':
                if livalue:
                    mileage = int(livalue.replace(',',''))
                else:
                    mileage = livalue
            elif lititle.lower() == 'title type':
                title_type = livalue
                
            # Below this are non-required specs    
            elif lititle.lower() == 'trim':
                trim = livalue
            elif lititle.lower() == 'exterior color':
                if livalue:
                    ext_color = livalue.lower()
                else:
                    ext_color = livalue
            elif lititle.lower() == 'interior color':
                if livalue:
                    int_color = livalue.lower()
                else:
                    int_color = livalue
            elif lititle.lower() == 'transmission':
                transmission = livalue
            elif lititle.lower() == 'liters':
                try:
                    liters = float(livalue)
                except:
                    if livalue:
                        str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
                        if re.search('^(\D+)',str1):
                            idxend = re.search('^(\D+)',str1).end()
                            livalue = str1[idxend:-1]
                        else:
                            livalue = str1[:-1]
                        livalue = float(livalue)
                    else:
                        liters = livalue
            elif lititle.lower() == 'cylinders':
                if livalue:
                    cylinders = int(livalue)
                else:
                    cylinders = livalue
            elif lititle.lower() == 'fuel type':
                fuel_type = livalue
            elif lititle.lower() == 'number of doors':
                if livalue:
                    num_doors = int(livalue)
                else:
                    num_doors = livalue
            elif lititle.lower() == 'exterior condition':
                ext_condition = livalue
            elif lititle.lower() == 'interior condition':
                int_condition = livalue
            elif lititle.lower() == 'drive type':
                drive_type = livalue
            elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
                None # Don't want to save these
            else:
                None
                print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

        curr_car = pd.DataFrame({"timestamp":[tstamp],
                                 "price":[price],
                                 "year":[year],
                                 "make":[make],
                                 "model":[model],
                                 "body":[body],
                                 "mileage":[mileage],
                                 "title_type":[title_type],
                                 "city":[city],
                                 "state":[state],
                                 "seller":[seller],
                                 "trim":[trim],
                                 "ext_color":[ext_color],
                                 "int_color":[int_color],
                                 "transmission":[transmission],
                                 "liters":[liters],
                                 "cylinders":[cylinders],
                                 "fuel_type":[fuel_type],
                                 "num_doors":[num_doors],
                                 "ext_condition":[ext_condition],
                                 "int_condition":[int_condition],
                                 "drive_type":[drive_type]})
        try:
            all_cars = pd.concat([all_cars, curr_car])
        except:
            all_cars = curr_car

        bar.update(idx)
        
all_cars = all_cars.reset_index()
del all_cars['index']
all_cars.fillna(value=pd.np.nan, inplace=True)
all_cars

In [7]:
# Make a function for the scraping done for each search page
def carscraper(url, rooturl):
    # url should be of the form "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
    # rooturl should be something like "https://cars.ksl.com"
    
    # Need to spoof a user-agent in order to get past crawler block
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

    resp = requests.get(url, headers = {'User-Agent': user_agent})
    html = resp.content
    pgsoup = BeautifulSoup(html)
    
    # Check if there are additional pages of results
    if pgsoup.find("a", {"title" : "Go forward 1 page"}):
        moreresults = 1
    else:
        moreresults = 0
    
    links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
    tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps

    # Loop through links and scrape data for each new listing
    with progressbar.ProgressBar(max_value=len(links)) as bar:
        for idx, link in enumerate(links): # *** only load first x results for now to avoid ban before implementing spoofing

#             try:
            # Reset all fields to None before next loop
            price=year=make=model=body=mileage=title_type=city=state=seller=None
            trim=ext_color=int_color=transmission=liters=cylinders=fuel_type=num_doors=ext_condition=int_condition=drive_type=None

            # We're going to want to strip the "?ad_cid=[number]" from the end of these links as they're not needed to load the page properly
            # Regular expressions should come in handy here

            cutidx = re.search('(\?ad_cid=.+)',link['href']).start()
            currlink = link['href'][:cutidx]

            # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
            # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

            # Open listing link and pull html from it
            fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

            resp = requests.get(fulllink, headers = {'User-Agent': user_agent})
            lsthtml = resp.content
            lstsoup = BeautifulSoup(lsthtml)

            # Get listing price
            price = lstsoup.select('h3.price')[0].text.strip().replace('$','').replace(',','')

            # Get seller's location
            location = lstsoup.select('h2.location > a')[0].text.strip()
            city, state = location.split(',')
            city = city.strip()
            state = state.strip()

            # Get seller type (dealer or owner)
            sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
            if re.search('(Dealer)', sellerstr):
                seller = 'Dealer'
            elif re.search('(Owner)', sellerstr):
                seller = 'Owner'

            # Get timestamp
            tstamp = int(re.search('(\d+)',tstamps[idx].text).group(0))

            # Get table of car specs
            specs = lstsoup.select('ul.listing-specifications')

            for li in specs[0].find_all('li'):
                lititle = li.select('span.title')[0].text.strip().strip(':')
                livalue = li.select('span.value')[0].text.strip().strip(':')

                if livalue.lower() == 'not specified':
                    livalue = None

                # Now a bunch of if-else statements to determine which column to add data to
                # There might be a more sophisticated way to do this, perhaps with a tuple or a dictionary?
                if lititle.lower() == 'year':
                    if livalue:
                        year = int(livalue)
                    else:
                        year = livalue
                elif lititle.lower() == 'make':
                    make = livalue
                elif lititle.lower() == 'model':
                    model = livalue
                elif lititle.lower() == 'body':
                    body = livalue
                elif lititle.lower() == 'mileage':
                    if livalue:
                        mileage = int(livalue.replace(',',''))
                    else:
                        mileage = livalue
                elif lititle.lower() == 'title type':
                    title_type = livalue

                # Below this are non-required specs    
                elif lititle.lower() == 'trim':
                    trim = livalue
                elif lititle.lower() == 'exterior color':
                    if livalue:
                        ext_color = livalue.lower()
                    else:
                        ext_color = livalue
                elif lititle.lower() == 'interior color':
                    if livalue:
                        int_color = livalue.lower()
                    else:
                        int_color = livalue
                elif lititle.lower() == 'transmission':
                    transmission = livalue
                elif lititle.lower() == 'liters':
                    try:
                        liters = float(livalue)
                    except:
                        if livalue:
                            str1 = re.search('^(.*?)L',livalue).group(0).strip().replace(' ','')
                            if re.search('^(\D+)',str1):
                                idxend = re.search('^(\D+)',str1).end()
                                livalue = str1[idxend:-1]
                            else:
                                livalue = str1[:-1]
                            try:
                                livalue = float(livalue)
                            except:
                                print(url)
                                print('****')
                                print(link)
                        else:
                            liters = livalue
                elif lititle.lower() == 'cylinders':
                    if livalue:
                        cylinders = int(livalue)
                    else:
                        cylinders = livalue
                elif lititle.lower() == 'fuel type':
                    fuel_type = livalue
                elif lititle.lower() == 'number of doors':
                    if livalue:
                        num_doors = int(livalue)
                    else:
                        num_doors = livalue
                elif lititle.lower() == 'exterior condition':
                    ext_condition = livalue
                elif lititle.lower() == 'interior condition':
                    int_condition = livalue
                elif lititle.lower() == 'drive type':
                    drive_type = livalue
                elif (lititle.lower() == 'vin') | (lititle.lower() == 'stock number') | (lititle.lower() == 'dealer license'):
                    None # Don't want to save these
                else:
                    None
                    print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

            curr_car = pd.DataFrame({"timestamp":[tstamp],
                                     "price":[price],
                                     "year":[year],
                                     "make":[make],
                                     "model":[model],
                                     "body":[body],
                                     "mileage":[mileage],
                                     "title_type":[title_type],
                                     "city":[city],
                                     "state":[state],
                                     "seller":[seller],
                                     "trim":[trim],
                                     "ext_color":[ext_color],
                                     "int_color":[int_color],
                                     "transmission":[transmission],
                                     "liters":[liters],
                                     "cylinders":[cylinders],
                                     "fuel_type":[fuel_type],
                                     "num_doors":[num_doors],
                                     "ext_condition":[ext_condition],
                                     "int_condition":[int_condition],
                                     "drive_type":[drive_type]})
            try:
                all_cars = pd.concat([all_cars, curr_car])
            except:
                all_cars = curr_car

            bar.update(idx)
#             except:
#                 print('***********************************')
#                 print(fulllink)
#                 print('***********************************')
#                 break

    all_cars = all_cars.reset_index()
    del all_cars['index']
    all_cars.fillna(value=pd.np.nan, inplace=True)
    return all_cars, moreresults

In [8]:
# Try a multi-page test using carscraper function

# set cap for number of search pages to load (i.e. pages with up to 96 listings)
maxpg = 2

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
# Also note that this url does NOT have a page number associated with it. This is added in the while loop below
lurl = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/"

count = 0
all_cars = []
while count < maxpg:
    url = lurl + str(count)
    curr_cars, moreresults = carscraper(url, rooturl)
    count += 1    
    print(f'More results? {moreresults}')
    try:
        all_cars = pd.concat([all_cars, curr_cars], ignore_index=True)
    except:
        all_cars = curr_cars
    
all_cars

100% (96 of 96) |########################| Elapsed Time: 0:01:36 Time:  0:01:36


More results? 1


100% (96 of 96) |########################| Elapsed Time: 0:02:13 Time:  0:02:13


More results? 1


Unnamed: 0,timestamp,price,year,make,model,body,mileage,title_type,city,state,...,ext_color,int_color,transmission,liters,cylinders,fuel_type,num_doors,ext_condition,int_condition,drive_type
0,1583962324,2500,2006,Chrysler,300,Sedan,174300,Rebuilt/Reconstructed Title,Salt Lake City,UT,...,silver,gray,Automatic,,6.0,Gasoline,4.0,Very Good,Very Good,RWD
1,1583962274,24988,2019,Kia,Sedona,Minivan,17732,Clean Title,Bountiful,UT,...,white,gray,Automatic,,6.0,Gasoline,5.0,Excellent,Excellent,FWD
2,1583962270,22995,2018,Toyota,Sienna,Minivan,48880,Clean Title,American Fork,UT,...,silver,black,Automatic,,6.0,Gasoline,4.0,,,FWD
3,1583962258,25995,2018,Ford,F-150,Truck,32691,Clean Title,American Fork,UT,...,gray,medium earth gray,Automatic,,6.0,Flex Fuel,4.0,,,4-Wheel Drive
4,1583962250,250,1999,Dodge,Caravan,Minivan,300000,Clean Title,South Salt Lake,UT,...,,,,,6.0,Flex Fuel,,,,FWD
5,1583962233,11199,2013,Nissan,Pathfinder,Sport Utility,95551,,Woods Cross,UT,...,brilliant silver,charcoal,Automatic,,6.0,Gasoline,4.0,,,4-Wheel Drive
6,1583962165,2500,1996,Ford,Ranger,Truck,123000,Clean Title,South Jordan,UT,...,,,Manual,,4.0,Gasoline,2.0,,,
7,1583962139,22995,2018,Toyota,Sienna,Minivan,47772,Clean Title,American Fork,UT,...,gray,ash,Automatic,,6.0,Gasoline,4.0,,,FWD
8,1583962139,41995,2019,Ford,F-150,Truck,32684,Clean Title,Springville,UT,...,red,black,Automatic,,6.0,Gasoline,,,,4-Wheel Drive
9,1583962138,36500,2016,Ford,F-150,Truck,53840,Clean Title,Springville,UT,...,silver,black,Automatic,,6.0,Gasoline,,,,4-Wheel Drive


In [9]:
# Save dataframe to csv

all_cars.to_csv('data/all_cars.csv', index=False)

In [13]:
# Load dataframe
all_cars = pd.read_csv('data/all_cars.csv')

# get most recent timestamp from the dataframe
rep_ts = all_cars['timestamp'].max()

In [None]:
# Now scrape for more cars and check for timestamp



In [None]:
# Now try full thing while looping through proxies and associated user-agents



In [None]:
# Playing around with timestamps for use when checking for new data

print(datetime.fromtimestamp(all_cars['timestamp'][0]).isoformat())
print(datetime.now())
currtime = time.time()
print(currtime)
print(datetime.fromtimestamp(currtime).isoformat())

In [None]:
# Example of working live html parser without crawler block and user-agent spoof

# url = "http://www.python.org"
# resp = requests.get(url)
# html = resp.content
# print(html)