## To Do:
* Rotate through user-agents (and IPs?)
* Consider expanding IP list to include limited countries outside US
* Figure out how to use API for getting user-agents (to avoid IP ban)
* Stop crawling when the current listing's timestamp <= max(timestamp) of the dataframe loaded from file
* Handle "L" in cylinders value with regular expression
* Get page views (after certain time period?)
* Get number of times listing has been favorited?
* Limit results to just those with a picture?
* Save dataframe to csv

In [None]:
from bs4 import BeautifulSoup # if this isn't installed, use pip install beautifulsoup4
import requests
import re
import pandas as pd
import numpy as np
from datetime import datetime
import time
import progressbar # if this isn't installed, use pip install progressbar2
import random

In [None]:
# Get list of proxy IPs
#
# Scrapes the proxy table published at free-proxy-list.net and builds `proxies`,
# a shuffled list of "http://<ip>:<port>" strings. Only rows whose 3rd cell is 'US',
# whose 7th cell is 'yes' and whose 5th cell is not 'transparent' are kept
# (presumably country code, HTTPS support and anonymity level -- verify against
# the site's current column layout).

IPurl = "https://free-proxy-list.net"

# A timeout keeps the cell from hanging forever; raise_for_status surfaces a block/ban page early
resp = requests.get(IPurl, timeout=30)
resp.raise_for_status()
IPhtml = resp.content
# Name the parser explicitly so results do not depend on which parsers happen to be installed
IPsoup = BeautifulSoup(IPhtml, "html.parser")

proxy_table = IPsoup.find(id='proxylisttable')
proxy_tbody = proxy_table.find('tbody') if proxy_table is not None else None
if proxy_tbody is None:
    raise RuntimeError(f"Could not find the 'proxylisttable' table body at {IPurl}; the page layout may have changed")

proxies = []
for tr in proxy_tbody.find_all('tr'):
    tds = [td.text.strip() for td in tr.find_all('td')]
    if len(tds) < 7:
        continue # skip malformed/short rows instead of failing on a missing cell
    if tds[2] == 'US' and tds[6] == 'yes' and tds[4] != 'transparent':
        proxies.append(f'http://{tds[0]}:{tds[1]}') # grab the IP addresses matching the above criteria
random.shuffle(proxies)
proxies

In [None]:
# Get list of user-agents
#
# Scrapes Chrome user-agent strings from whatismybrowser.com into `UAlist` (shuffled).
# Relies on `proxies` from the proxy cell having been built first.

### Need to use API at some point rather than crawl/scrape since you can get 500 user-agents for free per month...and my IP got banned

UAurl = "https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/"

# A timeout keeps the cell from hanging forever; raise_for_status surfaces a block/ban page early
resp = requests.get(UAurl, timeout=30)
resp.raise_for_status()
UAhtml = resp.content
# Name the parser explicitly so results do not depend on which parsers happen to be installed
UAsoup = BeautifulSoup(UAhtml, "html.parser")

UAlist = []
matches = UAsoup.select("table.table-useragents td.useragent")
# Only get as many user-agents as there are proxies. Dangerous to use more than one user-agent per IP
for match in matches[:len(proxies)]:
    anchor = match.find('a')
    if anchor is None:
        continue # cell without a link: nothing to extract (UAlist may then be shorter than proxies)
    UAlist.append(anchor.text.strip())
random.shuffle(UAlist)
UAlist

In [None]:
# Experiment with using proxy IPs from above
#
# Sends one request through every entry in `proxies` and reports, per proxy,
# whether the request succeeded. A request counts as failed when `requests` raises
# (connection error, timeout, ...) or when the response has a non-2xx status.

# url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"
url = "https://httpbin.org/ip"

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

for idx, proxy in enumerate(proxies):
    # user_agent = UAlist[idx]
    try:
        # Without a timeout a dead proxy can block this loop indefinitely
        resp = requests.get(url,
                            proxies={"http": proxy, "https": proxy},
                            headers={'User-Agent': user_agent},
                            timeout=10)
        resp.raise_for_status()
        # print(resp.content)
    except requests.exceptions.RequestException:
        # Only network/HTTP failures are treated as proxy errors; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate
        print('Proxy error')
    else:
        print(f'Success! proxy used: {proxy}')

In [None]:
### Working example for a SINGLE KSL search results page
#
# Loads one search results page, follows up to `maxresults` listing links on it,
# scrapes each listing's price, location, seller type and spec table, and collects
# everything into the dataframe `all_cars` (one row per listing).

maxresults = 20 # Set max number of listings to parse (per search results page)

# Define root url for KSL cars
rooturl = "https://cars.ksl.com"

# Note the url below specifies that we're looking for 96 per page and the default sort of newest to oldest posting
# This note about newest to oldest is useful so that we can avoid scraping repeat listings based on their timestamps
url = "https://cars.ksl.com/search/newUsed/Used;Certified/perPage/96/page/0"

# Need to spoof a user-agent in order to get past crawler block
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'

# Note: The above user_agent might need to be rotated (along with IP) to avoid IP ban
# Example found on https://www.scrapehero.com/how-to-fake-and-rotate-user-agents-using-python-3/

# Columns of the resulting dataframe, in output order
car_columns = ["timestamp", "price", "year", "make", "model", "body", "mileage",
               "title_type", "city", "state", "seller", "trim", "ext_color",
               "int_color", "transmission", "liters", "cylinders", "fuel_type",
               "num_doors", "ext_condition", "int_condition", "drive_type"]

# Maps a lower-cased spec label from the listing page to (dataframe column, converter).
# The converter is applied only to non-empty values; None means "keep the raw string".
spec_fields = {
    'year': ('year', int),
    'make': ('make', None),
    'model': ('model', None),
    'body': ('body', None),
    'mileage': ('mileage', lambda value: int(value.replace(',', ''))),
    'title type': ('title_type', None),
    # Below this are non-required specs
    'trim': ('trim', None),
    'exterior color': ('ext_color', str.lower),
    'interior color': ('int_color', str.lower),
    'transmission': ('transmission', None),
    'liters': ('liters', None), # kept as text: this may or may not have an "L" attached to it (e.g. "2.4L")
    'cylinders': ('cylinders', int),
    'fuel type': ('fuel_type', None),
    'number of doors': ('num_doors', int),
    'exterior condition': ('ext_condition', None),
    'interior condition': ('int_condition', None),
    'drive type': ('drive_type', None),
}

# Spec labels that are deliberately not saved
ignored_specs = {'vin', 'stock number', 'dealer license'}

# Open live page (as opposed to downloaded)
resp = requests.get(url, headers={'User-Agent': user_agent}, timeout=30)
resp.raise_for_status() # fail early with a clear HTTP error if the crawler block kicks in
html = resp.content
# Name the parser explicitly so results do not depend on which parsers happen to be installed
pgsoup = BeautifulSoup(html, "html.parser")
lastpg = int(pgsoup.find(attrs={"title": "Go to last page"}).text.strip()) # Note that this is 1 more than number from href for this page

links = pgsoup.select("div.title > a.link") # grab all 96 (or up to 96) links
tstamps = pgsoup.select("div.listing-detail-line script") # grab all 96 (or up to 96) timestamps

print(f'Limiting Subsequent Listing Results to {maxresults}')

# One single-row dataframe per listing; concatenated once after the loop
car_frames = []

# Loop through links and scrape data for each new listing
with progressbar.ProgressBar(max_value=maxresults) as bar:
    for idx, link in enumerate(links[:maxresults]): # *** only load first x results for now to avoid ban before implementing spoofing

        # Start every listing with all fields set to None
        car = dict.fromkeys(car_columns)

        # Strip the "?ad_cid=[number]" suffix from the link as it's not needed to load the page properly.
        # Links without that suffix are used unchanged.
        currlink = link['href'].split('?ad_cid=', 1)[0]

        # Somewhere here we should do a check to make sure that the timestamp for currlink is newer than our newest file in our repository
        # That is, compare the timestamps with a simple conditional, where if the conditional is not met, this loop breaks to avoid useless computation time

        # Open listing link and pull html from it
        fulllink = '/'.join([rooturl.rstrip('/'), currlink.lstrip('/')])

        resp = requests.get(fulllink, headers={'User-Agent': user_agent}, timeout=30)
        resp.raise_for_status()
        lsthtml = resp.content
        lstsoup = BeautifulSoup(lsthtml, "html.parser")

        # Get listing price (kept as a string of digits with '$' and ',' removed)
        car['price'] = lstsoup.select('h3.price')[0].text.strip().replace('$', '').replace(',', '')

        # Get seller's location; split on the LAST comma so a comma inside the city name doesn't break unpacking
        location = lstsoup.select('h2.location > a')[0].text.strip()
        location_parts = location.rsplit(',', 1)
        car['city'] = location_parts[0].strip()
        if len(location_parts) == 2:
            car['state'] = location_parts[1].strip()

        # Get seller type (dealer or owner); stays None when neither word is present
        sellerstr = lstsoup.select('div.fsbo')[0].text.strip()
        if 'Dealer' in sellerstr:
            car['seller'] = 'Dealer'
        elif 'Owner' in sellerstr:
            car['seller'] = 'Owner'

        # Get timestamp: first run of digits in the matching <script> from the search page
        car['timestamp'] = int(re.search(r'\d+', tstamps[idx].text).group(0))

        # Get table of car specs
        specs = lstsoup.select('ul.listing-specifications')

        for li in specs[0].find_all('li'):
            lititle = li.select('span.title')[0].text.strip().strip(':')
            livalue = li.select('span.value')[0].text.strip().strip(':')

            if livalue.lower() == 'not specified':
                livalue = None

            spec_key = lititle.lower()
            if spec_key in spec_fields:
                column, convert = spec_fields[spec_key]
                # Missing/empty values are stored as-is; only real values get converted
                car[column] = convert(livalue) if (livalue and convert) else livalue
            elif spec_key not in ignored_specs:
                print(f'Unmatched param {lititle}: {livalue}') # <-- could take advantage of some or all of these

        car_frames.append(pd.DataFrame({column: [value] for column, value in car.items()}))

        bar.update(idx + 1) # idx is 0-based, so idx + 1 listings are done at this point

if car_frames:
    all_cars = pd.concat(car_frames, ignore_index=True)
else:
    # No listings found: still produce a dataframe with the expected columns
    all_cars = pd.DataFrame(columns=car_columns)
# `pd.np` was removed from pandas; use numpy directly
all_cars.fillna(value=np.nan, inplace=True)
all_cars

In [None]:
# Now try full thing while looping through proxies and associated user-agents



In [None]:
# Playing around with timestamps for use when checking for new data
# (compares the first scraped listing's timestamp with the current time, in both raw and ISO form)

first_listing_tstamp = all_cars['timestamp'][0]
first_listing_iso = datetime.fromtimestamp(first_listing_tstamp).isoformat()
print(first_listing_iso)

print(datetime.now())

currtime = time.time()
print(currtime)

currtime_iso = datetime.fromtimestamp(currtime).isoformat()
print(currtime_iso)

In [None]:
# Example of working live html parser without crawler block and user-agent spoof

# url = "http://www.python.org"
# resp = requests.get(url)
# html = resp.content
# print(html)