In [None]:
import pandas as pd
import numpy as np
import time
import datetime
import progressbar # if this isn't installed, use pip install progressbar2
import requests
from bs4 import BeautifulSoup # if this isn't installed, use pip install beautifulsoup4
from selenium import webdriver # if not installed, do pip install selenium
import random
from itertools import cycle

In [None]:
def generateProxies():
    '''Scrape https://www.us-proxy.org/ with Selenium/Chrome and return proxies.

    The table is filtered through its "https" dropdown option and its page
    length dropdown before the rows are read.

    RETURNS:
    set of 'http://<ip>:<port>' strings (a set, so there are no duplicates).
    Rows marked as transparent are excluded.'''
    # Local import: the find_elements_by_* helpers were removed in Selenium 4,
    # while find_elements(By.XPATH, ...) works on both old and new releases.
    from selenium.webdriver.common.by import By

    IPurl = "https://www.us-proxy.org/" # <-- the robots.txt file for this site allows full access for all user-agents

    # Specify incognito options for Chrome
    option = webdriver.ChromeOptions()
    option.add_argument("--incognito")
    option.add_argument("--start-maximized")

    # Create new Chrome instance
    browser = webdriver.Chrome(options=option)

    proxies = set() # using a set ensures there aren't duplicates
    try:
        # Go to desired website
        browser.get(IPurl)

        # Filter by https only
        https_button = browser.find_elements(By.XPATH, "//*[@id='proxylisttable']/tfoot/tr/th[7]/select/option[3]")[0]
        https_button.click()

        # Set to 80 results
        maxnum_button = browser.find_elements(By.XPATH, "//*[@id='proxylisttable_length']/label/select/option[3]")[0]
        maxnum_button.click()

        # Grab IP's and Ports from the resulting table
        rows = browser.find_elements(By.XPATH, "//*[@id='proxylisttable']/tbody/tr")

        for row in rows:
            row_text = row.text
            fields = row_text.split()

            # Placeholder/empty rows have no IP and port to read
            if len(fields) < 2:
                continue

            # Token positions after the IP and port shift whenever an earlier
            # cell contains more than one word, so look for the marker anywhere
            # in the row instead of at a fixed index. Transparent proxies are
            # skipped because they would expose our real address to KSL.
            if 'transparent' in row_text.lower():
                continue

            proxies.add(''.join(['http://', ':'.join([fields[0], fields[1]])]))
    finally:
        # Always end the driver session, even when scraping fails part-way
        browser.quit()

    return proxies

In [None]:
def _typed_kwarg(kwargs, name, types, expected, default):
    '''Return kwargs[name] after a type check, or default when it is absent.

    Raises TypeError (mentioning the `expected` description) on a wrong type.'''
    if name not in kwargs:
        return default
    value = kwargs[name]
    if not isinstance(value, types):
        raise TypeError(f'Expected {expected} for {name} but got {type(value)}.')
    return value


def _parse_listing(html):
    '''Read listing status and counters out of an ad page.

    Returns (is_live, views, favorites). A page whose title is "not found"
    gives (False, None, None). Returns None when the page is not a "not found"
    page but the counters cannot be read, so the caller can treat the pull as
    failed instead of crashing.'''
    ad_soup = BeautifulSoup(html)

    title = ad_soup.title.text.strip().lower() if ad_soup.title is not None else ''
    if title == 'not found':
        return False, None, None

    stats = ad_soup.select('span.vdp-info-value')
    try:
        # second and third info values hold the view and favorite counters;
        # drop thousands separators before converting
        viewcount = int(stats[1].text.split()[0].replace(',', ''))
        favoritecount = int(stats[2].text.split()[0].replace(',', ''))
    except (IndexError, ValueError):
        return None
    return True, viewcount, favoritecount


def update_listing_info(cars_df, **kwargs):
    '''Refreshes listing statistics in a cars dataframe, adding 6 columns when
    they are missing (views, favorites, workingURL, view_rate, favorite_rate,
    fav_per_view). Existing values in those columns are kept, so listings
    already marked with workingURL == 0 are not fetched again.

    cars_df is modified in place and also returned. Only listings that could
    be fetched and parsed are changed; failed pulls keep their stored values.
    Row labels are assumed to be unique (as produced by pd.read_csv).

    REQUIRED INPUTS:
    cars_df: data frame with information as pulled from carscraper(). Must
        contain the columns link, post_date and lastpull_ts (epoch seconds).
    VARIABLE INPUTS:
    min_age: int specifying minimum age (days) listing must be before updating information. Default 3
    min_last_pull: int specifying minimum time (days) since last pull for new information. Default 1
    use_proxy: bool indicating to use a proxy. Default False
    proxydict: dictionary mapping proxy URLs to user agents (only with use_proxy).
        Generated with generateProxies() when missing or not a dict. New
        proxies found during a refresh are added to this dictionary.
    refreshmin: int or float, minutes between proxy list refreshes. Default 15
    currproxy: str, proxy from proxydict to start with. Default is the first one
    RAISES:
    TypeError: min_age, min_last_pull or use_proxy has the wrong type.
    RuntimeError: use_proxy is set but no proxies are available.
    RETURNS:
    cars_df with refreshed columns; lastpull_ts is kept as epoch seconds.'''

    seconds_per_day = 60*60*24

    # parse kwargs/set defaults
    min_age = _typed_kwarg(kwargs, 'min_age', int, 'int', 3)
    min_last_pull = _typed_kwarg(kwargs, 'min_last_pull', int, 'int', 1)
    # default is to NOT use proxy
    use_proxy = _typed_kwarg(kwargs, 'use_proxy', (int, bool), 'int or bool', False)

    # the following were pulled manually on 3/12/20 from https://www.whatismybrowser.com/guides/the-latest-user-agent/
    user_agents = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/74.0',
                   'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0 Safari/605.1.15',
                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.62',
                   'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko']
    # user agent for direct (non-proxy) requests
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

    if use_proxy:
        # The following inputs are only useful when using a proxy
        proxydict = kwargs.get('proxydict')
        if not isinstance(proxydict, dict):
            if 'proxydict' in kwargs:
                print(f'Expected dict type for proxydict but got {type(proxydict)}. Generating new proxydict...')
            else:
                print('No proxydict found. Generating...')
            proxydict = {proxy: random.choice(user_agents) for proxy in generateProxies()}
        if not proxydict:
            raise RuntimeError('No proxies available; cannot fetch listings with use_proxy enabled.')

        refreshmin = 15
        if 'refreshmin' in kwargs:
            if isinstance(kwargs['refreshmin'], (int, float)):
                refreshmin = kwargs['refreshmin']
            else:
                print(f'Expected int or float for refreshmin but got {type(kwargs["refreshmin"])}. Set to default value of {refreshmin}.')
        else:
            print(f'No refreshmin found. Set to default value of {refreshmin}.')

        tstart = time.time() # set a start time to use for refreshing proxy list (if needed)

        # The pool must exist even when the caller names a starting proxy,
        # because it is needed to rotate after a failed request.
        proxy_pool = cycle(proxydict) # make a pool of proxies
        currproxy = kwargs.get('currproxy')
        if isinstance(currproxy, str) and currproxy in proxydict:
            # advance the rotation so it continues right after the requested proxy
            while next(proxy_pool) != currproxy:
                pass
        else:
            currproxy = next(proxy_pool) # grab the next proxy in cycle

    # new columns to add; values from an earlier run are kept so that dead
    # links stay excluded and known counters are not wiped
    new_columns = (('views', np.nan), ('favorites', np.nan), ('workingURL', 1),
                   ('view_rate', np.nan), ('favorite_rate', np.nan), ('fav_per_view', np.nan))
    for col, default in new_columns:
        if col not in cars_df.columns:
            cars_df[col] = default
        elif col == 'workingURL':
            cars_df[col] = cars_df[col].fillna(1).astype(int)
        else:
            cars_df[col] = pd.to_numeric(cars_df[col], errors='coerce').astype(float)

    # conversions to datetime (kept in local series so the stored columns are
    # never left in a converted state)
    post_dt = pd.to_datetime(cars_df['post_date'])
    lastpull_dt = pd.to_datetime(cars_df['lastpull_ts'], unit='s')

    # find ads more than x days old (time.time() is in seconds)
    curr_time = pd.to_datetime(time.time(), unit='s')
    min_dt = pd.to_timedelta(min_age*seconds_per_day, unit='seconds')
    old_ads = post_dt < (curr_time - min_dt)

    # find ads that haven't been pulled for more than x days
    min_last_pull_dt = pd.to_timedelta(min_last_pull*seconds_per_day, unit='seconds')
    no_recent_update = lastpull_dt < (curr_time - min_last_pull_dt)

    # subselect ads that need updating based on previous criteria and having a working URL last time it was checked
    needs_update = old_ads & no_recent_update & cars_df['workingURL'].astype(bool)
    targets = cars_df.index[needs_update.to_numpy()]

    # iterate through, pulling new information from each ad
    records = [] # (row label, pull time in epoch seconds, is_live, views, favorites)
    if len(targets):
        links = cars_df.loc[targets, 'link']
        with progressbar.ProgressBar(max_value=len(links)) as bar:
            # `done` counts processed ads; row labels are not valid progress values
            for done, (label, link) in enumerate(links.items(), start=1):
                ad_response = None
                if use_proxy:
                    # limit the total number of attempts to one per proxy to prevent an endless loop
                    attempts = len(proxydict)
                    while attempts > 0:
                        if (time.time() - tstart) > 60*refreshmin: # check if it's been more than refreshmin minutes since proxy_pool updated
                            print('Refreshing proxy pool...')
                            tstart = time.time()

                            newproxies = generateProxies().difference(proxydict)

                            if newproxies:
                                proxydict.update({proxy: random.choice(user_agents) for proxy in newproxies})
                                proxy_pool = cycle(proxydict)
                                currproxy = next(proxy_pool)
                                print('Proxy pool updated!')

                        try:
                            ad_response = requests.get(link, proxies={"http":currproxy, "https":currproxy}, headers={'User-Agent': proxydict[currproxy]}, timeout=20)
                        except requests.exceptions.RequestException:
                            prevproxy = currproxy
                            currproxy = next(proxy_pool)
                            print(f'Proxy error for {prevproxy}! Next up is {currproxy}')
                            attempts -= 1
                            print(f'Attempts remaining: {attempts}')
                        else:
                            print(f'Proxy success for {currproxy}')
                            print()
                            break
                else:
                    try:
                        ad_response = requests.get(link, headers={'User-Agent': user_agent}, timeout=20)
                    except requests.exceptions.RequestException as err:
                        print(f'Request error for {link}: {err}')

                parsed = _parse_listing(ad_response.content) if ad_response is not None else None
                if parsed is None:
                    # a failed pull must not stamp the row as freshly updated
                    print(f'Could not update {link}; leaving its stored values unchanged.')
                else:
                    records.append((label, int(time.time())) + parsed)
                bar.update(done)

    if records:
        fresh = pd.DataFrame(records, columns=['label', 'lastpull_ts', 'workingURL', 'views', 'favorites']).set_index('label')

        cars_df.loc[fresh.index, 'lastpull_ts'] = fresh['lastpull_ts'].to_numpy()
        cars_df.loc[fresh.index, 'workingURL'] = fresh['workingURL'].astype(int).to_numpy()

        # counters exist only for live listings; dead ones keep their last known values
        live = fresh[fresh['workingURL'].astype(bool)]
        if not live.empty:
            views = live['views'].astype(float)
            favorites = live['favorites'].astype(float)
            # listing age in days at pull time (seconds divided by seconds per day)
            age_days = (pd.to_datetime(live['lastpull_ts'], unit='s') - post_dt.loc[live.index]).dt.total_seconds() / seconds_per_day

            cars_df.loc[live.index, 'views'] = views.to_numpy()
            cars_df.loc[live.index, 'favorites'] = favorites.to_numpy()
            cars_df.loc[live.index, 'fav_per_view'] = (favorites / views).to_numpy()
            # rates calculated per day
            cars_df.loc[live.index, 'view_rate'] = (views / age_days).to_numpy()
            cars_df.loc[live.index, 'favorite_rate'] = (favorites / age_days).to_numpy()

    # keep timestamps as whole epoch seconds; skipped when values are missing
    # because NaN cannot be stored in an integer column
    if cars_df['lastpull_ts'].notna().all():
        cars_df['lastpull_ts'] = cars_df['lastpull_ts'].astype(int)

    return cars_df

In [None]:
# Load every scraped listing, refresh its view/favorite statistics through
# rotating proxies (no minimum listing age, no minimum time since last pull),
# then save the result.
# For a quick trial run, pass pd.read_csv('data/all_cars.csv').iloc[:5,:] instead.
cars_df = update_listing_info(
    pd.read_csv('data/all_cars.csv'),
    min_age=0,
    min_last_pull=0,
    use_proxy=True,
)
cars_df.to_csv('data/all_cars_view_fav.csv', index=False)

# Preview the first rows of the updated table
cars_df.head()