In [1]:
import pandas as pd
import time
import datetime
import progressbar
import requests
from bs4 import BeautifulSoup

In [4]:
def update_listing_info(cars_df, **kwargs):
    '''Updates a cars_dataframe with 6 new columns (views, favorites, 
    workingURL, view_rate, favorite_rate, fav_per_view). 
    REQUIRED INPUTS:
    cars_df: data frame with information as pulled from carscraper()
    VARIABLE INPUTS:
    min_age: int specifying minimum age (days) listing must be before updating information. Default 3
    min_last_pull: int specifying minimum time (days) since last pull for new information. Default 1'''
    
    # parse kwargs/set defaults
    if 'min_age' in kwargs.keys():
        if isinstance(kwargs['min_age'],int):
            min_age = kwargs['min_age']
        else:
            raise TypeError(f'Expected int for min_age but got {type(kwargs["min_age"])}.')
    else:
        min_age = 3
    if 'min_last_pull' in kwargs.keys():
        if isinstance(kwargs['min_last_pull'],int):
            min_last_pull = kwargs['min_last_pull']
        else:
            raise TypeError(f'Expected int for min_last_pull but got {type(kwargs["min_last_pull"])}.')
    else:
        min_last_pull = 1

    # new columns to add
    cars_df['views'] = np.NaN
    cars_df['favorites'] = np.NaN
    cars_df['workingURL'] = 1
    cars_df['view_rate'] = np.NaN
    cars_df['favorite_rate'] = np.NaN
    cars_df['fav_per_view'] = np.NaN
    
    # conversions to datetime
    orig_dates = cars_df['post_date']
    cars_df['post_date'] = pd.to_datetime(cars_df['post_date'])
    cars_df['lastpull_ts'] = pd.to_datetime(cars_df['lastpull_ts'], unit = 's')

    # find ads more than x days old (time.time() is in seconds)
    curr_time = pd.to_datetime(time.time(),unit='s')
    min_dt = pd.to_timedelta(min_age*60*60*24, unit='seconds') # time in seconds for use with datetime

    old_ads = cars_df['post_date'] < (curr_time - min_dt)

    # find ads that haven't been pulled for more than x days
    min_last_pull_dt = pd.to_timedelta(min_last_pull*60*60*24, unit='seconds') # time in seconds for use with datetime
    no_recent_update = cars_df['lastpull_ts'] < (curr_time - min_last_pull_dt)

    # subselect ads that need updating based on previous criteria and having a working URL last time it was checked
    cars_need_update = cars_df[old_ads & no_recent_update & cars_df['workingURL']]

    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'

    # iterate through, pulling new information from each ad
    last_pull = []
    views = []
    favorites = []
    working_url = []
    with progressbar.ProgressBar(max_value=len(cars_need_update.index)) as bar:
        for i, ad in cars_need_update.iterrows():
            ad_response = requests.get(ad['link'], headers = {'User-Agent': user_agent})
            pull_ts = pd.to_datetime(time.time(), unit='s')
            last_pull.append(pull_ts)
            ad_soup = BeautifulSoup(ad_response.content)

            # Check if link is still good (i.e. listing is still active)
            if ad_soup.title.text.strip().lower() == 'not found':
                working_url.append(0)
                views.append(None)
                favorites.append(None)
            else:
                working_url.append(1)

                # get views
                viewcount = int(ad_soup.select('span.vdp-info-value')[1].text.split()[0])
                views.append(viewcount)

                # get favorites
                favoritecount = int(ad_soup.select('span.vdp-info-value')[2].text.split()[0])
                favorites.append(favoritecount)
            bar.update(i)
            
    cars_updated = cars_need_update
    cars_updated['views'] = views
    cars_updated['favorites'] = favorites
    cars_updated['lastpull_ts'] = last_pull
    cars_updated['workingURL'] = working_url
    cars_updated['fav_per_view'] = cars_updated['favorites'] / cars_updated['views']
    # rates calculated per day
    cars_updated['view_rate'] = cars_updated['views'] / ((cars_updated['lastpull_ts'] - cars_updated['post_date']).dt.total_seconds()*60*60*24)
    cars_updated['favorite_rate'] = cars_updated['favorites'] / ((cars_updated['lastpull_ts'] - cars_updated['post_date']).dt.total_seconds()*60*60*24)

    cars_df.update(cars_updated)
    
    # update timestamps to replicate original state
    cars_df['lastpull_ts'] = (cars_df['lastpull_ts'] - datetime.datetime(1970,1,1)).dt.total_seconds().astype(int)
    cars_df['post_date'] = orig_dates
    
    return cars_df

In [9]:
cars_df = pd.read_csv('data/all_cars.csv')

# cars_df = cars_df.iloc[:20,:]
cars_df = update_listing_info(cars_df,min_age=0,min_last_pull=0)
cars_df.to_csv('data/all_cars_view_fav.csv',index=False)

cars_df.head()

100% (20 of 20) |########################| Elapsed Time: 0:00:18 Time:  0:00:18


Unnamed: 0,post_date,lastpull_ts,link,price,year,make,model,body,mileage,title_type,...,int_condition,drive_type,VIN,n_pics,views,favorites,workingURL,view_rate,favorite_rate,fav_per_view
0,2020-03-22,1584993540,https://cars.ksl.com/listing/6178199,13512,2016,Ford,Fusion,Sedan,39150.0,,...,,FWD,3FA6P0H70GR384143,30,235.0,4.0,1,1.717764e-08,2.923853e-10,0.017021
1,2020-03-22,1584993540,https://cars.ksl.com/listing/6148338,59992,2020,Jeep,Gladiator,Truck,1036.0,,...,,4-Wheel Drive,1C6JJTBGXLL117026,23,1276.0,8.0,1,9.327059e-08,5.847686e-10,0.00627
2,2020-03-22,1584993541,https://cars.ksl.com/listing/5915346,1299,2005,Hyundai,Accent,Sedan,142714.0,Clean Title,...,,FWD,KMHCG45C95U580539,21,2164.0,70.0,1,1.581794e-07,5.11671e-09,0.032348
3,2020-03-22,1584993541,https://cars.ksl.com/listing/5915345,1299,2000,Ford,Mustang,Coupe,181911.0,Clean Title,...,,,1FAFP4047YF240774,19,5140.0,110.0,1,3.757116e-07,8.04052e-09,0.021401
4,2020-03-22,1584993543,https://cars.ksl.com/listing/6327170,9750,2015,Toyota,Camry,Sedan,121850.0,Clean Title,...,Good,FWD,4T1BF1FK5FU035846,19,31.0,1.0,1,2.265942e-09,7.30949e-11,0.032258


In [None]:
pd.to_timedelta(cars_df['lastpull_ts'])

In [None]:
cars_df['lastpull_ts'] = (cars_df['lastpull_ts'] - datetime.datetime(1970,1,1)).dt.total_seconds().astype(int)