# script.py

## Import all modules

In [None]:
import requests                      # fetches html
from bs4 import BeautifulSoup        # helps extract data from the html 
import pandas as pd
from datetime import datetime
import time
import numpy as np
from random import randint

## Define the headers sent with each HTML request

In [None]:
# Request headers sent with every page fetch: a desktop Chrome
# User-Agent string and an English Accept-Language preference.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.105 Safari/537.36'
    ),
    'Accept-Language': 'en-US, en;q=0.5',
}

## Define the url of Craigslist
In this case, I added `&s=` at the end of the url so that we can later append an integer result offset (0, 120, 240, ...) to step through the result pages.

In [None]:
# Search url; the result offset is appended after the trailing `&s=`.
URL = (
    'https://sandiego.craigslist.org/search/sss'
    '?query=desk+chair&sort=rel&s='
)

# Image url template; `{}` is filled with one image id per picture.
IMG_URL = 'https://images.craigslist.org/' + '{}_300x300.jpg'

## Create lists that will store data extracted from html

In [None]:
# One list per output column; element i of every list belongs to the
# same listing.
names, prices, dates, locations = [], [], [], []
urls, imgs, ids, last_seen = [], [], [], []

## Create a dictionary containing all the lists above

In [None]:
# Maps each output column name to the list that collects its values.
# The lists are shared by reference, so values appended to them later
# show up here without rebuilding the dictionary.
data = {'id':        ids,
        'name':      names,
        'price':     prices,
        'date':      dates,
        'location':  locations,
        'url':       urls,
        'images':    imgs,
        'last_seen': last_seen}

## Define all the get functions that will extract meaningful data from html

In [None]:
def get_name(post):

    """
    Retrieves the title of the listing.

    Returns the text of the post's `a` element with class
    'result-title hdrlnk', or NaN when no such element is found.
    """

    try:
        name = post.find('a', class_ = 'result-title hdrlnk').text

    except AttributeError:
        # `find` returned None (no title link). `np.nan` is used because
        # the `np.NaN` alias no longer exists in NumPy 2.0.
        name = np.nan

    return name

In [None]:
def get_price(post):

    """
    Retrieves the price of the listing.

    Reads the text of the post's `span` element with class 'result-price',
    removes the leading `$` and any thousands separators, and returns the
    value as an int. Returns NaN when the span is missing or its text is
    not a whole number.
    """

    try:
        price_text = post.find('span', class_ = 'result-price').text
        # "$1,200" -> "1200": int() rejects both the currency sign and commas.
        price = int(price_text.strip().lstrip('$').replace(',', ''))

    except (AttributeError, ValueError):
        # No price span, or text that is not a number.
        price = np.nan

    return price

In [None]:
def get_date(post):

    """
    Retrieves the date on which the listing was posted.

    Returns the `datetime` attribute of the post's `time` element, or NaN
    when the element or the attribute is missing, so that a single
    malformed post does not abort the whole scrape.
    """

    date = post.find('time')

    try:
        return date['datetime']

    except (TypeError, KeyError):
        # TypeError: no `time` element (None); KeyError: no `datetime` attribute.
        return np.nan

In [None]:
def get_location(post):

    """
    Retrieves the location at which the listing can be purchased.

    Returns the text of the post's `span` element with class 'result-hood'
    without its first two and its last character (presumably the
    surrounding " (" and ")" -- confirm against live markup), or NaN when
    the span is missing.
    """

    try:
        location = post.find('span', class_ = 'result-hood').text[2:-1]

    except AttributeError:
        # `find` returned None (no location given). `np.nan` is used because
        # the `np.NaN` alias no longer exists in NumPy 2.0.
        location = np.nan

    return location

In [None]:
def get_url(post):

    """
    Retrieves the link to the listing's Craigslist page.

    The link is the `href` attribute of `post.a`.
    """

    anchor = post.a
    return anchor['href']

### ⚠️ Retrieving images from a listing works differently from the other fields

Notice that each image of a listing has a source url of the form **base url** + **data-id**.  
So, we build the image urls after extracting the `data-ids` attribute.

In [None]:
def get_img(post):

    """
    Retrieves image urls of the listing.

    Returns a list with one url per entry of the comma-separated `data-ids`
    attribute on the post's `a` element with class 'result-image gallery',
    or NaN when that element or the attribute is missing.
    """

    try:
        anchor_tag = post.find('a', class_ = 'result-image gallery')
        image_ids = anchor_tag.get('data-ids').split(',')

    except AttributeError:
        # One of the lookups returned None, i.e. the listing has no gallery.
        # `np.nan` is used because `np.NaN` no longer exists in NumPy 2.0.
        return np.nan

    # Each entry loses its first two characters (presumably a prefix such as
    # "3:" -- confirm against live markup) before it is placed into IMG_URL.
    return [IMG_URL.format(image_id[2:]) for image_id in image_ids]

In [None]:
def get_id(post):

    """
    Retrieves the ID of the original listing.

    Returns the post's `data-repost-of` attribute when it is present and
    non-empty, so that a repost maps to the listing it repeats; otherwise
    returns the post's own `data-pid` attribute.
    """

    # Attribute values come back as strings (or None when absent), so the
    # check is on presence; an `isinstance(..., int)` test never matches.
    post_id = post.get('data-repost-of')

    if post_id:
        return post_id

    return post.get('data-pid')

In [None]:
def get_last_seen():

    """
    Returns the current local time as an `asctime`-formatted string.

    The value is stored as the moment a listing was most recently seen.
    """

    now = time.localtime()
    return time.asctime(now)

## Define a function that appends new data to corresponding lists

In [None]:
def get_all_page(posts):

    """
    Extracts every field of every post and stores it in the column lists.

    For each post the getters run in a fixed order (name, price, date,
    location, url, images, id) and each result is appended to its list;
    the time of the scrape is appended to `last_seen` last.
    """

    # (target list, getter) pairs, in the order the values are appended.
    extractors = (
        (names, get_name),
        (prices, get_price),
        (dates, get_date),
        (locations, get_location),
        (urls, get_url),
        (imgs, get_img),
        (ids, get_id),
    )

    for post in posts:
        for column, extract in extractors:
            column.append(extract(post))

        # get_last_seen takes no post, so it is handled outside the table.
        last_seen.append(get_last_seen())

## Define a function that collects listing data from all pages

In [None]:
def search_all_pages():

    """
    Collects listing data from every result page of the search.

    Requests URL with an increasing result offset (0, 120, 240, ...),
    passes each page's `li` elements with class 'result-row' to
    get_all_page, and stops once the offset reaches the total count shown
    on the page, or when the page shows no total count.

    Returns 'Cannot access website' if a response does not have status
    200; otherwise returns None.
    """

    # Step between offsets; assumes 120 results per page -- confirm.
    page_size = 120
    offset = 0

    while True:
        url = URL + str(offset)
        # The timeout keeps a stalled connection from hanging the scrape.
        response = requests.get(url, headers = HEADERS, timeout = 30)

        if response.status_code != 200:
            return 'Cannot access website'

        html_soup = BeautifulSoup(response.text, features="lxml")
        results = html_soup.find_all('li', class_ = 'result-row')
        get_all_page(results)

        total_count = html_soup.find('span', class_ = 'totalcount')
        if total_count is None:
            # No total on the page (e.g. an empty result set): stop here
            # instead of failing on `None.text`.
            break

        offset += page_size
        # Offsets are zero-based, so an offset equal to the total is already
        # past the last listing; `>=` avoids one extra, empty request.
        if offset >= int(total_count.text):
            break

        # Random pause between requests.
        time.sleep(randint(1,5))

## Retrieve all listing data that is currently posted on Craigslist

In [None]:
# Scrape every result page; the shared column lists are filled as a side
# effect of the call.
status = search_all_pages()

# search_all_pages returns a message only when a request failed; show it
# instead of discarding it, since the table below may then be incomplete.
if status is not None:
    print(status)

current = pd.DataFrame(data = data)

## Combine current and past data

In [None]:
# Columns that must all agree for a past row and a current row to count as
# the same listing.
cols = ['id', 'name', 'price', 'date', 'location', 'url', 'images']

current['id']=current['id'].astype(int)

# tracking.csv does not exist before the first run; start from an empty
# history in that case instead of failing.
try:
    past = pd.read_csv('tracking.csv')
except FileNotFoundError:
    past = pd.DataFrame(columns = cols + ['last_seen'])

if past.empty or current.empty:
    # Nothing to reconcile: one side has no rows, so the result is simply
    # the other side.
    duplicate_old = past.iloc[0:0]
    past_unique = past.copy()
    combined = current.copy() if past.empty else past.copy()

else:
    # `images` holds Python lists in `current` but their text form in `past`
    # (that is how they were written to the csv). Compare on the text form;
    # lists are unhashable and would never equal the stored strings.
    current_keys = current.assign(
        images = current['images'].map(lambda v: str(v) if isinstance(v, list) else v))

    # Listings present in both tables, carrying the last_seen value from
    # `past` so that the rows are identical to their originals in `past`.
    # An inner join keeps exactly the matched rows; a left join followed by
    # dropna() would also discard matched rows that have any empty field.
    duplicate_old = past.merge(current_keys, how = 'inner', on = cols).drop('last_seen_y', axis = 1)
    duplicate_old = duplicate_old.rename(columns = {'last_seen_x': 'last_seen'})

    # Listings found only in the past: every row that also occurs in
    # `duplicate_old` appears more than once in the concatenation and is
    # removed entirely by keep = False.
    past_unique = pd.concat([past, duplicate_old, duplicate_old], sort = False).drop_duplicates(keep = False)

    # Past-only rows keep their old last_seen; rows seen in this run take the
    # new last_seen from `current`.
    combined = pd.concat([past_unique, current], sort = False)

combined.to_csv('tracking.csv', index = False)