In [15]:
'''
Scrapes the cars & trucks listings in the For Sale section of the Washington DC Craigslist.
The scraped data is cleaned with Tableau Prep and visualized using Tableau Public.
My Tableau Public URL: 
Date: 07/24/2018
Author: Sunil Vejandla
Python Version: 3.6.5
Pandas Version: 0.22
Reference for the Request Monitoring part: https://www.dataquest.io/blog/web-scraping-beautifulsoup/ by Alex Olteanu
'''

# Scraping Cars and Trucks section of Washington DC's Craigslist 
# to get the car name and price listed

import string
import sys
import unicodedata
from random import randint
from time import sleep
from time import time
#to warn us if something's off
from warnings import warn

import bs4
import pandas as pd
from IPython.core.display import clear_output
from requests import RequestException
from requests import get

# Site root and the search URL of the cars & trucks section
BASE_URL = 'https://washingtondc.craigslist.org'
CTA_URL = BASE_URL + '/search/cta'

# CSV file the scraped listings are written to
filename = 'DC_CL_Cars_Trucks_test1.csv'

# Accumulators filled by the scraping loop, one entry per scraped ad
auto_names, auto_prices, auto_urls = [], [], []

# Zero-based indices of the first 25 result pages
pages = list(range(25))

# State for request monitoring: number of requests made and when we started
requests = 0
start_time = time()

# Upper bound on the number of page requests made in one run
MAX_REQUESTS = 25
# Offset step used in the '?s=' query parameter (page k starts at k * 120)
RESULTS_PER_PAGE = 120
# Seconds to wait for a response before giving up on a page
REQUEST_TIMEOUT = 30

for page in pages:
    # Check the limit *before* sending anything, so no request is wasted
    if requests >= MAX_REQUESTS:
        warn('Breaking the loop as the number of requests made has reached the limit of {}'.format(MAX_REQUESTS))
        break

    # Form the URL in craigslist format, e.g.
    # https://washingtondc.craigslist.org/search/cta?s=120
    # The first page takes no offset. A separate name is used so the
    # module-level CTA_URL is not overwritten on every iteration.
    if page == 0:
        page_url = CTA_URL
    else:
        page_url = '{}?s={}'.format(CTA_URL, page * RESULTS_PER_PAGE)

    # Make a get request; a timeout keeps a stalled connection from hanging
    # the whole run, and a failed page is reported and skipped instead of
    # aborting the pages that are still to come.
    requests += 1
    try:
        response = get(page_url, timeout=REQUEST_TIMEOUT)
    except RequestException as error:
        warn('Request:{}; Failed:{}'.format(requests, error))
        response = None

    #pause the loop
    sleep(randint(4, 10))

    #Monitor the requests
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)

    if response is None:
        continue

    #Throw a warning if the request doesn't return a 200 status code and
    #skip the page: its body is not expected to hold a listing to parse
    if response.status_code != 200:
        warn('Request:{}; Status Code:{}'.format(requests, response.status_code))
        continue

    # parsing response.text using Python's built-in HTML parser.
    html_soup = bs4.BeautifulSoup(response.text, 'html.parser')
    #finding the div with the 'content' class
    auto_containers_div = html_soup.find('div', class_='content')
    if auto_containers_div is None:
        # find() returns None when nothing matches; skip rather than crash
        warn('Request:{}; no div with class "content" found'.format(requests))
        continue
    #finding all of the list items with the 'result-row' class. These are the actual ads on the page
    auto_containers = auto_containers_div.find_all('li', class_='result-row')

    for auto_container in auto_containers:
        #only ads that have a price listed are kept
        price_tag = auto_container.find('span', class_='result-price')
        if price_tag is None:
            continue

        # Resolve all three fields first and append afterwards, so the
        # three lists always grow together and stay aligned.
        name = auto_container.p.a.text
        url_link = auto_container.find('a', class_='result-title hdrlnk').attrs['href']
        price = price_tag.text

        auto_names.append(name)
        auto_urls.append(url_link)
        auto_prices.append(price)

#creating a pandas data frame. The column order is given explicitly so the
#CSV layout does not depend on how the installed pandas orders dict keys.
autos_df = pd.DataFrame({'name': auto_names,
                         'price': auto_prices,
                         'url': auto_urls},
                        columns=['name', 'price', 'url'])

#only keeping the alphabets and numbers in name. Removing all symbols, and special characters.
#findall yields the list of alphanumeric runs per name; they are re-joined with single spaces.
clean_symbols = autos_df['name'].str.findall(r'[a-zA-Z0-9]+')
autos_df['name'] = clean_symbols.str.join(' ')

#extracting year from the name column. There will be NaNs.
#Only a stand-alone 4-digit token starting with 19 or 20 counts as a year, so
#mileage ("81000 miles") or model codes ("K3500") are not mistaken for one.
auto_year = autos_df['name'].str.extract(r'\b((?:19|20)\d{2})\b', expand=False)

#adding a new column to the data frame
autos_df['make_year'] = auto_year

#writing the data frame to a csv file for analysis.
#NOTE(review): newer pandas releases renamed `line_terminator` to
#`lineterminator`; switch the keyword when upgrading from pandas 0.22.
autos_df.to_csv(filename, sep=',', encoding='utf-8', line_terminator='\n')

#display scraped data info. info() prints the summary itself and returns
#None, so it must not be wrapped in print().
autos_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867 entries, 0 to 2866
Data columns (total 4 columns):
name         2867 non-null object
price        2867 non-null object
url          2867 non-null object
make_year    2479 non-null object
dtypes: object(4)
memory usage: 89.7+ KB
None


Unnamed: 0,name,price,url,make_year
0,1992 Ford F 250 XLT,$1000,https://washingtondc.craigslist.org/doc/cto/d/...,1992.0
1,1991 Chrysler lebaron convertible,$8500,https://washingtondc.craigslist.org/nva/cto/d/...,1991.0
2,MERCEDES BENZ C220,$1250,https://washingtondc.craigslist.org/nva/cto/d/...,
3,2014 Nissan Versa sv 68k miles,$6500,https://washingtondc.craigslist.org/doc/cto/d/...,2014.0
4,1995 K3500 Chevy Dually 4X4 Extended Cab,$4900,https://washingtondc.craigslist.org/nva/cto/d/...,1995.0
5,2011 Land Rover LR2 Low 67500 miles,$9500,https://washingtondc.craigslist.org/nva/cto/d/...,2011.0
6,2001 TOYOTA TACOMA NEW WHEELS CLEAN,$1700,https://washingtondc.craigslist.org/mld/cto/d/...,2001.0
7,2011 Subaru Legacy 2 5i limited 81000 miles,$9200,https://washingtondc.craigslist.org/doc/cto/d/...,2011.0
8,2001 Nissan Xterra mechanics special,$1000,https://washingtondc.craigslist.org/nva/cto/d/...,2001.0
9,2013 Toyota Camry,$9200,https://washingtondc.craigslist.org/nva/cto/d/...,2013.0
