## Introduction
`This notebook provides a comprehensive overview of a web scraping project conducted on Autotrader South Africa's website between 23/04/23 and 24/04/23 using Python and Selenium framework. The goal of the project was to extract data on various used car models, the data  extracted includes car name, price, mileage, number of previous owners and car specifications.The scraped data was stored in a csv file for further analysis.`

### Necessary imports

In [1]:
#necessary imports
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

driver_path = '/usr/local/python/3.10.4/lib/python3.10/site-packages/chromedriver_autoinstaller/112/chromedriver'
service = Service(driver_path)

prefs = {"profile.managed_default_content_settings.images": 2,
         "profile.managed_default_content_settings.javascript": 2,
         #"profile.managed_default_content_settings.cookies": 2,
         "profile.managed_default_content_settings.plugins": 1,
         "profile.managed_default_content_settings.popups": 2,
         "profile.managed_default_content_settings.geolocation": 2,
         "profile.managed_default_content_settings.media_stream": 2,
         "profile.managed_default_content_settings.notifications": 2}

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_experimental_option('prefs', prefs)

driver = webdriver.Chrome(service=service, options=options)


### Autotrader web scraping

In [2]:
#helper function
def convert_to_dict(x):
    #converts a list to dictionary in the form index0:index1...
    item = iter(x)
    converted_dict = dict(zip(item, item))
    return converted_dict

In [3]:
%%time
#list to store each car's scraped data
car_data=[]

# go to link
try:
    url = 'https://www.autotrader.co.za/cars-for-sale?pagenumber=650&sortorder=Newest&year=2018-to-2023&isused=True&priceoption=RetailPrice'
    driver.get(url)
    #click cookie button 
    driver.find_element(By.XPATH, '//*[@id="cookieBanner"]/div/div/div/button[2]').click()

except WebDriverException as e:
     print('Incorrect url: The website could not be loaded or some other WebDriver related error occurred')

else:
    
    for i in range(1,100): #to loop over 100 pages
        #get all car links on web page
        car_links=[]
        wait = WebDriverWait(driver, 2)
        car_tile=wait.until(EC.visibility_of_element_located((By.CLASS_NAME,'b-result-tiles')))
        search='po=1' #to isolate car links from other links
        for i in car_tile.find_elements(By.TAG_NAME,'a'):
            try:
                href = i.get_attribute('href')
                if href and search in href:
                    car_links.append(href)
            except StaleElementReferenceException:
                time.sleep(1)  # wait for the element to become available again before retrying

        #navigate to each car's individual page
        for link in car_links: 
            try:
                #open new tab
                driver.execute_script("window.open('');")
                # Switch to the new window and open new URL
                driver.switch_to.window(driver.window_handles[1])
                driver.get(link)
            except:
                #if error is encountered on the tab,close and return to first tab
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                continue
            else:
                try:
                    #wait 2 secs for page to load
                    wait = WebDriverWait(driver, 2)
                    #get car name 
                    car_name=wait.until(EC.visibility_of_element_located((By.CLASS_NAME,'e-listing-title')))
                    car_name=car_name.text
                    #get car price
                    car_price=driver.find_element(By.CLASS_NAME,'e-price').text

                    #get car details in top icon section
                    car_details_1={}
                    icon_elements=driver.find_elements(By.CLASS_NAME,'e-summary-icon')[1:]
                    for i in icon_elements:
                        car_details_1[(i.get_attribute('title'))]=(i.text)

                    #get car details in Vehicle Details section
                    car_details_2={}
                    vehicle_details=driver.find_element(By.CLASS_NAME,'b-striped-specs').text.split('\n')
                    #make dictionary of extracted details
                    for i in range(0, len(vehicle_details), 2):
                        key = vehicle_details[i]
                        value = vehicle_details[i+1]
                        if 'owner' in key.lower():
                            car_details_2['Previous Owners'] = value
                        elif 'colour' in key.lower() or 'color' in key.lower():
                            car_details_2['Color'] = value
                        elif 'body' in key.lower():
                            car_details_2[key] = value

                    #scroll to specifications section to click all specification tab
                    specification_section= driver.find_element(By.CLASS_NAME,'b-listing-specification')
                    driver.execute_script("arguments[0].scrollIntoView();", specification_section)

                    #click all specification tab to display its info
                    specs_tabs=driver.find_elements(By.CLASS_NAME,'e-accordion-header')
                    for i in specs_tabs:
                        i.click()
                        driver.execute_script("window.scrollBy(0, 150)")

                    specs=[]
                    car_details_3={}
                    #extract car details under each specification tab
                    specs_elements=driver.find_elements(By.CLASS_NAME,'e-accordion-body')
                    for element in specs_elements:
                        element=element.text.split('\n')
                        specs.append(element)
                    #make dictionary of car details
                    for i in specs:
                        car_details_3.update(convert_to_dict(i))
                except:
                    #if error is encountered on a tab,close and return to first tab
                    driver.close()
                    driver.switch_to.window(driver.window_handles[0])
                    continue

                else:
                    #join all extracted car details together in single dictionary 
                    car_details_combined={}
                    car_details_combined['Name']=car_name
                    car_details_combined['Price']=car_price
                    car_details_combined.update(car_details_1)
                    car_details_combined.update(car_details_2)
                    car_details_combined.update(car_details_3)

                    #append car_details_combined of car to car_data list
                    car_data.append(car_details_combined)

                    # Closing current tab
                    driver.close()

                    # Switching to old/main tab
                    driver.switch_to.window(driver.window_handles[0])

        #click next page link
        driver.find_element(By.CLASS_NAME,'gm-float-right.e-pagination-link').click()
        time.sleep(2)

print('Scraping done')
print('Number of scraped cars :',len(car_data))

Scraping done
Number of scraped cars : 2574
CPU times: user 2min, sys: 6.03 s, total: 2min 6s
Wall time: 1h 18min 37s


### Save data

In [4]:
#export to dataframe
import pandas as pd
df=pd.DataFrame(car_data)

In [5]:
#save to disk
df.to_csv('./Downloads/autotrader-south-africa-cars-20230423.csv')