# Scraping Car Ads from Piston Window
### Import necessary libraries

In [4]:
import re # patterns in HTML source code
import sys # exit execution
import time # speed of executions

import numpy as np

from constants import *
from selenium import webdriver # browser tasks automation
from selenium.webdriver.common.by import By # find element by property
from selenium.webdriver.support.ui import WebDriverWait # selenium wait
from selenium.webdriver.support import expected_conditions as EC # action expected on wait
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException # exceptions
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, NoSuchWindowException, WebDriverException
from bs4 import BeautifulSoup # web scraping
import pandas as pd # dataframe

### Scraping

In [34]:
# Wall-clock reference used by the progress messages printed while scraping.
start_time = time.time()

# 1) Variables:

# One list per scraped feature. They are later assigned to dataframe columns,
# so entry i of every list must describe the same ad.
make = []
model = []
year = []
body = []
mileage = []
fuel = []
transmission = []
engine_size = []
engine_power = []
price = []

# Paired by position: URLS[i] is a template filled with a page number via
# str.format, and NUM_OF_PAGES[i] is the (exclusive) upper bound of the page
# range scraped for it.
URLS = [URL1, URL2, URL3, URL4, URL5, URL6, URL7, URL8, URL9, URL10]
NUM_OF_PAGES = [PAGES1, PAGES2, PAGES3, PAGES4, PAGES5, PAGES6, PAGES7, PAGES8, PAGES9, PAGES10]

# Initiating the listings dataframe
df = pd.DataFrame(columns=COLUMNS)

# 2) Chromedriver settings
# Passing the driver path positionally is deprecated in Selenium 4 (it emits
# "executable_path has been deprecated, please pass in a Service object") and
# was removed in later 4.x releases, so the path is wrapped in a Service.
from selenium.webdriver.chrome.service import Service

browser = webdriver.Chrome(service=Service(CHROMEDRIVER))
browser.delete_all_cookies()

# 3) Scraping cars attributes from listings page to lists

# Compiled once: these patterns run against every ad on every page.
# Raw strings avoid the invalid "\d" escape sequence of a plain literal.
_NON_NUMERIC_RE = re.compile(r"[^\d.]")    # everything except digits and dots
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")  # an integer or decimal number
_DIGIT_RE = re.compile(r"\d")

# Spec fields extracted from every ad card, in the order they are appended.
_SPEC_FIELDS = ("year", "mileage", "body", "transmission", "fuel",
                "engine_size", "engine_power")

# Errors that mean "the page is in a bad state": reload it and read again.
ignored_exceptions = (NoSuchElementException, StaleElementReferenceException,
                      NoSuchWindowException, WebDriverException)


def _parse_ad_specs(ad_specs):
    """Extract the spec fields of a single ad card.

    Args:
        ad_specs: the parsed element whose child nodes are the individual
            spec entries of one ad.

    Returns:
        A dict with one key per name in ``_SPEC_FIELDS``. Each value is the
        first matching text found among the children, or ``None`` when no
        child matched. Returning exactly one value per field is what keeps
        the feature lists the same length, whatever the card contains.
    """
    record = dict.fromkeys(_SPEC_FIELDS)

    def keep_first(field, value):
        # First match wins; a later duplicate must not add a second entry.
        if record[field] is None:
            record[field] = value

    for spec in ad_specs:
        try:
            text = spec.text
        except AttributeError:
            # Child node that exposes no text: report it and carry on.
            print(spec)
            continue

        stripped = text.strip()
        lowered = stripped.lower()
        titled = stripped.title()

        # The checks are independent on purpose: one entry may feed several
        # fields.
        if "/" in text:
            after_slash = stripped.split("/")[1]
            if _DIGIT_RE.search(after_slash):
                keep_first("year", after_slash)
        if "miles" in lowered:
            keep_first("mileage", _NON_NUMERIC_RE.sub("", text))
        if lowered in BODY_TYPES:
            keep_first("body", lowered)
        if titled in TRANSMISSION_TYPES:
            keep_first("transmission", titled)
        if "hybrid" in lowered:
            keep_first("fuel", "Hybrid")
        elif titled in FUEL_TYPES:
            keep_first("fuel", titled)
        if ("L" in text and text.lower() not in BODY_TYPES
                and _DIGIT_RE.search(stripped.split("L")[0])):
            size = _NUMBER_RE.search(stripped)
            if size:
                keep_first("engine_size", size.group())
        if "PS" in text and "dropside" not in lowered:
            # "PS" can appear without any number; only record real matches.
            power = _NUMBER_RE.search(stripped)
            if power:
                keep_first("engine_power", power.group())
    return record


try:
    # zip pairs each URL with its page bound directly; URLS.index(URL) would
    # return the wrong position if the same URL were listed twice.
    for url_number, (URL, PAGES) in enumerate(zip(URLS, NUM_OF_PAGES), start=1):
        # NOTE(review): range(1, PAGES) stops at PAGES - 1. If the PAGESn
        # constants hold the real page count, the last page is never visited
        # -- confirm against constants.py.
        for page in range(1, PAGES):
            # Keep retrying the navigation until the driver accepts it.
            # NOTE(review): the retry is unbounded, as before; a permanently
            # failing URL or a closed browser window will loop forever.
            while True:
                try:
                    browser.get(URL.format(page))
                    break
                except WebDriverException:
                    # implicitly_wait() only configures the element-lookup
                    # timeout and does not pause; sleep to actually back off.
                    time.sleep(2)

            # Prices are read through Selenium; reload and retry if the
            # elements go stale while being read.
            while True:
                try:
                    price_elements = WebDriverWait(
                        browser, 10, ignored_exceptions=ignored_exceptions
                    ).until(EC.presence_of_all_elements_located(
                        (By.CSS_SELECTOR, PRICE_CSS)))
                    # fetch price
                    page_prices = [_NON_NUMERIC_RE.sub("", element.text.strip())
                                   for element in price_elements]
                except TimeoutException:
                    # Must precede the generic handler: TimeoutException is a
                    # WebDriverException subclass.
                    sys.exit("cars attributes (in cards) css selector not found")
                except ignored_exceptions:
                    browser.refresh()
                else:
                    # Commit only after the whole page was read successfully.
                    price.extend(page_prices)
                    break

            html = browser.page_source
            soup = BeautifulSoup(html, "lxml")

            # fetch make and model
            # NOTE(review): the first word is taken as the make, so
            # multi-word makes are split ("Land" / "Rover Range Rover Sport").
            for ad_name in soup.find_all("h3", AD_NAME_CLASS):
                words = ad_name.text.strip().split()
                make.append(words[0] if words else None)
                model.append(" ".join(words[1:]))

            # fetch year, mileage, body, transmission, fuel, engine size, engine power
            for m, ad_specs in enumerate(soup.find_all("div", ATTRIBUTES_CLASS)):
                # Only odd-indexed containers are parsed (kept from the
                # original; presumably each ad yields two matches -- confirm).
                if m % 2 == 0:
                    continue
                specs = _parse_ad_specs(ad_specs)
                year.append(specs["year"])
                mileage.append(specs["mileage"])
                body.append(specs["body"])
                transmission.append(specs["transmission"])
                fuel.append(specs["fuel"])
                engine_size.append(specs["engine_size"])
                engine_power.append(specs["engine_power"])
            print(f"time: {time.time() - start_time} seconds - [page: {page}]")
        print(f"time: {time.time() - start_time} seconds - [NEW URL:{url_number}]")
finally:
    # close chrome browser, even when the run aborts (sys.exit, Ctrl-C, errors)
    browser.quit()

  browser = webdriver.Chrome(CHROMEDRIVER)


time: 18.446707248687744 seconds - [page: 1]
time: 27.048123598098755 seconds - [page: 2]
time: 34.9071261882782 seconds - [page: 3]
time: 43.1124210357666 seconds - [page: 4]
time: 51.45791149139404 seconds - [page: 5]
time: 59.644615173339844 seconds - [page: 6]
time: 67.65413045883179 seconds - [page: 7]
time: 76.37939667701721 seconds - [page: 8]
time: 84.45411920547485 seconds - [page: 9]
time: 92.77337765693665 seconds - [page: 10]
time: 101.87900161743164 seconds - [page: 11]
time: 110.05058598518372 seconds - [page: 12]
time: 118.77985715866089 seconds - [page: 13]
time: 126.55651307106018 seconds - [page: 14]
time: 134.41640448570251 seconds - [page: 15]
time: 142.56701970100403 seconds - [page: 16]
time: 150.56606483459473 seconds - [page: 17]
time: 158.1476695537567 seconds - [page: 18]
time: 166.3015263080597 seconds - [page: 19]
time: 174.45396876335144 seconds - [page: 20]
time: 182.4153664112091 seconds - [page: 21]
time: 190.3929479122162 seconds - [page: 22]
time: 197.

['Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel', 'Diesel']
48
48


In [36]:
# 4) Filling dataframe column by feature list
# Column name -> scraped list, in the order the columns are assigned.
feature_columns = {
    "make": make,
    "model": model,
    "mileage": mileage,
    "year": year,
    "body": body,
    "transmission": transmission,
    "engineSize": engine_size,
    "enginePower": engine_power,
    "fuel": fuel,
    "price": price,
}
for column_name, column_values in feature_columns.items():
    df[column_name] = column_values
print(f"shape so far of df is {df.shape}")

shape so far of df is (82137, 10)


In [37]:
# Distinct fuel labels collected by the scrape (None marks ads without one).
print(df["fuel"].unique())

['Diesel' 'Hybrid' None 'Electric' 'Petrol']


### Small Snapshot of Dataframe

In [38]:
# Bare expression: as the last statement of the cell it renders the dataframe
# as the cell output.
df

Unnamed: 0,make,model,year,body,mileage,fuel,transmission,engineSize,enginePower,price
0,Jaguar,S-Type,2007,saloon,52112,Diesel,Automatic,2.7,280,5799
1,BMW,X1,2014,suv,45580,Diesel,Automatic,2,250,11500
2,Land,Rover Range Rover Sport,2011,estate,100620,Diesel,Automatic,3,333,14809
3,Land,Rover Range Rover Sport,2012,estate,67640,Diesel,Automatic,3,347,17402
4,BMW,6 Series,2010,convertible,67112,Diesel,Automatic,3,389,12499
...,...,...,...,...,...,...,...,...,...,...
82132,Hyundai,Ioniq,,hatchback,,Hybrid,Automatic,,,29043
82133,Fiat,500,2021,hatchback,4414,Electric,Automatic,,160,26701
82134,Toyota,Yaris,2016,hatchback,35820,Hybrid,Automatic,1.5,136,12495
82135,MG,Zs,2021,hatchback,3236,Electric,Automatic,,,25495


### Writing Dataframe into CSV File

In [41]:
# Persist the scraped listings; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="car-dataset.csv", index=False)

In [44]:
# Distinct transmission labels collected by the scrape (None marks ads without one).
print(df["transmission"].unique())

['Automatic' None 'Manual' 'Semi Automatic']
