# Section 1: Acquiring each listing

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import time

## Section 1.1: Parsing the first search page (per 100 listing)

In [2]:
url = 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&RPG=100&AVL=2&VEH=2'
# To page through each main page, ONLY considering passenger cars.
#'https://www.sgcarmart.com/used_cars/listing.php?BRSR=' + str(idx * 100) '&RPG=20&AVL=2&VEH=0'

response = requests.get(url)

In [3]:
page = response.text
main_page = BeautifulSoup(page, 'lxml')

In [560]:
main_page.find_all('a')

[<a href="listing.php?MOD=&amp;RPG=100&amp;VEH=0&amp;AVL=2" style="background-color:#0066AA; color:#FFFFFF;">All Makes</a>,
 <a href="listing.php?MOD=Alfa+Romeo&amp;RPG=100&amp;VEH=0&amp;AVL=2">Alfa Romeo</a>,
 <a href="listing.php?MOD=Aston+Martin&amp;RPG=100&amp;VEH=0&amp;AVL=2">Aston Martin</a>,
 <a href="listing.php?MOD=Audi&amp;RPG=100&amp;VEH=0&amp;AVL=2">Audi</a>,
 <a href="listing.php?MOD=Austin&amp;RPG=100&amp;VEH=0&amp;AVL=2">Austin</a>,
 <a href="listing.php?MOD=Bentley&amp;RPG=100&amp;VEH=0&amp;AVL=2">Bentley</a>,
 <a href="listing.php?MOD=BMW&amp;RPG=100&amp;VEH=0&amp;AVL=2">BMW</a>,
 <a href="listing.php?MOD=Chery&amp;RPG=100&amp;VEH=0&amp;AVL=2">Chery</a>,
 <a href="listing.php?MOD=Chevrolet&amp;RPG=100&amp;VEH=0&amp;AVL=2">Chevrolet</a>,
 <a href="listing.php?MOD=Chrysler&amp;RPG=100&amp;VEH=0&amp;AVL=2">Chrysler</a>,
 <a href="listing.php?MOD=Citroen&amp;RPG=100&amp;VEH=0&amp;AVL=2">Citroen</a>,
 <a href="listing.php?MOD=Daihatsu&amp;RPG=100&amp;VEH=0&amp;AVL=2">Daihat

In [561]:
main_page.find_all('a')[0]  # Example of 1 item in the list

<a href="listing.php?MOD=&amp;RPG=100&amp;VEH=0&amp;AVL=2" style="background-color:#0066AA; color:#FFFFFF;">All Makes</a>

In [808]:
base_url = "https://www.sgcarmart.com/used_cars/"

for link in main_page.find_all('a'):  # Just taking the first 5 listings
    suffix = link['href']
    if ('ID=' in suffix) and ("DL=" in suffix):
        each_listing_link = base_url + suffix
        print(each_listing_link)

https://www.sgcarmart.com/used_cars/info.php?ID=858208&DL=1000
https://www.sgcarmart.com/used_cars/info.php?ID=858208&DL=1000
https://www.sgcarmart.com/used_cars/info.php?ID=862846&DL=1000
https://www.sgcarmart.com/used_cars/info.php?ID=862846&DL=1000
https://www.sgcarmart.com/used_cars/info.php?ID=862843&DL=1103
https://www.sgcarmart.com/used_cars/info.php?ID=862843&DL=1103
https://www.sgcarmart.com/used_cars/info.php?ID=862201&DL=3345
https://www.sgcarmart.com/used_cars/info.php?ID=862201&DL=3345
https://www.sgcarmart.com/used_cars/info.php?ID=861970&DL=3345
https://www.sgcarmart.com/used_cars/info.php?ID=861970&DL=3345
https://www.sgcarmart.com/used_cars/info.php?ID=860493&DL=2976
https://www.sgcarmart.com/used_cars/info.php?ID=860493&DL=2976
https://www.sgcarmart.com/used_cars/info.php?ID=781368&DL=1034
https://www.sgcarmart.com/used_cars/info.php?ID=781368&DL=1034
https://www.sgcarmart.com/used_cars/info.php?ID=860253&DL=3345
https://www.sgcarmart.com/used_cars/info.php?ID=860253&

# Section 2: Pulling data from each individual listing

### Section 2.1: Parsing each listing's page

In [709]:
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

### Section 2.1: Title/Brand of car

In [18]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862832&DL=2934"
response2 = requests.get(listing_url)
parsed_listing_url = BeautifulSoup(response2.text, 'lxml')

In [19]:
parsed_listing_url.find(class_='link_redbanner')

<a class="link_redbanner" href="listing.php?MOD=Mercedes-Benz+E-Class+E200+CGI+Avantgarde&amp;AVL=2">Mercedes-Benz E-Class E200 CGI Avantgarde (COE till 04/2029)</a>

In [20]:
parsed_listing_url.find(class_='link_redbanner').text

'Mercedes-Benz E-Class E200 CGI Avantgarde (COE till 04/2029)'

In [21]:
parsed_listing_url.find(class_='link_redbanner').text.split()

['Mercedes-Benz',
 'E-Class',
 'E200',
 'CGI',
 'Avantgarde',
 '(COE',
 'till',
 '04/2029)']

In [22]:
# Assume every car name starts with brand
brand = parsed_listing_url.find(class_='link_redbanner').text.split()[0]
brand


'Mercedes-Benz'

In [23]:
# Define a function to retrieve car brand name given a listing url

def brand_retrieval(listing_url):
    response = requests.get(listing_url)
    parsed_listing_url = BeautifulSoup(response.text, 'lxml')
    
    brand_name = parsed_listing_url.find(class_='link_redbanner').text.split()[0]
    
    time.sleep(1)
    return brand_name


brand_retrieval('https://www.sgcarmart.com/used_cars/info.php?ID=862832&DL=2934')

'Mercedes-Benz'

### Section 2.3: Depreciation Value Per Year

In [25]:
listing_url = 'https://www.sgcarmart.com/used_cars/info.php?ID=862832&DL=2934'
response = requests.get(listing_url)
parsed_listing_url = BeautifulSoup(response.text, 'lxml')

In [30]:
parsed_listing_url.find_all(class_="label")[1].findNextSibling()

<td valign="top">
                                $7,340 /yr                            </td>

In [31]:
parsed_listing_url.find_all(class_="label")[1].findNextSibling().text

'\r\n                                $7,340 /yr                            '

In [42]:
parsed_listing_url.find_all(class_="label")[1].findNextSibling().text.strip().split('$')

['', '7,340 /yr']

In [43]:
parsed_listing_url.find_all(class_="label")[1].findNextSibling().text.strip().split('$')[1]

'7,340 /yr'

In [44]:
parsed_listing_url.find_all(class_="label")[1].findNextSibling().text.strip().split('$')[1].split('/yr')

['7,340 ', '']

In [45]:
data_value = parsed_listing_url.find_all(class_="label")[1].findNextSibling().text.strip().split('$')[1].split('/yr')
data_value

['7,340 ', '']

In [46]:
data_value[0]

'7,340 '

In [47]:
data_value[0].split(',')

['7', '340 ']

In [49]:
desired_value = int(data_value[0].split(',')[0] + data_value[0].split(',')[1])

desired_value

7340

In [53]:
# Define a function that retrieves depreciation value per year given a listing url
def depreciation_value_per_year_error_handler(data_value):
    if len(data_value) < 2:
        data_value = np.nan

    else: 
        try:                 
            desired_value = int(data_value[0].split(',')[0] +\
                                data_value[0].split(',')[1]) # Will fail on IndexError if tries to split '900' with a ',' in ['900','']
        except IndexError: 
            desired_value = int(data_value[0])
        
        return desired_value
    
def depreciation_value_per_year_retrieval(listing_url):
    response = requests.get(listing_url)
    parsed_listing_url = BeautifulSoup(response.text, 'lxml')
    
    data_value = parsed_listing_url.find_all(class_="label")[1].findNextSibling().text.strip().split('$')[1].split('/yr')
    
    depreciation_value_per_year = depreciation_value_per_year_error_handler(data_value)
    
    return depreciation_value_per_year

print(depreciation_value_per_year_retrieval('https://www.sgcarmart.com/used_cars/info.php?ID=862831&DL=3417'))
print(depreciation_value_per_year_retrieval('https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854'))
    

4440
6500


### Section 2.2: Price

In [680]:
main_page2.find_all(class_='font_red')[0]

<td class="font_red" width="175">
<a href="info_financial.php?ID=862833" style="color:#DE0807;"><strong>$31,800</strong></a> </td>

In [681]:
# String ready for operations to be done on
main_page2.find_all(class_='font_red')[0].text.strip()

'$31,800'

In [682]:
main_page2.find_all(class_='font_red')[0].text.strip().split('$')

['', '31,800']

In [792]:
# Define a function to retrieve price for a given parsed url

# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

def price_retrieval(parsed_url):
    data_value = parsed_url.find_all(class_='font_red')[0].text.strip()
    data_value = data_value.split('$')

    price = price_error_handling(data_value)
    return price


def price_error_handling(data_value):
    # Try-Exception error handling
    
    try:   # First try to deal with values higher than 1000
        price = data_value[1]  # will fail on IndexError if retrieves ['na'] scenario
        price = int(price.split(',')[0] + price.split(',')[1]) # Will fail on IndexError if tries to split '900' with a ',' in ['',900]
        
    except IndexError:  # Dealing with ['na'] and ['', 900'] scenarios
        try: 
            price = int(data_value[1]) # Will fail on IndexError if ['na'] scenario
        except IndexError:  # Deals with ['na'] scenarios
            price = np.nan  # Stores NA values as nan
    
    return price


price_retrieval(main_page2)

31800

In [684]:
# Example Scenarios:
# data_value = '$900'
# data_value = '$1,000'
# data_value = 'na'


# Converting input into a split list that will allow try-except below to deal with errors
# will either be in ['',1,000], ['na'] or ['',900]
data_value = main_page2.find_all(class_='font_red')[0].text.strip()
data_value = data_value.split('$')


# Try-Exception error handling
try:   # First try to deal with values higher than 1000
    price = data_value[1]  # will fail on IndexError if retrieves ['na'] scenario
    price = int(price.split(',')[0] + price.split(',')[1]) # Will fail on IndexError if tries to split '900' with a ',' in ['',900]
    
except IndexError:  # Dealing with ['na'] and ['', 900'] scenarios
    try: 
        price = int(data_value[1]) # Will fail on IndexError if ['na'] scenario
    except IndexError:  # Deals with ['na'] scenarios
        price = np.nan  # Stores NA values as nan
    
price   

31800

### Section 2.3: Getting Road Tax/year

In [813]:
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [814]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         73,000 km (7.3k /yr)                                     </div>,
 <div class="row_info">
                                     	$684 /yr									</div>,
 <div class="row_info">
                                     	N.A.                                    </div>,
 <div class="row_info">N.A.</div>,
 <div class="row_info">
 									1,499 cc                                    </div>,
 <div class="row_info">
                                     1,313 kg                                    </div>,
 <div class="row_info">2009</div>,
 <div class="row_info">Auto</div>,
 <div class="row_info">$18,978</div>,
 <div class="row_info">$18,978</div>,
 <div class="row_info">80.0 kW (107 bhp)</div>,
 <div class="row_info">2</div>]

In [815]:
main_page2.find_all(class_='row_info')[1]

<div class="row_info">
                                    	$684 /yr									</div>

In [816]:
main_page2.find_all(class_='row_info')[1].text

'\r\n                                    \t$684 /yr\t\t\t\t\t\t\t\t\t'

In [817]:
main_page2.find_all(class_='row_info')[1].text.strip()

'$684 /yr'

In [818]:
main_page2.find_all(class_='row_info')[1].text.strip().replace('/yr','').strip()

'$684'

In [819]:
main_page2.find_all(class_='row_info')[1].text.strip().replace('/yr','').strip().split('$')

['', '684']

In [820]:
# Define a function to retrieve road tax /yr given a parsed listing url
def road_tax_retriever(parsed_url):
    string_data = parsed_url.find_all(class_='row_info')[1].text.strip()
    road_tax_yearly = road_tax_error_handler(string_data)
    return road_tax_yearly
    

def road_tax_error_handler(string_data):
    if '/yr' in string_data: # Only takes in scenarios that are not NA
        try:
            # Removes '$" character and splits string_data into a list of ['', 1,000] or ['', 900]
            road_tax_per_year = \
            string_data.replace('/yr','').strip().split('$') 

            # Accesses the second item in the list
            road_tax_per_year = road_tax_per_year[1] 


            road_tax_per_year = int(road_tax_per_year.split(',')[0] +\
                                    road_tax_per_year.split(',')[1])  # Will fail on IndexError if value is above 1000

        except IndexError: # Handles values that are below 1000. (i.e. ['',900])
            road_tax_pear_year = int(road_tax_per_year[1])

    else: # Deals with 'NA' scenario
        road_tax_per_year = np.nan
    
    return road_tax_per_year
        

        
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

print(road_tax_retriever(main_page2))

684


## Derivation for above function

In [699]:
# Input must be in "$ 684 /yr" format
# If input in NA format, it will just immediately be treated a NaN value


string_data = main_page2.find_all(class_='row_info')[1].text.strip()
if '/yr' in string_data: # Only takes in scenarios that are not NA
    try:
        # Removes '$" character and splits string_data into a list of ['', 1,000] or ['', 900]
        road_tax_per_year = \
        string_data.replace('/yr','').strip().split('$') 
        
        # Accesses the second item in the list
        road_tax_per_year = road_tax_per_year[1] 
        
        
        road_tax_per_year = int(road_tax_per_year.split(',')[0] +\
                                road_tax_per_year.split(',')[1])  # Will fail on IndexError if value is above 1000
        
    except IndexError: # Handles values that are below 1000. (i.e. ['',900])
        road_tax_pear_year = int(road_tax_per_year[1])
        
else: # Deals with 'NA' scenario
    road_tax_per_year = np.nan
print(road_tax_per_year)

684


### Section 2.4: Registered date

In [823]:
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [824]:
main_page2.find_all(class_='row_bg')

[<tr class="row_bg">
 <td class="label" style="display:flex;align-items:center;" width="115">
 <strong>Price</strong>
 </td>
 <td class="font_red" width="175">
 <a href="info_financial.php?ID=862833" style="color:#DE0807;"><strong>$31,800</strong></a> </td>
 <td width="115"><img alt="spacer" height="1" src="https://i.i-sgcm.com/images/spacer.gif" width="1"/></td><td><img alt="spacer" height="1" src="https://i.i-sgcm.com/images/spacer.gif" width="1"/></td> </tr>,
 <tr class="row_bg">
 <td class="label" style="display:flex;" width="115">
 <strong>Depreciation</strong>
 <div class="qmark" style="padding-left:3px;"><a class="abc_dep" href="popups/whatsDepreciation.php"><img height="16" src="https://i.i-sgcm.com/used_cars/qmark_grey_16x16.png" width="16"/></a></div>
 </td>
 <td valign="top">
                                 $6,350 /yr                            </td>
 <td valign="top"><strong>Reg Date</strong></td>
 <td valign="top">
 								28-Oct-2009<br/>(5yrs  COE left)                

In [825]:
main_page2.find_all(class_='row_bg')[1]

<tr class="row_bg">
<td class="label" style="display:flex;" width="115">
<strong>Depreciation</strong>
<div class="qmark" style="padding-left:3px;"><a class="abc_dep" href="popups/whatsDepreciation.php"><img height="16" src="https://i.i-sgcm.com/used_cars/qmark_grey_16x16.png" width="16"/></a></div>
</td>
<td valign="top">
                                $6,350 /yr                            </td>
<td valign="top"><strong>Reg Date</strong></td>
<td valign="top">
								28-Oct-2009<br/>(5yrs  COE left)                            </td>
</tr>

In [826]:
main_page2.find_all(class_='row_bg')[1].find_all('td')

[<td class="label" style="display:flex;" width="115">
 <strong>Depreciation</strong>
 <div class="qmark" style="padding-left:3px;"><a class="abc_dep" href="popups/whatsDepreciation.php"><img height="16" src="https://i.i-sgcm.com/used_cars/qmark_grey_16x16.png" width="16"/></a></div>
 </td>, <td valign="top">
                                 $6,350 /yr                            </td>, <td valign="top"><strong>Reg Date</strong></td>, <td valign="top">
 								28-Oct-2009<br/>(5yrs  COE left)                            </td>]

In [827]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text

'\r\n\t\t\t\t\t\t\t\t28-Oct-2009(5yrs  COE left)                            '

In [828]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split()

['28-Oct-2009(5yrs', 'COE', 'left)']

In [829]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split()[0]

'28-Oct-2009(5yrs'

In [830]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split()[0].split('(')

['28-Oct-2009', '5yrs']

In [831]:
reg_date = main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split()[0].split('(')[0]
reg_date

'28-Oct-2009'

In [1132]:
# Define function to retrieve Registered date given a parsed listing url

# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862831&DL=3417"
response2 = requests.get(listing_url)
page2 = response2.text
parsed_listing_url = BeautifulSoup(page2, 'lxml')

def registered_date_retrieval(parsed_url):
    reg_date = parsed_url.find_all(class_='row_bg')[1].find_all('td')[3].text.split()[0].split('(')[0]
    return reg_date

print(registered_date_retrieval(parsed_listing_url))

14-Jan-2009


### Section 2.5: Days of COE left

In [833]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [834]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split()

['28-Oct-2009(5yrs', 'COE', 'left)']

In [835]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split('(')

['\r\n\t\t\t\t\t\t\t\t28-Oct-2009',
 '5yrs  COE left)                            ']

In [836]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split('(')[1]

'5yrs  COE left)                            '

In [837]:
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split('(')[1].split('COE')

['5yrs  ', ' left)                            ']

In [843]:
days_of_coe_left_yy_mm_dd_format_for_cleaner_function = \
main_page2.find_all(class_='row_bg')[1].find_all('td')[3].text.split('(')[1].split('COE')[0].strip()

days_of_coe_left_yy_mm_dd_format_for_cleaner_function

'5yrs'

In [882]:
# Define a function to retrieve days of COE left in terms of days:

# Retrieve into usable format for link
def days_of_coe_retriever(parsed_listing_page):
    """Takes in a parsed listing page (individual cars) and retrieve the days of COE left
    ---
    Input: str
    Output: Days of COE left in integer form
    """
    days_of_coe_left_yy_mm_dd_format_for_cleaner_function=\
    parsed_listing_page.find_all(class_='row_bg')[1].find_all('td')[3].text.split('(')[1].split('COE')[0].strip()
    
    return yr_mm_dd_cleaner(days_of_coe_left_yy_mm_dd_format_for_cleaner_function)


# Define a function to calculate days of COE left
def yr_mm_dd_cleaner(str1):
    """Accepts a string that may or may include the elements yr mths days and 
    converts the whole string into number of days.
    ----
    Input: single string
    output: number of days in integer form
    ----
    Example string inputs:
    - 4yrs 2mths 23days
    - 5yrs
    - 2 mths 23 days
    - 50 days
    """
    
    # Convert days_of_coe_left_yy_mm_dd to days    
    year_index = str1.find('yr')
    if year_index == -1:
        year = 0
    else:
        year = int(str1[year_index-1])

        
    mth_index = str1.find('mth')
    if mth_index == -1:
        mth = 0
    else:
        mth = int(str1[mth_index-1])

        
    day_index = str1.find('day')
    if day_index == -1:
        day = 0
    else:
        day = int(str1[day_index-1])
       
    days_of_coe_left = (year * 365) + (mth * 30) + day  # 30.4167 is an average value of days in a month
    return days_of_coe_left


# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
parsed_listing_page = BeautifulSoup(page2, 'lxml')

days_of_coe_retriever(parsed_listing_page)

1825

### Section 2.6: Mileage in km

In [None]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [846]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         73,000 km (7.3k /yr)                                     </div>,
 <div class="row_info">
                                     	$684 /yr									</div>,
 <div class="row_info">
                                     	N.A.                                    </div>,
 <div class="row_info">N.A.</div>,
 <div class="row_info">
 									1,499 cc                                    </div>,
 <div class="row_info">
                                     1,313 kg                                    </div>,
 <div class="row_info">2009</div>,
 <div class="row_info">Auto</div>,
 <div class="row_info">$18,978</div>,
 <div class="row_info">$18,978</div>,
 <div class="row_info">80.0 kW (107 bhp)</div>,
 <div class="row_info">2</div>]

In [847]:
main_page2.find_all(class_='row_info')[0]

<div class="row_info">
                                        73,000 km (7.3k /yr)                                     </div>

In [848]:
main_page2.find_all(class_='row_info')[0].text

'\r\n                                        73,000 km (7.3k /yr)                                     '

In [849]:
main_page2.find_all(class_='row_info')[0].text.strip()

'73,000 km (7.3k /yr)'

In [855]:
na = 'na'
km = '73,000 km (7.3k /yr)'

In [868]:
na.split('km')

['na']

In [867]:
## 3 Scenarios: 
# NA Milage
# 73,000 km (7.3k /yr)
# Less than 1000 km

km.split('km')

['73,000 ', ' (7.3k /yr)']

In [870]:
km.split('km')[0].strip()

'73,000'

In [871]:
km.split('km')[0].strip().split(',')

['73', '000']

In [872]:
mileage_km =\
km.split('km')[0].strip().split(',')[0] + km.split('km')[0].strip().split(',')[1]

mileage_km

'73000'

In [879]:
# Get data ready for try-exception error handling
data_value = main_page2.find_all(class_='row_info')[0].text.strip()
data_value = data_value.split('km')

if len(data_value) < 2:  # Deals with ['na'] scenarios
    mileage_km = np.nan  # Stores NA values as nan

else:  
    try:                 
        mileage_km = int(data_value[0].strip().split(',')[0] + data_value[0].strip().split(',')[1])
    except IndexError: # Will fail on IndexError if tries to split '900' with a ',' in ['',900]
        mileage_km = int(data_value[0].strip())

mileage_km

73000

In [1]:
# Write a function to retrieve the mileage in km from a parsed listing html
def mileage_error_handler(data_value):
    if len(data_value) < 2:  # Deals with ['na'] scenarios
        mileage_km = np.nan  # Stores NA values as nan

    else:  
        try:                 
            mileage_km = int(data_value[0].strip().split(',')[0] + data_value[0].strip().split(',')[1])
        except IndexError: # Will fail on IndexError if tries to split '900' with a ',' in ['',900]
            mileage_km = int(data_value[0].strip())
    
    return mileage_km


def mileage_retriever(parsed_url):
    data_value = parsed_url.find_all(class_='row_info')[0].text.strip()
    data_value = data_value.split('km')
    mileage_km = mileage_error_handler(data_value)
    return mileage_km
     
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

main_page22.find('a')

mileage_retriever(main_page2)

NameError: name 'requests' is not defined

### Section 2.7: Manufactured Date

In [883]:
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [884]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         73,000 km (7.3k /yr)                                     </div>,
 <div class="row_info">
                                     	$684 /yr									</div>,
 <div class="row_info">
                                     	N.A.                                    </div>,
 <div class="row_info">N.A.</div>,
 <div class="row_info">
 									1,499 cc                                    </div>,
 <div class="row_info">
                                     1,313 kg                                    </div>,
 <div class="row_info">2009</div>,
 <div class="row_info">Auto</div>,
 <div class="row_info">$18,978</div>,
 <div class="row_info">$18,978</div>,
 <div class="row_info">80.0 kW (107 bhp)</div>,
 <div class="row_info">2</div>]

In [885]:
main_page2.find_all(class_='row_info')[6]

<div class="row_info">2009</div>

In [887]:
# Define a function that returns the manufactured date using a parsed html
def manufactured_year_retrieval(parsed_url):
    manufactured_year = parsed_url.find_all(class_='row_info')[6].text
    return manufactured_year


# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

# Testing function
manufactured_year_retrieval(main_page2)

'2009'

### Section 2.8: Transmission

In [450]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         N.A.                                     </div>,
 <div class="row_info">
                                     	$742 /yr									</div>,
 <div class="row_info">
                                     	$52,259 as of today (<a class="drg_pop" href="/used_cars/popups/deregValue.php?ID=862816&amp;DL=2397">change</a>)                                    </div>,
 <div class="row_info">$53,000</div>,
 <div class="row_info">
 									1,597 cc                                    </div>,
 <div class="row_info">
                                     1,249 kg                                    </div>,
 <div class="row_info">2016</div>,
 <div class="row_info">Auto</div>,
 <div class="row_info">$19,689</div>,
 <div class="row_info">$19,689</div>,
 <div class="row_info">92.0 kW (123 bhp)</div>,
 <div class="row_info">1</div>]

In [451]:
main_page2.find_all(class_='row_info')[7]

<div class="row_info">Auto</div>

In [890]:
# Define a function that returns the transmission based on a parsed listing url

def transmission_retrieval(parsed_url):
    transmission = parsed_url.find_all(class_='row_info')[7].text
    return transmission
    

    
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862833&DL=3588"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')


transmission_retrieval(main_page2)

'Auto'

### Section 2.9: Dereg Value

In [1029]:
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862831&DL=3417"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1030]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         90,602 km (8.4k /yr)                                     </div>,
 <div class="row_info">
                                     	$442 /yr									</div>,
 <div class="row_info">
                                     	$11,026 as of today (<a class="drg_pop" href="/used_cars/popups/deregValue.php?ID=862831&amp;DL=3417">change</a>)                                    </div>,
 <div class="row_info">$13,024</div>,
 <div class="row_info">
 									1,086 cc                                    </div>,
 <div class="row_info">
                                     872 kg                                    </div>,
 <div class="row_info">2008</div>,
 <div class="row_info">Auto</div>,
 <div class="row_info">$8,432</div>,
 <div class="row_info">$8,432</div>,
 <div class="row_info">47.1 kW (63 bhp)</div>,
 <div class="row_info">2</div>]

In [1031]:
main_page2.find_all(class_='row_info')[2]

<div class="row_info">
                                    	$11,026 as of today (<a class="drg_pop" href="/used_cars/popups/deregValue.php?ID=862831&amp;DL=3417">change</a>)                                    </div>

In [1032]:
main_page2.find_all(class_='row_info')[2].text

'\r\n                                    \t$11,026 as of today (change)                                    '

In [1033]:
main_page2.find_all(class_='row_info')[2].text.strip()

'$11,026 as of today (change)'

In [1050]:
# 3 Scenarios:
# ['NA']
# ['$11,026 as of today']
# ['$900 as of today']

data_value = main_page2.find_all(class_='row_info')[2].text.strip().split()
data_value

['$11,026', 'as', 'of', 'today', '(change)']

In [1037]:
# Write a function to retrieve dereg value from a parsed url

def dereg_value_retrieval(parsed_url):
    # Splits into ['NA'], or ['$11,026', 'as', 'of', 'today', '(change)'] or ['$900', 'as', 'of', 'today', '(change)']
    data_value = parsed_url.find_all(class_='row_info')[2].text.strip().split() 
    
    dereg_value_from_scrape_date = dereg_value_error_handler(data_value)
    return dereg_value_from_scrape_date
    

def dereg_value_error_handler(data_value):
    if len(data_value) < 2:  # Deals with ['NA'] scenario
        dereg_value_from_scrape_date = np.nan

    else: 
        data_value = data_value[0].split('$')[1] # Puts input into '11,026' or '900' format
        try:                 
            dereg_value_from_scrape_date = \
            int(data_value.split(',')[0] +\
                data_value.split(',')[1]) # Will fail on IndexError if tries to split '900' with a ',' in ['',900]
        except IndexError: 
            dereg_value_from_scrape_date = int(data_value.strip())

        return dereg_value_from_scrape_date
    
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862831&DL=3417"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

dereg_value_retrieval(main_page2)

11026

In [1018]:
# 3 Scenarios:
# ['NA']
# ['$11,026 as of today']
# ['$900 as of today']


# Splits into ['NA'], or ['$11,026', 'as', 'of', 'today', '(change)'] or ['$900', 'as', 'of', 'today', '(change)']
data_value = main_page2.find_all(class_='row_info')[2].text.strip().split() 

if len(data_value) < 2:  # Deals with ['NA'] scenario
    dereg_value_from_scrape_date = np.nan

else: 
    data_value = data_value[0].split('$')[1] # Puts input into '11,026' or '900' format
    try:                 
        dereg_value_from_scrape_date = \
        int(data_value.split(',')[0] +\
            data_value.split(',')[1]) # Will fail on IndexError if tries to split '900' with a ',' in ['',900]
    except IndexError: 
        dereg_value_from_scrape_date = int(data_value.strip())
        


11026

### Section 2.10: OMV

In [1054]:
# url to parse from "https://www.sgcarmart.com/used_cars/info.php?ID=862816&DL=2397"
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862869&DL=2976"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1055]:
main_page2.find_all(class_='row_info')[8]

<div class="row_info">$21,967</div>

In [1056]:
main_page2.find_all(class_='row_info')[8].text

'$21,967'

In [1057]:
main_page2.find_all(class_='row_info')[8].text.split('$')

['', '21,967']

In [1058]:
main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')

['21', '967']

In [1059]:
main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')[0] + main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')[1]

'21967'

In [1061]:
# Include an NA handling case
# 3 scenarios:
# 'NA'
# '$21,967'
# '$900'

data_value = main_page2.find_all(class_='row_info')[8].text.split('$') 
# Splits data into ['', '21,967'], ['','900'] or ['NA'] format

if len(data_value) < 2:
    omv = np.nan
    
else:
    try:
        omv = int(main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')[0] +\
                  main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')[1])  # Will fail on index error if try to split 900
    except IndexError:
        omv = int(main_page2.find_all(class_='row_info')[8].text.split('$')[1])
omv

21967

In [1070]:
# Define a function that retrieves omv based on a parsed listing url
def omv_error_handler(data_value):
    if len(data_value) < 2:  # deals iwth ['NA'] input
        omv = np.nan

    else:
        try:
            omv = int(main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')[0] +\
                      main_page2.find_all(class_='row_info')[8].text.split('$')[1].split(',')[1])  # Will fail on index error if try to split 900
        except IndexError:
            omv = int(main_page2.find_all(class_='row_info')[8].text.split('$')[1])
    return omv


def omv_retrieval(parsed_url):
    data_value = main_page2.find_all(class_='row_info')[8].text.split('$') 
    # Splits data into ['', '21,967'], ['','900'] or ['NA'] format for input into error function
    
    omv = omv_error_handler(data_value)
    return omv     


listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862869&DL=2976"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

omv_retrieval(main_page2)

21967

### Section 2.11: COE as of Today

In [1096]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1097]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         N.A.                                     </div>,
 <div class="row_info">
                                     	N.A.									</div>,
 <div class="row_info">
                                     	$27,873 as of today (<a class="drg_pop" href="/used_cars/popups/deregValue.php?ID=862874&amp;DL=2854">change</a>)                                    </div>,
 <div class="row_info">$42,302</div>,
 <div class="row_info">
 									2,999 cc                                    </div>,
 <div class="row_info">
                                     2,500 kg                                    </div>,
 <div class="row_info">2015</div>,
 <div class="row_info">Manual</div>,
 <div class="row_info">$30,804</div>,
 <div class="row_info">$1,541</div>,
 <div class="row_info">1</div>]

In [1098]:
main_page2.find_all(class_='row_info')[3]

<div class="row_info">$42,302</div>

In [1099]:
data_value = main_page2.find_all(class_='row_info')[3].text.split('$')

In [1100]:
data_value

['', '42,302']

In [1101]:
data_value[1]

'42,302'

In [1102]:
data_value[1].split(',')[0] + data_value[1].split(',')[1]

'42302'

In [1106]:
# Write a function to retrieve COE as of today from a parsed listing url
def coe_error_handler(data_value):
    if len(data_value) < 2:  # deals iwth ['NA'] input
        coe_from_scrape_date = np.nan

    else:
        try:
            coe_from_scrape_date = int(data_value[1].split(',')[0] +\
                                       data_value[1].split(',')[1])  # Will fail on index error if try to split 900
        except IndexError:
            coe_from_scrape_date = int(data_value[1])
    return coe_from_scrape_date


def coe_retrieval(parsed_url):
    data_value = parsed_url.find_all(class_='row_info')[3].text.split('$')
    
    coe_from_scrape_date = coe_error_handler(data_value)
    return coe_from_scrape_date
    
    
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

coe_retrieval(main_page2)

42302

### Section 2.12: ARF 

In [1110]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1111]:
main_page2.find_all(class_='row_info')[9]

<div class="row_info">$1,541</div>

In [1112]:
main_page2.find_all(class_='row_info')[9].text

'$1,541'

In [1114]:
data_value = main_page2.find_all(class_='row_info')[9].text.split('$')
data_value

['', '1,541']

In [1115]:
# Write a function that retrieves ARF based on a parsed listing url
def error_handler(data_value):
    if len(data_value) < 2:  # deals iwth ['NA'] input
        desired_value = np.nan

    else:
        try:
            desired_value = int(data_value[1].split(',')[0] +\
                                       data_value[1].split(',')[1])  # Will fail on index error if try to split 900
        except IndexError:
            desired_value = int(data_value[1])
    return desired_value


def arf_retrieval(parsed_url):
    data_value = parsed_url.find_all(class_='row_info')[9].text.split('$')
    arf = error_handler(data_value)
    return arf
    
    
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

arf_retrieval(main_page2)
    

1541

### Section 2.13: Engine Capacity (cc)

In [1116]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         N.A.                                     </div>,
 <div class="row_info">
                                     	N.A.									</div>,
 <div class="row_info">
                                     	$27,873 as of today (<a class="drg_pop" href="/used_cars/popups/deregValue.php?ID=862874&amp;DL=2854">change</a>)                                    </div>,
 <div class="row_info">$42,302</div>,
 <div class="row_info">
 									2,999 cc                                    </div>,
 <div class="row_info">
                                     2,500 kg                                    </div>,
 <div class="row_info">2015</div>,
 <div class="row_info">Manual</div>,
 <div class="row_info">$30,804</div>,
 <div class="row_info">$1,541</div>,
 <div class="row_info">1</div>]

In [1117]:
main_page2.find_all(class_='row_info')[4]

<div class="row_info">
									2,999 cc                                    </div>

In [1118]:
main_page2.find_all(class_='row_info')[4].text

'\r\n\t\t\t\t\t\t\t\t\t2,999 cc                                    '

In [1119]:
main_page2.find_all(class_='row_info')[4].text.strip()

'2,999 cc'

In [1121]:
data_value = main_page2.find_all(class_='row_info')[4].text.strip().split('cc')
data_value

['2,999 ', '']

In [1123]:
def engine_capacity_error_handler(data_value):
    if len(data_value) < 2:  # deals iwth ['NA'] input
        desired_value = np.nan

    else:
        try:
            desired_value = int(data_value[0].split(',')[0] +\
                                       data_value[0].split(',')[1])  # Will fail on index error if try to split 900
        except IndexError:
            desired_value = int(data_value[0])
    return desired_value


def engine_capacity_retrieval(parsed_url):
    data_value = main_page2.find_all(class_='row_info')[4].text.strip().split('cc')
    
    engine_capacity = engine_capacity_error_handler(data_value)
    return engine_capacity




listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862869&DL=2976"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

engine_capacity_retrieval(main_page2)


1461

### Section 2.13: Power (kW) (FOR FUTURE CONSIDERATION. NO TIME TO SCRAPE ALREADY)

In [1135]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862831&DL=3417"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1136]:
main_page2.find_all(class_='row_info')

[<div class="row_info">
                                         90,602 km (8.4k /yr)                                     </div>,
 <div class="row_info">
                                     	$442 /yr									</div>,
 <div class="row_info">
                                     	$11,026 as of today (<a class="drg_pop" href="/used_cars/popups/deregValue.php?ID=862831&amp;DL=3417">change</a>)                                    </div>,
 <div class="row_info">$13,024</div>,
 <div class="row_info">
 									1,086 cc                                    </div>,
 <div class="row_info">
                                     872 kg                                    </div>,
 <div class="row_info">2008</div>,
 <div class="row_info">Auto</div>,
 <div class="row_info">$8,432</div>,
 <div class="row_info">$8,432</div>,
 <div class="row_info">47.1 kW (63 bhp)</div>,
 <div class="row_info">2</div>]

In [1137]:
main_page2.find_all(class_='row_info')[-2].text

'47.1 kW (63 bhp)'

In [1138]:
main_page2.find_all(class_='row_info')[-2].text.split()

['47.1', 'kW', '(63', 'bhp)']

In [1139]:
power_kw = float(main_page2.find_all(class_='row_info')[-2].text.split()[0])
power_kw

47.1

## Curb Weight

In [1144]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862869&DL=2976"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1145]:
main_page2.find_all(class_='row_info')[5]

<div class="row_info">
                                    1,280 kg                                    </div>

In [1146]:
main_page2.find_all(class_='row_info')[5].text

'\r\n                                    1,280 kg                                    '

In [1148]:
data_value = main_page2.find_all(class_='row_info')[5].text.split()

['1,280', 'kg']

In [1151]:
def curb_weight_error_handler(data_value):
    if len(data_value) < 2:  # deals iwth ['NA'] input
        desired_value = np.nan

    else:
        try:
            desired_value = int(data_value[0].split(',')[0] +\
                                       data_value[0].split(',')[1])  # Will fail on index error if try to split 900
        except IndexError:
            desired_value = int(data_value[0])
    return desired_value


def curb_weight_retrieval(parsed_url):
    data_value = parsed_url.find_all(class_='row_info')[5].text.split()
    curb_weight = curb_weight_error_handler(data_value)
    return curb_weight


listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862831&DL=3417"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

curb_weight_retrieval(main_page2)
    

872

### Section 2.15: No. of owners

In [1159]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1157]:
main_page2.find_all(class_='row_info')[-1]

<div class="row_info">1</div>

In [1164]:
# Define a function to retrieve the no of owners from a parsed listing url
def owner_retrieval(parsed_url):
    no_of_owners = int(parsed_url.find_all(class_='row_info')[-1].text)
    return no_of_owners

listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862832&DL=2934"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

owner_retrieval(main_page2)

4

### Section 2.16: Type of Vehicle

In [None]:
listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862874&DL=2854"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

In [1165]:
main_page2.find(class_='row_bg1')

<tr class="row_bg1">
<td width="115"><strong>Type of Vehicle</strong></td>
<td colspan="3"><a href="listing.php?VEH=12">Luxury Sedan</a></td>
</tr>

In [1170]:
main_page2.find(class_='row_bg1').find_all('a')

[<a href="listing.php?VEH=12">Luxury Sedan</a>]

In [1172]:
main_page2.find(class_='row_bg1').find_all('a')[0].text

'Luxury Sedan'

In [1177]:
# Define a function that returns the type of vehicle given a parsed listing url

def type_of_vehicle_retrieval(parsed_url):
    type_of_vehicle = parsed_url.find(class_='row_bg1').find_all('a')[0].text
    return type_of_vehicle


listing_url = "https://www.sgcarmart.com/used_cars/info.php?ID=862869&DL=2976"
response2 = requests.get(listing_url)
page2 = response2.text
main_page2 = BeautifulSoup(page2, 'lxml')

type_of_vehicle_retrieval(main_page2)

'Van'