In [1]:
# Importing the necessary packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [3]:
def listing_html_grabber(url):
    """Returns a list of 100 individual listings per main url
    ---
    input: Requires a url to scrape from: eg: url = 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&RPG=100&AVL=2&VEH=2'
    output: a list of 100 individual listings per main url
    """
    
    # Base url, or you can think of this as the individual car listing prefix
    base_url = 'https://www.sgcarmart.com/used_cars/'
    
    # Make a request to the website and get the object
    content = requests.get(url)

    # Parse the HTML text
    soup = BeautifulSoup(content.text,'html.parser')

    # Find every single URL in the webpage , refer to this post: 
    # https://stackoverflow.com/questions/46490626/getting-all-links-from-a-page-beautiful-soup
    # This returns a list of every tag that contains a link in the webpage
    links = soup.find_all('a')

    # Create holder for each individual car listing url in a main url
    listing_urls = []

    for link in links:
        # Get the link
        suffix = link.get('href')

        # Check if 'ID=' and 'DL=' exist in the string
        if ('ID=' in suffix) and ('DL=' in suffix):

            # Concatenate the two strings if they do
            listing_url = base_url + suffix
            # Append result to the list
            listing_urls.append(listing_url)
            
    return listing_urls
    

In [6]:
main_page_listing_list = []
for idx, link in enumerate(range(2)):
    url = "https://www.sgcarmart.com/used_cars/listing.php?BRSR=" + str(idx * 100) + "&RPG=100&AVL=2&VEH=2"
    main_page_listing_list.append(url)
    
main_page_listing_list


['https://www.sgcarmart.com/used_cars/listing.php?BRSR=0&RPG=100&AVL=2&VEH=2',
 'https://www.sgcarmart.com/used_cars/listing.php?BRSR=100&RPG=100&AVL=2&VEH=2']

In [7]:
# Applying Listing URL HTML grabber to each main listing page
for main_link in main_page_listing_list:
    listing_urls = listing_html_grabber(main_link)
    time.sleep(5)

['https://www.sgcarmart.com/used_cars/info.php?ID=862861&DL=1000',
 'https://www.sgcarmart.com/used_cars/info.php?ID=862861&DL=1000',
 'https://www.sgcarmart.com/used_cars/info.php?ID=852211&DL=3317',
 'https://www.sgcarmart.com/used_cars/info.php?ID=852211&DL=3317',
 'https://www.sgcarmart.com/used_cars/info.php?ID=848869&DL=2954',
 'https://www.sgcarmart.com/used_cars/info.php?ID=848869&DL=2954',
 'https://www.sgcarmart.com/used_cars/info.php?ID=843528&DL=2962',
 'https://www.sgcarmart.com/used_cars/info.php?ID=843528&DL=2962',
 'https://www.sgcarmart.com/used_cars/info.php?ID=840508&DL=2954',
 'https://www.sgcarmart.com/used_cars/info.php?ID=840508&DL=2954',
 'https://www.sgcarmart.com/used_cars/info.php?ID=832588&DL=3317',
 'https://www.sgcarmart.com/used_cars/info.php?ID=832588&DL=3317',
 'https://www.sgcarmart.com/used_cars/info.php?ID=862965&DL=3004',
 'https://www.sgcarmart.com/used_cars/info.php?ID=862965&DL=3004',
 'https://www.sgcarmart.com/used_cars/info.php?ID=862185&DL=29

In [9]:
def attribute_scraper(listing_url_list):
    """Accepts a list of individual car listing urls and returns variables of importance for linear regression analysis.
    ---
    Input: List of urls
    Output: Variables
    """
    assert listing_url_list == list, "Entry has to be a list"
    
    time.sleep(5)