In [None]:
import requests
import numpy as np
import pandas as pd
import lxml.html as lxl
import re

In [None]:
# Number of listing index pages to crawl: urls_scraping requests pages 1..MAX_PAGE.
MAX_PAGE = 470

# 1. Functions

## Scraping the webpages to get all the car urls

In [None]:
## Get all the urls for all the listed used vehicles on bonbanh.com
def urls_scraping(base_url = 'https://bonbanh.com/oto-cu-da-qua-su-dung', max_page = None, timeout = 30):
    """Collect the detail-page urls of every car listed on the index pages.

    Index pages are requested in order as ``base_url + '/page,' + str(i)`` for
    ``i`` in ``1..max_page``. Crawling stops at the first page that cannot be
    fetched (network error, timeout or HTTP error status); the urls gathered
    so far are still returned.

    Parameters
    ----------
    base_url : str
        Url of the listing index, without the page suffix.
    max_page : int or None
        Last index page to request. ``None`` (default) uses the module-level
        ``MAX_PAGE`` constant.
    timeout : float
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    list of str
        Absolute urls of the individual car listings, in page order.
    """
    if max_page is None:
        max_page = MAX_PAGE

    urls = []
    for page_number in range(1, max_page + 1):
        page = base_url + '/page,' + str(page_number)
        try:
            response = requests.get(page, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            # Only request failures end the crawl; KeyboardInterrupt and
            # programming errors are no longer swallowed silently.
            print("Stopping url collection at " + page + ": " + repr(exc))
            break
        root = lxl.fromstring(response.content)
        # The hrefs are prefixed with the site root to form absolute urls.
        urls += ['https://bonbanh.com/' + link for link in root.xpath('//div[@id="s-list-car"]//a[@itemprop="url"]/@href')]

    return urls

## Parse each car url to extract its information and return a dataframe

In [None]:
def convert_text_to_number(text):
  """Convert a price string such as '1 Tỷ 279 Triệu' into a number of millions.

  One 'Tỷ' counts as 1000 'Triệu', so '1 Tỷ 279 Triệu' -> 1279, '2 Tỷ' -> 2000
  and '950 Triệu' -> 950. Surrounding whitespace is ignored. When the text
  does not match the expected layout, a message is printed and 0 is returned.
  """
  text = text.strip()
  has_billion = "Tỷ" in text

  # Pick the layout to validate against: with or without a billion part.
  if has_billion:
    pattern = r'^(\d+)(?: Tỷ)?(?: (\d{1,3}) Triệu)?$'
  else:
    pattern = r'^(\d+)(?: Triệu)?$'

  found = re.match(pattern, text)
  if found is None:
    print("So tien \'"+ text + "\' khong hop le")
    return 0

  if not has_billion:
    # Millions only: the single captured group is the amount.
    return int(found.group(1))

  # Billions, optionally followed by a millions remainder.
  billions, millions = found.groups()
  amount = int(billions) * 1000
  if millions:
    amount += int(millions)
  return amount



In [None]:
def _clean_text(raw):
  """Drop newline and tab characters from ``raw`` and strip surrounding whitespace."""
  return raw.replace("\n", "").replace("\t", "").strip()


def _spec_text(root, column, row):
  """Return the cleaned first text node at ``div[column]/div[row]`` of the details box.

  Raises IndexError when the page has no such node.
  """
  xpath = '/html/body/div[1]/div[3]/div[5]/div[1]/div[{}]/div[{}]/div[2]/span/text()'.format(column, row)
  return _clean_text(root.xpath(xpath)[0])


def _parse_listing(root):
  """Build the record (dict) for one parsed listing page; raises if a field is missing."""
  title = root.xpath('//div[@class="title"]/h1/text()')[0]
  flat_title = title.replace("\n", "").replace("\t", "")
  # The price is whatever follows the LAST hyphen. Splitting from the right
  # keeps hyphens that belong to the car name (previously the name was cut
  # at its first hyphen, dropping the rest of it).
  title_parts = flat_title.rsplit("-", 1)
  return {
    'name': title_parts[0],
    'make': root.xpath('//*[@id="wrapper"]/div[2]/span[3]/a/span/strong/text()')[0],
    'model': root.xpath('//*[@id="wrapper"]/div[2]/span[4]/a/span/strong/text()')[0],
    'price': convert_text_to_number(title_parts[-1]),
    'year': _clean_text(root.xpath('//span[@class="inp"]/text()')[0]),
    'engine': _spec_text(root, 2, 1),
    'exterior_color': _spec_text(root, 2, 2),
    'interior_color': _spec_text(root, 2, 3),
    # First space-separated token with thousands separators removed.
    'mileage': int(_spec_text(root, 1, 3).split(" ")[0].replace(",", "")),
    'num_seats': _spec_text(root, 2, 4).split(" ")[0],
    'num_doors': _spec_text(root, 2, 5).split(" ")[0],
    'origin': _spec_text(root, 1, 4),
    'style': _spec_text(root, 1, 5),
    'transmission': _spec_text(root, 1, 6),
    'drivetrain': _spec_text(root, 2, 6),
  }


def page_scraping(urls, timeout=30):
  """Download every listing in ``urls`` and return the parsed records as a DataFrame.

  Parameters
  ----------
  urls : list of str
      Absolute urls of individual car listings.
  timeout : float
      Seconds to wait for each HTTP response before giving up on that listing.

  Returns
  -------
  pandas.DataFrame
      One row per successfully parsed listing with the columns name, make,
      model, price (in millions, see convert_text_to_number), year, engine,
      exterior_color, interior_color, mileage, num_seats, num_doors, origin,
      style, transmission and drivetrain. Empty when ``urls`` is empty.

  A listing that cannot be fetched or parsed is skipped; its 1-based index,
  url and the error are printed. Progress is printed roughly every percent.
  """
  records = []
  n = len(urls)
  percent = n // 100 + 1
  for index, url in enumerate(urls, start=1):
    try:
      response = requests.get(url, timeout=timeout)
      response.raise_for_status()
      records.append(_parse_listing(lxl.fromstring(response.content)))
    except Exception as exc:
      # Best effort: one bad listing must not abort a multi-hour crawl, but
      # report which url failed and why. KeyboardInterrupt is not caught.
      print("An exception occurred with index: " + str(index) + " (" + url + "): " + repr(exc))
    # Report progress outside the try block so a failed listing does not
    # suppress the percentage tick.
    if index % percent == 0:
      print(str(int(index*100/n)) + "%")
  return pd.DataFrame(records)

## Use multi-processing to scrape all urls and merge into one dataframe (TODO: not implemented; the scraping below runs sequentially)

# 2. Main function

## Getting all urls

In [None]:
%%time
urls=urls_scraping() # extract all vehicle urls from the first MAX_PAGE (470) listing pages
print(len(urls))

9403
CPU times: user 1min, sys: 1.12 s, total: 1min 1s
Wall time: 14min 53s


In [None]:
%%time
# Fetch and parse every listing url one after another; the recorded run took about 3 hours of wall time.
data = page_scraping(urls)

1%
2%
3%
4%
5%
6%
7%
8%
9%
10%
11%
12%
13%
14%
15%
16%
17%
18%
19%
20%
21%
An exception occurred with index: 2029
An exception occurred with index: 2030
22%
23%
24%
25%
26%
27%
28%
29%
30%
31%
32%
33%
34%
35%
36%
37%
38%
An exception occurred with index: 3630
An exception occurred with index: 3631
39%
40%
41%
42%
43%
44%
45%
46%
47%
48%
49%
50%
51%
52%
53%
54%
55%
56%
57%
58%
59%
60%
61%
62%
63%
64%
65%
66%
67%
68%
69%
70%
71%
72%
73%
74%
75%
76%
An exception occurred with index: 7297
77%
An exception occurred with index: 7353
78%
79%
80%
81%
82%
83%
84%
85%
86%
87%
88%
89%
An exception occurred with index: 8515
An exception occurred with index: 8516
90%
91%
92%
93%
94%
95%
96%
98%
99%
CPU times: user 17min 55s, sys: 20.8 s, total: 18min 16s
Wall time: 2h 59min 36s


## Returning the raw dataframe

In [None]:
# Report (rows, columns), then preview the first 100 scraped listings.
print(data.shape)
data.head(100)

(9395, 15)


Unnamed: 0,name,make,model,price,year,engine,exterior_color,interior_color,mileage,num_seats,num_doors,origin,style,transmission,drivetrain
0,Xe Ford Everest Titanium 2.0L 4x2 AT 2022,Ford,Everest,950,2022,Dầu 2.0 L,Đen,Đen,30000,7,5,Nhập khẩu,SUV,Số tự động,RFD - Dẫn động cầu sau
1,Xe Lexus RX 200t 2017,Lexus,RX,1990,2017,Xăng 2.0 L,Trắng,-,50000,5,5,Nhập khẩu,SUV,Số tự động,AWD - 4 bánh toàn thời gian
2,Xe Mercedes Benz S class S450L Luxury 2020,Mercedes Benz,S class,2790,2020,Xăng 3.0 L,Trắng,Nâu,30000,5,4,Lắp ráp trong nước,Sedan,Số tự động,RFD - Dẫn động cầu sau
3,Xe Toyota Vios 1.5G 2019,Toyota,Vios,448,2019,Xăng 1.5 L,Đen,Kem,46000,5,4,Lắp ráp trong nước,Sedan,Số tự động,FWD - Dẫn động cầu trước
4,Xe Volvo S60 T5 R,Volvo,S60,1279,2021,Xăng 2.0 L,Trắng,Đen,19000,5,4,Nhập khẩu,Sedan,Số tự động,AWD - 4 bánh toàn thời gian
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Xe Mercedes Benz C class C200 Exclusive 2019,Mercedes Benz,C class,990,2019,Xăng 2.0 L,Đen,Đen,41000,5,4,Lắp ráp trong nước,Sedan,Số tự động,RFD - Dẫn động cầu sau
96,Xe Kia Carnival Signature 3.5G 2022,Kia,Carnival,1539,2022,Xăng 3.5 L,Trắng,Nâu,13000,7,5,Lắp ráp trong nước,Van/Minivan,Số tự động,FWD - Dẫn động cầu trước
97,Xe Honda City 1.5 AT 2017,Honda,City,356,2017,Xăng 1.5 L,Bạc,Đen,75000,5,4,Lắp ráp trong nước,Sedan,Số tự động,FWD - Dẫn động cầu trước
98,Xe Ford Ranger Wildtrak 2.0L 4x4 AT 2022,Ford,Ranger,759,2022,Dầu 2.0 L,Đen,Đen,20000,5,4,Lắp ráp trong nước,Bán tải / Pickup,Số tự động,4WD - Dẫn động 4 bánh


In [None]:
# Persist the scraped listings as UTF-8 CSV. The 'lmao' directory must already exist:
# to_csv does not create missing parent directories.
data.to_csv('lmao/usedCarListing470first.csv', encoding = 'utf-8')