<a href="https://colab.research.google.com/github/tazar09/cars_dimensions/blob/main/comparison_02-feb2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

import warnings
warnings.filterwarnings('ignore')

# Scraping the website

In [2]:
url = 'https://www.automobiledimension.com/compact-suv.php'
# NOTE(review): verify=False turns off TLS certificate validation. It is kept here
# to preserve the existing behaviour; confirm whether it is really needed.
# The timeout (seconds) stops the cell from hanging forever on a stalled connection.
h3 = requests.get(url, verify = False, timeout = 30)
# Fail fast on an HTTP error instead of silently parsing an error page
h3.raise_for_status()
html_content = h3.text

# Creating BeautifulSoup objects

In [3]:
# Parse the downloaded listing page using the 'html.parser' backend
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
# The anchors collected in the next step are looked up inside the page's <main> element
main_table = soup.find('main')
if main_table is None:
  # find() returns None when nothing matches; stop here with a clear message
  raise ValueError('No <main> element found in the downloaded page; the site layout may have changed')

# Extract model description

In [5]:
# Collect the relative links (those starting with '/model') found inside <main>.
# .get('href') is used so that anchors without an href attribute are skipped
# instead of raising KeyError.
temp_list = [
  href
  for href in (anchor.get('href') for anchor in main_table('a'))
  if href and href.startswith('/model')
]

In [6]:
# Turn every relative model path into an absolute URL
weblink = ['https://www.automobiledimension.com' + path for path in temp_list]

In [239]:
# Module-level accumulators filled by the two extraction helpers below
models_text = []
boot_space = []

def extract_boot_space(content):
  """Append the boot-space caption of one model page to ``boot_space``.

  The response body (``content.text``) is parsed, the second
  ``<figure class="interior-figure">`` is selected, and the text of its
  ``<figcaption>`` (if there is one) is appended to the module-level
  ``boot_space`` list.

  Returns the shared ``boot_space`` list, i.e. every caption collected so far,
  not just the one for this page.

  Raises IndexError when the page has fewer than two matching figures.
  """
  page = BeautifulSoup(content.text, 'html.parser')
  second_figure = page.find_all('figure', class_ = 'interior-figure')[1]
  caption = second_figure.find('figcaption')
  if not caption:
    # No caption on this figure: nothing is recorded for this page
    return boot_space
  boot_space.append(caption.text)
  return boot_space

# Function to extract the main description text

def extract_main_text(content):
  """Append the description paragraph of one model page to ``models_text``.

  The response body (``content.text``) is parsed and the text of the first
  ``<div class="interior-text">`` is appended to the module-level
  ``models_text`` list. Pages without such a div add nothing.

  Returns the shared ``models_text`` list, i.e. every description collected
  so far, not just the one for this page.
  """
  page = BeautifulSoup(content.text, 'html.parser')
  description_div = page.find('div', class_ = 'interior-text')
  if description_div:
    models_text.append(description_div.text)
  return models_text


In [240]:
# The extract_* helpers append to the module-level lists `boot_space` and
# `models_text`. Empty them first so that re-running this cell does not
# duplicate every row.
boot_space.clear()
models_text.clear()

boot_space_list = []
main_text_list = []

for link in weblink:
  # NOTE(review): verify=False turns off TLS certificate validation; kept to
  # preserve the existing behaviour. The timeout is in seconds.
  content = requests.get(link, verify = False, timeout = 30)
  # Stop on an HTTP error rather than trying to parse an error page
  content.raise_for_status()
  # Both helpers return the full accumulated list, so after the loop these
  # names refer to all captions / descriptions collected
  boot_space_list = extract_boot_space(content)
  main_text_list = extract_main_text(content)

In [241]:
# Spot-check: first scraped description
main_text_list[0]

'The Ford Puma has a length of 4207 mm, a height of 1537 mm, a width of 1805 mm without the exterior mirrors and a measurement of 1930 millimeters with the mirrors unfolded. Motorization: petrol and  petrol mild hybrid. The mild hybrid model is called Puma Ecoboost Hybrid and has a boot space of 401* liters. Because of its size, features and ground clearance of 16 cm, we classify the Ford Puma in the category of compact SUV.Do you like the trend of larger cars? 👍 👎'

In [256]:
# Spot-check: first scraped boot-space caption
boot_space_list[0]

'Boot space: 401* - 456 liters.'

# Extract dimensions

## Function for extracting

In [257]:
def extract_car_info(text):
  """Extract name, dimensions and category from a model description.

  Parameters
  ----------
  text : str
      Description paragraph, e.g. "The Ford Puma has a length of 4207 mm, ...".

  Returns
  -------
  list of (str, str) tuples
      Always in the order name, length, height, width, measurement,
      boot_space, ground_clearance, category. Every value is the matched
      text as a string (no numeric conversion); a field that is not found
      is reported as the string 'N/A'.
  """
  # The boot volume is worded in one of two ways; only this pattern differs
  # between the two kinds of description.
  if 'boot capacity varies between' in text:
    boot_pattern = r"varies\s+between\s+(.+?)\s+liters"
  else:
    boot_pattern = r"boot\s+space\s+of\s+(.*?)\s*liters"

  # One regular expression per data point; group 1 is the value to keep
  patterns = {
    'name': r'The\s+(.+?)\s+has',
    "length": r"length\s+of\s+(\d+)\s+mm",
    "height": r"height\s+of\s+(\d+)\s+mm",
    "width": r"width\s+of\s+(\d+)\s+mm",
    'measurement': r'measurement\s+of\s+(\d+)\s+millimeters',
    "boot_space": boot_pattern,
    "ground_clearance": r"ground\s+clearance\s+of\s+(\d+)\s+cm",
    "category": r"category\s+of\s+(.*?)\.",
  }

  car_info = {}
  for key, pattern in patterns.items():
    match = re.search(pattern, text)
    # Keep the raw matched string; mark missing fields with 'N/A'
    car_info[key] = match.group(1) if match else 'N/A'

  return list(car_info.items())

In [245]:
# Spot-check the parser on one scraped description (index 12)
extract_car_info(main_text_list[12])

[('name', 'Smart #1'),
 ('length', '4270'),
 ('height', '1636'),
 ('width', '1822'),
 ('measurement', 'N/A'),
 ('boot_space', '288 - 426'),
 ('ground_clearance', 'N/A'),
 ('category', 'compact SUV')]

In [270]:
def average_boot(text):
  """Return the midpoint of a boot-space range given as 'low-high'.

  Parameters
  ----------
  text : str
      Two integers separated by a single '-', e.g. '401-456' or '288 - 426'.
      Surrounding whitespace and '*' footnote markers (as in '401* - 456')
      are ignored.

  Returns
  -------
  float
      (low + high) / 2

  Raises
  ------
  ValueError
      If `text` does not split into exactly two parts on '-', or a part is
      not an integer.
  """
  # Local names avoid shadowing the built-ins min() and max()
  low, high = (int(part.replace('*', '')) for part in text.split('-'))
  return (high + low) / 2

In [274]:
df = pd.DataFrame({'description':main_text_list, 'boot_space':boot_space_list})

# extract_car_info key -> column label in df (labels unchanged, including the spaces)
field_to_column = {
  'name': 'name',
  'length': 'length',
  'height': 'height',
  'width': 'width',
  'measurement': 'measurement',
  'boot_space': 'boot space',
  'ground_clearance': 'ground clearance',
}
columns = list(field_to_column.values())

# Parse each description once and pick fields by key, rather than re-parsing
# the text for every column and relying on the position of each field
parsed_rows = [dict(extract_car_info(text)) for text in df['description']]
for field, column in field_to_column.items():
  df[column] = [row[field] for row in parsed_rows]

df = df.replace('N/A', '')

# Reduce a caption such as 'Boot space: 401* - 456 liters.' to '401 - 456'.
# regex=False makes every replacement literal regardless of the pandas version
# (the default for `regex` changed in pandas 2.0, and '.' / '*' are regex metacharacters).
boot_caption = (
  df['boot_space']
  .str.replace('Boot space: ', '', regex=False)
  .str.replace(' liters.', '', regex=False)
  .str.replace('*', '', regex=False)
)
# A '-' marks a range, which is averaged; anything else is a single integer value
df['boot_space'] = boot_caption.apply(lambda x: average_boot(x) if '-' in x else int(x))

# 'boot space' (parsed from the description) is hidden in the preview only;
# the column stays in df
df.drop(columns = 'boot space').head(100)

Unnamed: 0,description,boot_space,name,length,height,width,measurement,ground clearance
0,"The Ford Puma has a length of 4207 mm, a heigh...",428.5,Ford Puma,4207,1537,1805,1930.0,16.0
1,"The Audi Q2 has a length of 4208 mm, a height ...",405.0,Audi Q2,4208,1508,1794,2009.0,15.0
2,"The Nissan Juke has a length of 4210 mm, a hei...",388.0,Nissan Juke,4210,1595,1800,1983.0,17.0
3,"The Opel Crossland has a length of 4212 mm, a ...",410.0,Opel Crossland,4212,1605,1765,1976.0,17.0
4,"The SsangYong Tivoli has a length of 4225 mm, ...",427.0,SsangYong Tivoli,4225,1621,1810,,18.0
5,"The Renault Captur has a length of 4227 mm, a ...",335.5,Renault Captur,4227,1566,1797,2003.0,17.0
6,"The Mitsubishi ASX has a length of 4227 mm, a ...",333.0,Mitsubishi ASX,4227,1566,1797,2003.0,17.0
7,"The Volvo EX30 has a length of 4233 mm, a heig...",318.0,Volvo EX30,4233,1549,1836,2032.0,17.0
8,"The Jeep Renegade has a length of 4236 mm, a h...",340.5,Jeep Renegade,4236,1697,1805,,17.0
9,"The Volkswagen T-Roc has a length of 4236 mm, ...",445.0,Volkswagen T-Roc,4236,1573,1819,2012.0,16.0
