<a href="https://colab.research.google.com/github/tazar09/cars_dimensions/blob/main/midsize_05-feb2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [152]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# Scrapping the website

In [2]:
url = 'https://www.automobiledimension.com/compact-suv.php'
h3 = requests.get(url, verify = False)
html_content = h3.text

# Creating BeautifulSoup objects

In [3]:
soup = BeautifulSoup(html_content, 'html.parser')
main_table = soup.find('main')

In [4]:
temp_list = []
for i in main_table.find_all('div', class_ = 'unit'):
  link = str(i.a['href'])
  temp_list.append('https://www.automobiledimension.com' + link)
temp_list[0:3]

['https://www.automobiledimension.com/model/ford/puma',
 'https://www.automobiledimension.com/model/audi/q2',
 'https://www.automobiledimension.com/model/nissan/juke']

# Extract model description

In [5]:
def extract_main_text(text):
  search = text.find('div', class_ = 'interior-text')
  return search.text

In [6]:
def extract_boot_space(text):
  search = text.find_all('figcaption')[1]
  boot_space = search.text
  return boot_space

In [8]:
main_text_list = []
boot_space_list = []
for i in temp_list:
  content = requests.get(i, verify = False)
  content = BeautifulSoup(content.text, 'html.parser')
  main_text_list.append(extract_main_text(content))
  boot_space_list.append(extract_boot_space(content))

# Creating dataframe

In [363]:
df = pd.DataFrame({'description': main_text_list, 'boot_space': boot_space_list})

In [364]:
def average(text):
  min, max = text.split('-')
  average = (int(max) + int(min))/2
  return average, int(min)

In [365]:
df['boot_space'] = df['boot_space'].str.replace('Boot space: ', '')
df['boot_space'] = df['boot_space'].str.replace(' liters.','')
df['boot_space'] = df['boot_space'].apply(lambda x: x.replace('* - ', '-') if len(x)>3 else x)

In [366]:
df['boot_mean'] = df['boot_space'].apply(lambda x: average(x)[0] if len(x) > 3 else int(x))
df['boot_normal'] = df['boot_space'].apply(lambda x: average(x)[1] if len(x) > 3 else int(x))


In [367]:
df.drop('boot_space', axis = 1, inplace = True)

In [368]:
df.head()

Unnamed: 0,description,boot_mean,boot_normal
0,"The Ford Puma has a length of 4207 mm, a heigh...",428.5,401
1,"The Audi Q2 has a length of 4208 mm, a height ...",405.0,405
2,"The Nissan Juke has a length of 4210 mm, a hei...",388.0,354
3,"The Opel Crossland has a length of 4212 mm, a ...",410.0,410
4,"The SsangYong Tivoli has a length of 4225 mm, ...",427.0,427


# Extract dimensions

In [369]:
def extract_dim(text):
  patterns = {
  'name': r'The\s+(.*?)\s+has',
  "length": r"length\s+of\s+(\d+)\s+mm",
  'measurement': r'measurement\s+of\s+(\d+)\s+millimeters',
  "height": r"height\s+of\s+(\d+)\s+mm",
  "width": r"width\s+of\s+(\d+)\s+mm",
  "ground_clearance": r"ground\s+clearance\s+of\s+(\d+)\s+cm",
  "category": r"category\s+of\s+(.*?)\."
  }

  car_info = {}

  for key, pattern in patterns.items():
    match = re.search(pattern, text)
    if match:
      car_info[key] = match.group(1) # Convert matched value to integer
    else:
      car_info[key] = None  # Set missing values to None

  return list(car_info.values()), list(car_info.keys())

In [370]:
columns = extract_dim(df['description'][0])[1]
for i in columns:
  df[i] = df['description'].apply(lambda x: extract_dim(x)[0][columns.index(i)])

In [371]:
# reordering the columns

df = df[['name', 'category','length', 'height',
       'width','measurement', 'boot_normal', 'boot_mean', 'ground_clearance']]

df.head()

Unnamed: 0,name,category,length,height,width,measurement,boot_normal,boot_mean,ground_clearance
0,Ford Puma,compact SUV,4207,1537,1805,1930.0,401,428.5,16
1,Audi Q2,compact SUV,4208,1508,1794,2009.0,405,405.0,15
2,Nissan Juke,compact SUV,4210,1595,1800,1983.0,354,388.0,17
3,Opel Crossland,compact SUV,4212,1605,1765,1976.0,410,410.0,17
4,SsangYong Tivoli,compact SUV,4225,1621,1810,,427,427.0,18


In [372]:
numeric_columns = df.columns[2:]
for i in numeric_columns:
  df[i] = df[i].apply(lambda x: np.NaN if (x == None) else int(x))

# Imputing missing values of measurement.

In [373]:
df['mirrors'] = (pd.to_numeric(df['measurement'], errors='coerce') - df['width']).fillna(np.NaN)

In [374]:
df['mirror_rate'] = pd.to_numeric(df['mirrors'], errors = 'coerce') / df['width']
df['measurement']  = df['measurement'].replace('', np.NaN)
df['ground_clearance']  = df['ground_clearance'].replace('', np.NaN)
df['boot_mean']  = df['boot_mean'].replace('None', np.NaN)
df['boot_normal']  = df['boot_normal'].replace('None', np.NaN)

In [375]:
df['mirror_rate'].fillna(np.mean(df['mirror_rate']), inplace=True)
df['measurement'].fillna(round((df['width'] * (1+ df['mirror_rate'])), 0), inplace=True)

# Extract brand and model

In [None]:
# Define a regular expression pattern
pattern = r"^(.*?)\s+(.*)$"

# Extract brand and model using str.extract
df[['brand', 'model']] = df['name'].str.extract(pattern, expand=True)

In [405]:
df.columns

Index(['name', 'category', 'length', 'height', 'width', 'measurement',
       'boot_normal', 'boot_mean', 'ground_clearance', 'mirrors',
       'mirror_rate', 'brand', 'model'],
      dtype='object')

In [406]:
df = df[['brand', 'model', 'category', 'length', 'height', 'width', 'measurement',
       'boot_normal', 'boot_mean', 'ground_clearance', 'mirrors',
       'mirror_rate']]
df.head(100)

Unnamed: 0,brand,model,category,length,height,width,measurement,boot_normal,boot_mean,ground_clearance,mirrors,mirror_rate
0,Ford,Puma,compact SUV,4207,1537,1805,1930.0,401,428,16.0,125.0,0.069252
1,Audi,Q2,compact SUV,4208,1508,1794,2009.0,405,405,15.0,215.0,0.119844
2,Nissan,Juke,compact SUV,4210,1595,1800,1983.0,354,388,17.0,183.0,0.101667
3,Opel,Crossland,compact SUV,4212,1605,1765,1976.0,410,410,17.0,211.0,0.119547
4,SsangYong,Tivoli,compact SUV,4225,1621,1810,2018.0,427,427,18.0,,0.115118
5,Renault,Captur,compact SUV,4227,1566,1797,2003.0,265,335,17.0,206.0,0.114636
6,Mitsubishi,ASX,compact SUV,4227,1566,1797,2003.0,265,333,17.0,206.0,0.114636
7,Volvo,EX30,compact SUV,4233,1549,1836,2032.0,318,318,17.0,196.0,0.106754
8,Jeep,Renegade,compact SUV,4236,1697,1805,2013.0,330,340,17.0,,0.115118
9,Volkswagen,T-Roc,compact SUV,4236,1573,1819,2012.0,445,445,16.0,193.0,0.106102
