<a href="https://colab.research.google.com/github/tazar09/cars_dimensions/blob/main/midsize_07-feb2024__.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

import warnings
warnings.filterwarnings('ignore')
import time

# Scraping the website

In [2]:
# Listing page that links to every mid-size SUV model page.
url = 'https://www.automobiledimension.com/mid-size-suv.php'
# NOTE(review): verify=False disables TLS certificate verification; it is kept
# to preserve the original behaviour - confirm whether it is still required.
# The timeout stops the cell from hanging forever on a stalled connection.
h3 = requests.get(url, verify = False, timeout = 30)
# Fail here on an HTTP error instead of parsing an error page further down.
h3.raise_for_status()
html_content = h3.text

# Creating BeautifulSoup objects

In [3]:
# Parse the downloaded listing page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_content, features='html.parser')
# Keep only the <main> element; the model links are searched for inside it.
main_table = soup.find(name='main')

In [4]:
# Build one absolute URL per <div class="unit"> from the href of its first link.
temp_list = [
  'https://www.automobiledimension.com' + str(unit.a['href'])
  for unit in main_table.find_all('div', class_ = 'unit')
]
# Preview the first three URLs.
temp_list[:3]

['https://www.automobiledimension.com/model/hyundai/kona',
 'https://www.automobiledimension.com/model/renault/austral',
 'https://www.automobiledimension.com/model/kia/sportage']

# Extract model description

In [5]:
def extract_main_text(text):
  """Return the text of the first <div class="interior-text"> in a parsed page.

  `text` is the parsed page object; its `find` method is used for the lookup.
  Raises AttributeError when no such div exists (`find` returns None).
  """
  return text.find('div', class_ = 'interior-text').text

In [6]:
def extract_boot_space(text):
  """Return the text of the second <figcaption> element in a parsed page.

  `text` is the parsed page object; its `find_all` method is used.
  Raises IndexError when the page has fewer than two figcaptions.
  """
  captions = text.find_all('figcaption')
  return captions[1].text

In [7]:
main_text_list = []
boot_space_list = []

# Errors are handled per URL: one bad page must not end the whole crawl, and
# both lists always receive exactly one entry per URL so they stay aligned.
for page_url in temp_list:
  main_text = None
  boot_space = None
  try:
    # NOTE(review): verify=False is kept from the original - confirm it is needed.
    response = requests.get(page_url, verify = False, timeout = 30)
    response.raise_for_status()
    page = BeautifulSoup(response.text, 'html.parser')
    main_text = extract_main_text(page)
    boot_space = extract_boot_space(page)
  except Exception as error:
    # Whatever was not extracted stays None; report the cause instead of hiding it.
    print(f'Could not fully scrape {page_url}: {error!r}')
  main_text_list.append(main_text)
  boot_space_list.append(boot_space)
  # Pause between requests so the site is not hit in a tight loop.
  time.sleep(3)

# Creating dataframe

In [124]:
# One row per scraped page: the description text and the raw boot-space caption.
df = pd.DataFrame(
  data = {
    'description': main_text_list,
    'boot_space': boot_space_list,
  }
)
# df.head(100)

# Cleaning the dataframe

In [125]:
# Remove the caption boilerplate so only the volume figures remain.
# regex=False is explicit because '.' and '*' are regex metacharacters and the
# default of `regex` differs between pandas versions.
df['boot_space'] = (
  df['boot_space']
  .str.replace('Boot space: ', '', regex = False)
  .str.replace(' liters.', '', regex = False)
  .str.replace('*', '', regex = False)
)
df.head(100)

Unnamed: 0,description,boot_space
0,"The Hyundai Kona has a length of 4355 mm, a he...",493 - 466
1,"The Renault Austral has a length of 4510 mm, a...",430 - 500
2,"The Kia Sportage has a length of 4515 mm, a he...",540 - 591
3,"The Alfa-Romeo Tonale has a length of 4530 mm,...",500
4,"The Volkswagen Tiguan has a length of 4540 mm,...",
5,"The LynkCo 01 has a length of 4541 mm, a heigh...",466
6,"The Peugeot 3008 has a length of 4542 mm, a he...",520
7,The Mitsubishi Eclipse Cross has a length of 4...,404
8,"The Dacia Jogger has a length of 4547 mm, a he...",(5) 565-708 - (7) 160
9,"The BMW X2 has a length of 4554 mm, a height o...",560


In [126]:
# Spot check: length of the cleaned boot-space string in the row labelled 31.
x = df.at[31, 'boot_space']
len(x)

23

In [127]:
def extract_5_seats_boot(text):
  """Return the 5-seat boot volume from a cleaned boot-space string.

  Rules, in order:
    * a 3-character value (e.g. '500') is returned unchanged;
    * if the text contains '(5)', the digits after '(5)' plus one whitespace
      character are returned;
    * otherwise, if the text contains '-', the last run of three digits is
      returned;
    * anything else yields ''.

  Non-string input (None / NaN) and text where the expected number cannot be
  found also yield '' instead of raising.
  """
  # Missing values are not strings; len() and `in` would raise on them.
  if not isinstance(text, str):
    return ''
  # rows with only a 3-character value
  if len(text) == 3:
    return text
  # rows with 5 and 7 seated boot space
  if '(5)' in text:
    match = re.search(r'\(5\)\s(\d+)', text)
    return match.group(1) if match else ''
  # rows w/wo hybrid: the greedy '.*' makes the group capture the LAST 3-digit run
  if '-' in text:
    match = re.search(r'.*(\d{3})', text)
    return match.group(1) if match else ''
  # blank and all other rows
  return ''

def extract_7_seats_boot(text):
  """Return the 7-seat boot volume from a cleaned boot-space string.

  The digits after '(7)' plus one whitespace character are returned as a
  string. '' is returned when the text has no '(7) ' marker, when no digits
  follow the marker, or when the input is not a string (None / NaN).
  """
  # Missing values are not strings; the `in` test would raise on them.
  if not isinstance(text, str) or '(7) ' not in text:
    return ''
  match = re.search(r'\(7\)\s(\d+)', text)
  return match.group(1) if match else ''

def extract_5_hybrid(text):
  """Return the boot volume for the `boot_5_hybrid` column.

  * Text with '-' but no '(' (e.g. '430 - 500'): the numerically smallest of
    the dash-separated numbers is returned as a string. At least two numeric
    parts are required, otherwise '' is returned.
  * Text with both '(' and '-' (e.g. '(5) 565-708 - (7) 160'): three 3-digit
    numbers are captured and their median is returned as an int.
  * Anything else, including non-string input (None / NaN), yields ''.
  """
  # Missing values are not strings; the `in` tests would raise on them.
  if not isinstance(text, str) or '-' not in text:
    return ''

  if '(' not in text:
    parts = [part.strip() for part in text.split('-')]
    numbers = [part for part in parts if part.isdecimal()]
    if len(numbers) < 2:
      return ''
    # Compare as integers: as strings, '100' would sort before '95'.
    return min(numbers, key=int)

  pattern = r".*(\d{3}).*(\d{3}).*(\d{3})"
  match1 = re.search(pattern, text)
  if not match1:
    return ''
  # Middle of the three captured values.
  return sorted(int(value) for value in match1.groups())[1]

In [128]:
# Derive the three boot-volume columns from the cleaned boot-space text.
df['boot_space_5'] = df['boot_space'].apply(extract_5_seats_boot)
df['boot_5_hybrid'] = df['boot_space'].apply(extract_5_hybrid)
df['boot_space_7'] = df['boot_space'].apply(extract_7_seats_boot)

# Preview without the raw boot-space column.
df.drop(columns = 'boot_space').head(100)

Unnamed: 0,description,boot_space_5,boot_5_hybrid,boot_space_7
0,"The Hyundai Kona has a length of 4355 mm, a he...",466.0,466.0,
1,"The Renault Austral has a length of 4510 mm, a...",500.0,430.0,
2,"The Kia Sportage has a length of 4515 mm, a he...",591.0,540.0,
3,"The Alfa-Romeo Tonale has a length of 4530 mm,...",500.0,,
4,"The Volkswagen Tiguan has a length of 4540 mm,...",,,
5,"The LynkCo 01 has a length of 4541 mm, a heigh...",466.0,,
6,"The Peugeot 3008 has a length of 4542 mm, a he...",520.0,,
7,The Mitsubishi Eclipse Cross has a length of 4...,404.0,,
8,"The Dacia Jogger has a length of 4547 mm, a he...",565.0,565.0,160.0
9,"The BMW X2 has a length of 4554 mm, a height o...",560.0,,


In [129]:
# df['7-seated'] = df['boot_space_7'].apply(lambda x: 'Yes' if pd.notna(x) else 'No')

# def boot_hybrid(text):
#   if ('-' in text and '(' not in text):
#     match_hybrid = text.split('-')
#     return match_hybrid[0].strip()
#   else:
#     return np.NaN

# def boot_normal(text):
#   if ('-' in text and '(' not in text):
#     match_normal = text.split('-')
#     return match_normal[1].strip()
#   else:
#     return np.NaN

# # def join_boot_5_normal(text)



# df['boot_space_hydbrid'] = df['boot_space'].apply(lambda x: boot_hybrid(x))
# df['boot_space_normal'] = df['boot_space'].apply(lambda x: boot_normal(x))
# df['boot_normal_final'] = df['boot_space_normal'].apply(lambda x: fillna())
# df.head(100)

# Extract dimensions

In [130]:
def extract_dim(text):
  """Extract the model name, dimensions and category from a description.

  Each field is found with its own regular expression. Matched values are
  returned as strings, not numbers; a field whose pattern does not match is
  None.

  Parameters:
    text: the description string. Non-string input (None / NaN) is treated as
      an empty description, so every field comes back as None.

  Returns:
    (values, keys): two lists in the same order, with keys
    name, length, measurement, height, width, ground_clearance, category.
  """
  patterns = {
  'name': r'The\s+(.*?)\s+has',
  "length": r"length\s+of\s+(\d+)\s+mm",
  'measurement': r'measurement\s+of\s+(\d+)\s+millimeters',
  "height": r"height\s+of\s+(\d+)\s+mm",
  "width": r"width\s+of\s+(\d+)\s+mm",
  "ground_clearance": r"ground\s+clearance\s+of\s+(\d+)\s+cm",
  "category": r"category\s+of\s+(.*?)\."
  }

  # re.search raises TypeError on None/NaN, so fall back to an empty string.
  source = text if isinstance(text, str) else ''

  car_info = {}

  for key, pattern in patterns.items():
    match = re.search(pattern, source)
    # Keep the captured text as-is; missing fields are None.
    car_info[key] = match.group(1) if match else None

  return list(car_info.values()), list(car_info.keys())

In [131]:
# The field names do not depend on the input, so read them from an empty
# description; this also works when the frame has no rows.
columns = extract_dim('')[1]
# Parse every description once, then spread the values over one column per
# field, instead of re-running all the regexes once per column.
dimension_rows = df['description'].apply(lambda text: extract_dim(text)[0])
for position, column in enumerate(columns):
  df[column] = dimension_rows.apply(lambda values, position=position: values[position])
df.drop(['boot_space', 'description'], axis = 1).head(10)

Unnamed: 0,boot_space_5,boot_5_hybrid,boot_space_7,name,length,measurement,height,width,ground_clearance,category
0,466.0,466.0,,Hyundai Kona,4355,,1575,1825,16.0,mid-size SUV
1,500.0,430.0,,Renault Austral,4510,2083.0,1618,1825,17.0,mid-size SUV
2,591.0,540.0,,Kia Sportage,4515,,1645,1865,17.0,mid-size SUV
3,500.0,,,Alfa-Romeo Tonale,4530,,1600,1840,,mid-size SUV
4,,,,Volkswagen Tiguan,4540,,1670,1840,,mid-size SUV
5,466.0,,,LynkCo 01,4541,2141.0,1694,1857,21.0,mid-size SUV
6,520.0,,,Peugeot 3008,4542,2108.0,1641,1895,,mid-size SUV
7,404.0,,,Mitsubishi Eclipse Cross,4545,,1685,1805,19.0,mid-size SUV
8,565.0,565.0,160.0,Dacia Jogger,4547,2007.0,1632,1784,20.0,mid-size SUV
9,560.0,,,BMW X2,4554,2104.0,1590,1845,20.0,mid-size SUV


In [135]:
# Split the name at its first run of whitespace: the part before is the brand,
# everything after is the model.
pattern = r"^(.*?)\s+(.*)$"
brand_and_model = df['name'].str.extract(pattern, expand=True)
df[['brand', 'model']] = brand_and_model

# Keep only these columns, in this order.
ordered_columns = [
  'brand', 'model', 'category',
  'boot_space_5', 'boot_5_hybrid', 'boot_space_7',
  'length', 'measurement', 'height', 'width', 'ground_clearance',
]
df = df[ordered_columns]

df.head(10)

Unnamed: 0,brand,model,category,boot_space_5,boot_5_hybrid,boot_space_7,length,measurement,height,width,ground_clearance
0,Hyundai,Kona,mid-size SUV,466.0,466.0,,4355,,1575,1825,16.0
1,Renault,Austral,mid-size SUV,500.0,430.0,,4510,2083.0,1618,1825,17.0
2,Kia,Sportage,mid-size SUV,591.0,540.0,,4515,,1645,1865,17.0
3,Alfa-Romeo,Tonale,mid-size SUV,500.0,,,4530,,1600,1840,
4,Volkswagen,Tiguan,mid-size SUV,,,,4540,,1670,1840,
5,LynkCo,01,mid-size SUV,466.0,,,4541,2141.0,1694,1857,21.0
6,Peugeot,3008,mid-size SUV,520.0,,,4542,2108.0,1641,1895,
7,Mitsubishi,Eclipse Cross,mid-size SUV,404.0,,,4545,,1685,1805,19.0
8,Dacia,Jogger,mid-size SUV,565.0,565.0,160.0,4547,2007.0,1632,1784,20.0
9,BMW,X2,mid-size SUV,560.0,,,4554,2104.0,1590,1845,20.0


# Imputing missing values of measurement.

In [None]:
# 'measurement' minus 'width'. Both columns hold regex-captured strings, so
# both are converted to numbers first; rows missing either value become NaN.
df['mirrors'] = (
  pd.to_numeric(df['measurement'], errors='coerce')
  - pd.to_numeric(df['width'], errors='coerce')
)

In [None]:
# 'mirrors' as a fraction of 'width'; width holds strings, so convert it first.
df['mirror_rate'] = (
  pd.to_numeric(df['mirrors'], errors = 'coerce')
  / pd.to_numeric(df['width'], errors = 'coerce')
)
# Turn placeholder strings into real missing values (np.nan: the np.NaN alias
# was removed in NumPy 2.0).
df['measurement']  = df['measurement'].replace('', np.nan)
df['ground_clearance']  = df['ground_clearance'].replace('', np.nan)
# NOTE(review): 'boot_mean' and 'boot_normal' are not created anywhere in the
# visible notebook; they are cleaned only if present - confirm whether these
# names should be replaced by the current boot_* columns.
for boot_column in ('boot_mean', 'boot_normal'):
  if boot_column in df.columns:
    df[boot_column] = df[boot_column].replace('None', np.nan)

In [None]:
# Fill missing mirror rates with the column mean. The result is assigned back
# because an in-place fillna on a selected column may not update `df`.
mirror_rate = pd.to_numeric(df['mirror_rate'], errors='coerce')
df['mirror_rate'] = mirror_rate.fillna(mirror_rate.mean())
# Estimate a missing measurement as width * (1 + mirror_rate), rounded to a
# whole number. width holds strings, so convert it before multiplying.
estimated_measurement = (
  pd.to_numeric(df['width'], errors='coerce') * (1 + df['mirror_rate'])
).round(0)
df['measurement'] = pd.to_numeric(df['measurement'], errors='coerce').fillna(estimated_measurement)

# Extract brand and model

In [None]:
# Split at the first run of whitespace: brand before it, model after it.
pattern = r"^(.*?)\s+(.*)$"

# The earlier column selection keeps 'brand' and 'model' but drops 'name', so
# only extract again when 'name' is still present (avoids a KeyError).
if 'name' in df.columns:
  df[['brand', 'model']] = df['name'].str.extract(pattern, expand=True)

In [None]:
# Display the current column labels of df.
df.columns

In [None]:
# Preferred final column order.
# NOTE(review): 'boot_normal' and 'boot_mean' are not created anywhere in the
# visible notebook, so selecting them unconditionally raises KeyError. The
# boot_* columns that do exist are listed as well, and only columns actually
# present are selected - confirm the intended final set.
final_columns = ['brand', 'model', 'category', 'length', 'height', 'width', 'measurement',
       'boot_normal', 'boot_mean', 'boot_space_5', 'boot_5_hybrid', 'boot_space_7',
       'ground_clearance', 'mirrors', 'mirror_rate']
df = df[[column for column in final_columns if column in df.columns]]
df.head(100)