<a href="https://colab.research.google.com/github/tazar09/cars_dimensions/blob/main/midsize_07-feb2024_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import numpy as np

import warnings
warnings.filterwarnings('ignore')
import time

# Scrapping the website

In [2]:
url = 'https://www.automobiledimension.com/mid-size-suv.php'
h3 = requests.get(url, verify = False)
html_content = h3.text

# Creating BeautifulSoup objects

In [3]:
soup = BeautifulSoup(html_content, 'html.parser')
main_table = soup.find('main')

In [4]:
temp_list = []
for i in main_table.find_all('div', class_ = 'unit'):
  link = str(i.a['href'])
  temp_list.append('https://www.automobiledimension.com' + link)
temp_list[0:3]

['https://www.automobiledimension.com/model/hyundai/kona',
 'https://www.automobiledimension.com/model/renault/austral',
 'https://www.automobiledimension.com/model/kia/sportage']

# Extract model description

In [5]:
def extract_main_text(text):
  search = text.find('div', class_ = 'interior-text')
  return search.text

In [6]:
def extract_boot_space(text):
  search = text.find_all('figcaption')[1]
  boot_space = search.text
  return boot_space

In [7]:
main_text_list = []
boot_space_list = []

try:
  for i in temp_list:
    content = requests.get(i, verify = False)
    time.sleep(3)
    content = BeautifulSoup(content.text, 'html.parser')
    main_text_list.append(extract_main_text(content))
    boot_space_list.append(extract_boot_space(content))
except:
  main_text_list.append(None)
  boot_space_list.append(None)

# Creating dataframe

In [None]:
df = pd.DataFrame({'description': main_text_list, 'boot_space': boot_space_list})
# df.head(100)

# Cleaning the dataframe

In [None]:
df['boot_space'] = df['boot_space'].str.replace('Boot space: ', '')
df['boot_space'] = df['boot_space'].str.replace(' liters.','')
df['boot_space'] = df['boot_space'].str.replace('*','')
df.head(100)

In [18]:
def extract_5_seats_boot(text):
  if '(5)' in text:
    pattern = r'\(5\)\s(\d+)'
    match5 = re.search(pattern, text)
    return match5.group(1)
  else:
    return np.NaN

def extract_7_seats_boot(text):
  if '(7) ' in text:
    pattern = r'\(7\)\s(\d+)'
    match7 = re.search(pattern, text)
    if match7:
      return match7.group(1)
    else:
      return np.NaN
  else:
    return np.NaN

def extract_hybrid(text):
    if '-' in text and '(' not in text:
      boot_hybrid, boot_space = text.split('-')

      return average, int(min)
    else:
      return text, text

In [None]:
df['boot_space_5'] = df['boot_space'].apply(lambda x: extract_5_seats_boot(x))
df['boot_space_7'] = df['boot_space'].apply(lambda x: extract_7_seats_boot(x))
df.head(100)

In [40]:
df['7-seated'] = df['boot_space_7'].apply(lambda x: 'Yes' if pd.notna(x) else 'No')

def boot_hybrid(text):
  if ('-' in text and '(' not in text):
    match_hybrid = text.split('-')
    return match_hybrid[0].strip()
  else:
    return np.NaN

def boot_normal(text):
  if ('-' in text and '(' not in text):
    match_normal = text.split('-')
    return match_normal[1].strip()
  else:
    return np.NaN

# def join_boot_5_normal(text)



df['boot_space_hydbrid'] = df['boot_space'].apply(lambda x: boot_hybrid(x))
df['boot_space_normal'] = df['boot_space'].apply(lambda x: boot_normal(x))
df['boot_normal_final'] = df['boot_space_normal'].apply(lambda x: fillna())
df.head(100)

Unnamed: 0,description,boot_space,boot_space_5,boot_space_7,7-seated,boot_space_hydbrid,boot_space_normal
0,"The Hyundai Kona has a length of 4355 mm, a he...",493 - 466,,,No,493.0,466.0
1,"The Renault Austral has a length of 4510 mm, a...",430 - 500,,,No,430.0,500.0
2,"The Kia Sportage has a length of 4515 mm, a he...",540 - 591,,,No,540.0,591.0
3,"The Alfa-Romeo Tonale has a length of 4530 mm,...",500,,,No,,
4,"The Volkswagen Tiguan has a length of 4540 mm,...",,,,No,,
5,"The LynkCo 01 has a length of 4541 mm, a heigh...",466,,,No,,
6,"The Peugeot 3008 has a length of 4542 mm, a he...",520,,,No,,
7,The Mitsubishi Eclipse Cross has a length of 4...,404,,,No,,
8,"The Dacia Jogger has a length of 4547 mm, a he...",(5) 565-708 - (7) 160,565.0,160.0,Yes,,
9,"The BMW X2 has a length of 4554 mm, a height o...",560,,,No,,


In [13]:
# extracting for all cases, compact and midsize, so needed trees
def average(text):
    if '-' in text and '(' not in text:
      min, max = text.split('-')
      average = (int(max) + int(min))/2
      return average, int(min)
    else:
      return text, text

In [14]:
df['boot_space'] = df['boot_space'].apply(lambda x: average(x)[0])
df.head(100)

Unnamed: 0,description,boot_space,boot_space_5,boot_space_7
0,"The Hyundai Kona has a length of 4355 mm, a he...",479.5,,
1,"The Renault Austral has a length of 4510 mm, a...",465.0,,
2,"The Kia Sportage has a length of 4515 mm, a he...",565.5,,
3,"The Alfa-Romeo Tonale has a length of 4530 mm,...",500,,
4,"The Volkswagen Tiguan has a length of 4540 mm,...",,,
5,"The LynkCo 01 has a length of 4541 mm, a heigh...",466,,
6,"The Peugeot 3008 has a length of 4542 mm, a he...",520,,
7,The Mitsubishi Eclipse Cross has a length of 4...,404,,
8,"The Dacia Jogger has a length of 4547 mm, a he...",(5) 565-708 - (7) 160,565.0,160.0
9,"The BMW X2 has a length of 4554 mm, a height o...",560,,


In [15]:
df['boot_mean'] = df['boot_space'].apply(lambda x: average(x)[0] if len(x) > 3 else int(x))
df['boot_normal'] = df['boot_space'].apply(lambda x: average(x)[1] if len(x) > 3 else int(x))

TypeError: object of type 'float' has no len()

In [None]:
df.drop('boot_space', axis = 1, inplace = True)

In [None]:
df.head()

# Extract dimensions

In [None]:
def extract_dim(text):
  patterns = {
  'name': r'The\s+(.*?)\s+has',
  "length": r"length\s+of\s+(\d+)\s+mm",
  'measurement': r'measurement\s+of\s+(\d+)\s+millimeters',
  "height": r"height\s+of\s+(\d+)\s+mm",
  "width": r"width\s+of\s+(\d+)\s+mm",
  "ground_clearance": r"ground\s+clearance\s+of\s+(\d+)\s+cm",
  "category": r"category\s+of\s+(.*?)\."
  }

  car_info = {}

  for key, pattern in patterns.items():
    match = re.search(pattern, text)
    if match:
      car_info[key] = match.group(1) # Convert matched value to integer
    else:
      car_info[key] = None  # Set missing values to None

  return list(car_info.values()), list(car_info.keys())

In [None]:
columns = extract_dim(df['description'][0])[1]
for i in columns:
  df[i] = df['description'].apply(lambda x: extract_dim(x)[0][columns.index(i)])

In [None]:
# reordering the columns

df = df[['name', 'category','length', 'height',
       'width','measurement', 'boot_normal', 'boot_mean', 'ground_clearance']]

df.head()

In [None]:
numeric_columns = df.columns[2:]
for i in numeric_columns:
  df[i] = df[i].apply(lambda x: np.NaN if (x == None) else int(x))

# Imputing missing values of measurement.

In [None]:
df['mirrors'] = (pd.to_numeric(df['measurement'], errors='coerce') - df['width']).fillna(np.NaN)

In [None]:
df['mirror_rate'] = pd.to_numeric(df['mirrors'], errors = 'coerce') / df['width']
df['measurement']  = df['measurement'].replace('', np.NaN)
df['ground_clearance']  = df['ground_clearance'].replace('', np.NaN)
df['boot_mean']  = df['boot_mean'].replace('None', np.NaN)
df['boot_normal']  = df['boot_normal'].replace('None', np.NaN)

In [None]:
df['mirror_rate'].fillna(np.mean(df['mirror_rate']), inplace=True)
df['measurement'].fillna(round((df['width'] * (1+ df['mirror_rate'])), 0), inplace=True)

# Extract brand and model

In [None]:
# Define a regular expression pattern
pattern = r"^(.*?)\s+(.*)$"

# Extract brand and model using str.extract
df[['brand', 'model']] = df['name'].str.extract(pattern, expand=True)

In [None]:
df.columns

In [None]:
df = df[['brand', 'model', 'category', 'length', 'height', 'width', 'measurement',
       'boot_normal', 'boot_mean', 'ground_clearance', 'mirrors',
       'mirror_rate']]
df.head(100)