#### Web Scraping

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

In [2]:
# get url and parse it

url='https://turbo.az'
page=requests.get(url)
html=BeautifulSoup(page.content, 'html.parser')
cars_info=html.find_all('div', class_='products-i__bottom')
cars=re.sub('<[^>]+>', ' ', str(cars_info[0]))
cars

'   95 000  $    Porsche Cayenne  2021, 3.0 L, 7 000 km  Bakı, 14.05.2023 13:28  '

In [4]:
# create dataframe
data=pd.DataFrame()

In [5]:
# scrape the pages and add them to the dataframe

for n in range(1, 1500):
    url1='https://turbo.az/autos/vip?page=9&q%5Bmake%5D%5B%5D='
    page=requests.get(url1)
    html=BeautifulSoup(page.content, 'html.parser')
    cars_info=html.find_all('div', class_='products-i__bottom')
    for i in range(len(cars_info)):
        cars=re.sub('<[^>]+>', ' ', str(cars_info[i]))
        price=cars.split('  ')[1].replace(' ', '')
        currency=cars.split('  ')[2]
        make_model=cars.split('  ')[4]
        year=cars.split('  ')[5].split(',')[0]
        engine=cars.split('  ')[5].split(',')[1].strip()
        driven=cars.split('  ')[5].split(',')[2].strip()
        turbo=turbo.append({
            'Make_model': make_model,
            'Price':price,
            'Currency': currency,
            'Engine': engine,
            'Driven_km': driven,
            'Production_year': year
        }, ignore_index=True)

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,Make_model,Price,Currency,Engine,Driven_km,Production_year
0,0,Kia Rio,23500,AZN,1.6 L,106 000 km,2018
1,1,Mercedes C 300,23800,AZN,3.0 L,198 000 km,2011
2,2,Mercedes E 220,45500,$,2.2 L,70 600 km,2019
3,3,Mercedes E 220,38900,$,2.0 L,153 000 km,2017
4,4,Porsche Panamera 4S,76500,$,2.9 L,41 000 km,2018


### Data Preprocessing

In [4]:
data['Make']=data['Make_model'].str.split(' ', expand=True)[0]
data['Model']=data['Make_model'].str.split(' ', expand=True)[1]
data['Make']=np.where(data['Make']=='Mercedes', 'Mercedes-Benz', data['Make'])

In [7]:
data

Unnamed: 0.1,Unnamed: 0,Make_model,Price,Currency,Engine,Driven_km,Production_year,Make,Model
0,0,Kia Rio,23500,AZN,1.6 L,106 000 km,2018,Kia,Rio
1,1,Mercedes C 300,23800,AZN,3.0 L,198 000 km,2011,Mercedes-Benz,C
2,2,Mercedes E 220,45500,$,2.2 L,70 600 km,2019,Mercedes-Benz,E
3,3,Mercedes E 220,38900,$,2.0 L,153 000 km,2017,Mercedes-Benz,E
4,4,Porsche Panamera 4S,76500,$,2.9 L,41 000 km,2018,Porsche,Panamera
...,...,...,...,...,...,...,...,...,...
35971,35971,Toyota Camry,30900,$,2.5 L,40 000 km,2020,Toyota,Camry
35972,35972,Volvo XC60,17900,$,3.0 L,204 559 km,2012,Volvo,XC60
35973,35973,Mercedes E 260,18500,AZN,2.6 L,313 070 km,2004,Mercedes-Benz,E
35974,35974,Kia Cerato,22800,AZN,2.0 L,113 895 km,2018,Kia,Cerato


In [8]:
# get url for change currence from $ to man
url='https://www.cbar.az/'
page1=requests.get(url)
html1=BeautifulSoup(page1.content, 'html.parser')
currencies=html1.find_all('div', class_='relize_item')
cur=re.sub('<[^>]+>', ' ', str(currencies))
cur

'[  16.05.2023  Valyuta hərracı barədə  ,   12.05.2023  Azərbaycan Respublikasının Mərkəzi Bankı tərəfindən “Maliyyə bazarlarında kibertəhlükəsizlik Strategiyası” qəbul edilmişdir  ,   11.05.2023  Valyuta hərracı barədə  ,   16.05.2023 tarixindən etibarən   1 USD - 1.7000    1 EUR - 1.8499    1 RUB - 0.0213    1 TRY - 0.0864     ,   04.05.2023 tarixindən etibarən   Aşağı həddi - 7.5%    Yuxarı həddi - 10.0%    Uçot faiz dərəcəsi - 9.0%   ]'

In [9]:
cur_usd=float(cur[cur.find('USD')+5:cur.find('1 EUR')].replace(' ', ''))
print('USD: ',cur_usd)

USD:  1.7


In [10]:
data['Currency'].unique()

array(['AZN', '$'], dtype=object)

In [11]:
data['Price']=pd.to_numeric(data['Price'])

In [12]:
data['Price in AZN']=data.loc[data['Currency']=='$', 'Price in AZN']=data['Price'] * cur_usd

In [13]:
data.loc[data['Currency']=='AZN', 'Price in AZN']=data['Price']

In [15]:
data.drop(['Unnamed: 0','Make_model','Price','Currency'], axis=1, inplace=True)

In [16]:
data['Driven_km']=data['Driven_km'].str.split(' ', expand=True)[0]

In [17]:
data['Engine']=data['Engine'].str.split(' ', expand=True)[0]

In [18]:
data

Unnamed: 0,Engine,Driven_km,Production_year,Make,Model,Price in AZN
0,1.6,106,2018,Kia,Rio,23500.0
1,3.0,198,2011,Mercedes-Benz,C,23800.0
2,2.2,70,2019,Mercedes-Benz,E,77350.0
3,2.0,153,2017,Mercedes-Benz,E,66130.0
4,2.9,41,2018,Porsche,Panamera,130050.0
...,...,...,...,...,...,...
35971,2.5,40,2020,Toyota,Camry,52530.0
35972,3.0,204,2012,Volvo,XC60,30430.0
35973,2.6,313,2004,Mercedes-Benz,E,18500.0
35974,2.0,113,2018,Kia,Cerato,22800.0
