# Notebook for collecting cars' data from 'joautok.hu' (Web scraping)

### Second step: import the packages and modules
#### csv is for writing/saving CSV files
#### bs4 is for pulling data out of HTML
#### requests is for requesting and fetching the URL
#### re, json and io.StringIO are imported as well (note: yfinance, for mutual-fund data, is not imported in this notebook)

In [48]:
import re
import csv
import json
from io import StringIO
from bs4 import BeautifulSoup
import requests

### The next step is to fetch the URL
#### - First, define the URL.
#### - Then fetch the data from that URL and store it in the 'response' variable.
#### - With the 'response' variable we can inspect the result of the get() call.
#### - Code=Reason: 200=OK; 404=Not Found; 403=Forbidden

In [49]:
# Listing page of used cars that this notebook scrapes.
url_joautok = 'https://joautok.hu/hasznaltauto?page=2367'
# requests applies no timeout by default, so an unresponsive server would
# block this cell forever; give up after 30 seconds instead.
response = requests.get(url_joautok, timeout=30)

In [50]:
# Echo the Response object; its repr shows the HTTP status code.
response

<Response [200]>

In [51]:
# Textual reason phrase that accompanies the status code.
response.reason

'OK'

### BeautifulSoup gives us a BeautifulSoup object, which represents the document at the URL as a nested data structure

In [52]:
# Parse the downloaded page with the 'html.parser' backend.
page_html = response.text
soup = BeautifulSoup(page_html, features='html.parser')

### Find every 'a' tag with class='item'
#### Use len(cars) to check the number of cars found.

In [53]:
# Every listing is an <a> element carrying the CSS class 'item'.
cars = soup.find_all('a', class_='item')

In [54]:
# Number of listing anchors that were found.
len(cars)

14283

### In this section we collect the main data of a single car (the second one in the list).

In [55]:
# Use the listing at index 1 as the sample for the cells below.
car = cars[1]
# The <h2> heading contains non-breaking spaces; swap them for plain spaces.
heading_text = car.h2.text
brand_mf = heading_text.replace(u'\xa0', u' ')
brand_mf

'PEUGEOT  308'

In [56]:
# Text of the listing's <h3>. Stored as car_type so that the builtin type()
# is not overwritten for the rest of the notebook session.
car_type = car.h3.text
car_type

'308 SW 1.6 BlueHDi Allure'

In [57]:
# Dealer name: text of the 'dealership-name' div without surrounding whitespace.
name_div = car.find('div', class_='dealership-name')
dealership_name = name_div.text.strip()
dealership_name

'Globe-Auto Kft.'

In [58]:
# Dealer location: text of the 'city' div without surrounding whitespace.
city_div = car.find('div', class_='city')
dealership_loc = city_div.text.strip()
dealership_loc

'(Mezőkövesd)'

In [59]:
# Dealer rating label: text of the 'evaluation' div without surrounding whitespace.
evaluation_div = car.find('div', class_='evaluation')
dealership_evaluation = evaluation_div.text.strip()
dealership_evaluation

'Minősített Prémium kereskedő'

In [60]:
# The year value is the <b> inside the first <span> of the 'year-odo' div.
year_odo_div = car.find('div', class_='year-odo')
year = year_odo_div.span.b.text
year

'2018. 03.'

In [61]:
# Odometer reading: the first 'dotted' span inside the 'year-odo' div.
odo_container = car.find('div', class_='year-odo')
km = odo_container.find('span', class_='dotted').text
km

'127 440 km'

In [62]:
# Asking price: text of the 'price' div without surrounding whitespace.
price_div = car.find('div', class_='price')
price = price_div.text.strip()
price

'4 199 000 Ft'

#### Information such as the number of doors, the type of transmission, power etc. is all text inside 'span' tags with the class 'dotted'.
#### We find all of these tags and take their 'text' part.

In [63]:
# Every extra attribute of the listing is a <span class="dotted">.
infos = car.find_all('span', 'dotted')
# Keep only the text of each span, in the order find_all returned them.
infos_list = [info_span.text for info_span in infos]
infos_list

['127 440 km',
 '120 LE',
 '1560 cm3',
 'Dízel',
 'Kombi',
 'Használt',
 'Eladó',
 '5 ajtós',
 '5 fő',
 'Manuális']

### With the get_min_cardata function we collect every main field (same as the previous section, just in one function).

In [64]:
def _node_text(node, strip=False):
    """Return ``node.text`` (optionally stripped), or '' when ``node`` is None."""
    if node is None:
        return ''
    return node.text.strip() if strip else node.text


def get_min_cardata(car):
    """Collect the headline fields of one car listing.

    Args:
        car: Parsed listing element (a BeautifulSoup tag in this notebook).
            It must expose ``h2``, ``h3`` and ``find(name, css_class)``.

    Returns:
        tuple: ``(brand_mf, type, dealership_name, dealership_loc,
        dealership_evaluation, year, km, price)``, all strings. A field
        whose element is absent from the listing is returned as ``''``
        rather than raising ``AttributeError``, so a single incomplete
        listing cannot abort a whole scraping run.
    """
    # Look the 'year-odo' container up once; both year and km live inside it.
    year_odo = car.find('div', 'year-odo')
    if year_odo is None:
        year_node = km_node = None
    else:
        year_span = year_odo.span
        year_node = None if year_span is None else year_span.b
        km_node = year_odo.find('span', 'dotted')

    # The heading contains non-breaking spaces; turn them into plain spaces.
    brand_mf = _node_text(car.h2).replace('\xa0', ' ')
    # Local is called car_type so the builtin ``type`` is not shadowed.
    car_type = _node_text(car.h3)

    return (
        brand_mf,
        car_type,
        _node_text(car.find('div', 'dealership-name'), strip=True),
        _node_text(car.find('div', 'city'), strip=True),
        _node_text(car.find('div', 'evaluation'), strip=True),
        _node_text(year_node),
        _node_text(km_node),
        _node_text(car.find('div', 'price'), strip=True),
    )

### Same as the cell two above (collecting the 'dotted' span texts), just in one function.

In [65]:
def get_infos(car):
    """Return the text of every 'dotted' span inside a car listing.

    Args:
        car: Parsed listing element (a BeautifulSoup tag in this notebook).
            It must expose ``find_all(name, css_class)``.

    Returns:
        list: The ``.text`` of each matching span, in the order ``find_all``
        returned them; an empty list when the listing has none.
    """
    return [span.text for span in car.find_all('span', 'dotted')]

### Get every data and information with a for loop.

In [66]:
# One entry per listing: the headline tuple and the list of 'dotted' texts.
# (A filter on the number of 'dotted' spans, >= 9, was sketched here earlier
# but is not active.)
carsdata, carsinfo = [], []
for car in cars:
    carsdata.append(get_min_cardata(car))
    carsinfo.append(get_infos(car))

### Save the main data and the additional information into two different CSV files.

In [67]:
# Header row followed by one row per listing, written in a single call.
carsdata_header = ['brand_mf', 'type', 'dealership_name', 'dealership_loc', 'dealership_evaluation', 'year', 'km', 'price']
with open('min_carsdata.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows([carsdata_header, *carsdata])

In [68]:
# NOTE(review): the header has a single 'infos' column, while every data row
# holds one column per 'dotted' span of its listing - confirm this is intended.
carsinfo_header = ['infos']
with open('min_carsinfo.csv', mode='w', newline='', encoding='utf-8') as csv_file:
    csv.writer(csv_file).writerows([carsinfo_header, *carsinfo])