#### Web Scraping : Using Selenium and BeautifulSoup

Web scraping is the practice of automatically retrieving the content of user-facing web pages, analyzing them, and extracting/structuring useful information.
##### Steps of Web Scraping : 
1. Importing and installing the necessary libraries and web drivers (a web driver is the bridge between Selenium and Chrome or any other browser).
2. Understanding Basic Selenium methods and BeautifulSoup methods to get the required data.
3. Converting to Pandas Dataframe and CSV.

In [1]:
#importing libraries
from selenium import webdriver #selenium is used to automate the browser to do certain tasks
from bs4 import BeautifulSoup #HTML parsing library used for web scraping
import pandas as pd

In [2]:
# Parallel column accumulators: entry k of every list describes the same listing.
# Each name is bound to its own, initially empty, list.
prices, beds, baths, sizes, addresses = [], [], [], [], []

In [3]:
def _text_or_missing(tag):
    """Return ``tag.text`` when the tag exists and has text, else ``'missing'``."""
    if tag and tag.text:
        return tag.text
    return 'missing'


def _as_number(text):
    """Return ``text`` parsed as a float, or ``None`` when it is not numeric."""
    try:
        return float(text)
    except ValueError:
        return None


def scraping(page_numbers):
    """Scrape one realtor.com New York results page into the module-level lists.

    Loads the results page in Chrome, parses the page source, and for every
    property card that has a bath count and at least 2 beds appends one entry
    to each of ``beds``, ``baths``, ``prices``, ``sizes`` and ``addresses``.
    Price, size and address fall back to ``'missing'`` so the five lists
    always stay the same length.

    Args:
        page_numbers: Number of the results page to fetch; it is converted
            with ``str`` and appended to the search URL.
    """
    url = "https://www.realtor.com/realestateandhomes-search/New-York_NY/pg-" + str(page_numbers)

    # The web driver opens a browser window; always shut it down again so
    # repeated calls do not pile up browser/driver processes.
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        content = driver.page_source  # page source as a string
    finally:
        driver.quit()

    # html.parser allows finding by tag name / attributes and reading text.
    soup = BeautifulSoup(content, features='html.parser')
    for element in soup.find_all('li', attrs={'class': 'component_property-card'}):
        price = element.find('span', attrs={'data-label': 'pc-price'})
        bed = element.find('li', attrs={'data-label': 'pc-meta-beds'})
        bath = element.find('li', attrs={'data-label': 'pc-meta-baths'})
        size = element.find('li', attrs={'data-label': 'pc-meta-sqft'})
        address = element.find('div', attrs={'data-label': 'pc-address'})

        if not (bed and bath):
            continue

        nr_beds = bed.find('span', attrs={'data-label': 'meta-value'})
        nr_baths = bath.find('span', attrs={'data-label': 'meta-value'})
        if not (nr_beds and nr_baths):
            continue

        # A non-numeric bed value must not abort the whole scrape; such a
        # card is skipped just like one with fewer than 2 beds.
        bed_count = _as_number(nr_beds.text)
        if bed_count is None or not bed_count >= 2:
            continue

        beds.append(nr_beds.text)
        baths.append(nr_baths.text)
        prices.append(_text_or_missing(price))
        sizes.append(_text_or_missing(size))
        addresses.append(_text_or_missing(address))

In [4]:
# Scrape result pages 1 through page_numbers inclusive. range() excludes its
# stop value, so the stop is page_numbers + 1; otherwise the last page is skipped.
page_numbers = 20
for i in range(1, page_numbers + 1):
    scraping(i)

# Assemble the parallel lists into one table and save it as CSV without the index column.
df = pd.DataFrame({'Address': addresses, 'Price': prices, 'Beds': beds, 'Baths': baths, 'Sizes': sizes})
df.to_csv('scraped_data2.csv', index=False, encoding='utf-8')