# In [17]:  (Jupyter cell prompt left over from the notebook export)
#########################################################

from bs4 import BeautifulSoup
import requests
import random
import time
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import os
import sys
import numpy as np
import pandas as pd
import regex as re
import urllib3
requests.packages.urllib3.disable_warnings() 

# Listing page used as the entry point of the crawl.
url='https://www.immoabita.be/en/List/Page/16?pageId=0&sort=None'

# Start a Chrome session and load the entry page.
driver = webdriver.Chrome()
driver.get(url)
# Disabled alternative setup (custom Chrome profile and explicit chromedriver path):
#options.add_argument('user-data-dir=chrome_dir_final')
#driver = webdriver.Chrome(options=options,executable_path= '/home/becode/anaconda3/bin/chromedriver')
# Selenium implicit wait of 30 seconds for element lookups made through the driver.
driver.implicitly_wait(30)
#driver.get(url)

# Parse the page that is currently loaded. The functions below create their own
# local ``soup`` objects, so this module-level one is not read by them.
soup=BeautifulSoup(driver.page_source, 'html.parser')

#Get web links of all properties
def get_property_links(url, driver, pages=20):
    """Collect the detail-page URLs listed on consecutive result pages.

    Args:
        url: URL of the first result page.
        driver: Selenium WebDriver used to load the pages.
        pages: Maximum number of result pages to read.

    Returns:
        list[str]: Absolute property URLs in the order they were found.
        Crawling stops early when a page offers no pagination link.
    """
    base_url = 'https://www.immoabita.be'
    prop_links = []
    driver.get(url)
    for page_no in range(pages):
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # Only anchors that really carry an href produce a link; an anchor
        # without one is skipped instead of re-adding the previous link.
        prop_links.extend(
            base_url + card['href']
            for card in soup.find_all("a", class_="estate-card")
            if card.has_attr("href")
        )
        # Random pause between page loads to avoid hammering the site.
        time.sleep(np.random.lognormal(0, 1))
        if page_no >= pages - 1:
            # Last requested page: no further navigation needed.
            break
        pager_links = [
            base_url + anchor['href']
            for anchor in soup.find_all("a", class_="list-pagging__link")
            if anchor.has_attr("href")
        ]
        if not pager_links:
            # No pagination link on this page, so there is nothing more to read.
            break
        # NOTE(review): the last pagination anchor is treated as the
        # "next page" link, as before -- confirm against the site markup.
        driver.get(pager_links[-1])

    return prop_links

#Parse html page
def get_html_data(url, driver):
    """Load *url* in the browser and return the page parsed with BeautifulSoup.

    Args:
        url: Address of the page to open.
        driver: Selenium WebDriver that performs the navigation.

    Returns:
        BeautifulSoup: Parse tree of the driver's page source after a
        random pause.
    """
    driver.get(url)
    # Randomised pause after the navigation, before the page source is read.
    pause_seconds = np.random.lognormal(0,1)
    time.sleep(pause_seconds)
    page_html = driver.page_source
    return BeautifulSoup(page_html, 'html.parser')

#Flatten the url list
def flatten_list(prop_links):
    """Return a single list holding every item of every sub-iterable, in order.

    Args:
        prop_links: Iterable of iterables (for example a list of link lists).

    Returns:
        list: The items of all sub-iterables concatenated one level deep.
    """
    return [entry for group in prop_links for entry in group]

#Get Property Id
def get_propid(url):
    """Build the one-row "Id" record of a property from its detail URL.

    The id is the last path segment of the URL. A trailing slash is ignored
    so that ``.../123/`` yields ``123`` rather than an empty id.

    Args:
        url: Detail-page URL of the property.

    Returns:
        pandas.DataFrame with columns ``Property`` and ``Value`` holding the
        single row ``["Id", <id>]``, or ``numpy.nan`` when *url* is not a
        string.
    """
    try:
        prop_id = url.rstrip('/').split('/')[-1]
    except (AttributeError, TypeError):
        # Non-string input (e.g. None or bytes): keep the NaN placeholder.
        return np.nan
    return pd.DataFrame([["Id", prop_id]], columns=['Property', 'Value'])

#Get Property Locality
def get_locality(soup):
    """Build the one-row "Locality" record from a parsed property page.

    Args:
        soup: Parsed page exposing ``find``; the text of the
            ``<span class="block-xs">`` element it returns is used.

    Returns:
        pandas.DataFrame with columns ``Property`` and ``Value`` holding the
        single row ``["Locality", <text>]``, or the string ``'None'`` when the
        element cannot be read.
    """
    try:
        locality = soup.find("span", class_="block-xs").get_text()
    except (AttributeError, TypeError):
        # find() returned None (element missing) or soup is not a parse tree.
        return 'None'
    return pd.DataFrame([["Locality", locality]], columns=['Property', 'Value'])

#Get property price
def get_price(soup):
    """Build the one-row "Price" record from a parsed property page.

    Args:
        soup: Parsed page exposing ``find``; the text of the
            ``<span class="text-nowrap">`` element it returns is used.

    Returns:
        pandas.DataFrame with columns ``Property`` and ``Value`` holding the
        single row ``["Price", <text>]``, or ``numpy.nan`` when the element
        cannot be read.
    """
    try:
        price = soup.find("span", class_="text-nowrap").get_text()
    except (AttributeError, TypeError):
        # find() returned None (element missing) or soup is not a parse tree.
        return np.nan
    return pd.DataFrame([["Price", price]], columns=['Property', 'Value'])

#Get other attributes from html table
def get_others(url):
    """Download a property page and stack all of its HTML tables.

    Args:
        url: Detail-page URL of the property.

    Returns:
        pandas.DataFrame containing the rows of every table found on the
        page, with columns ``0`` and ``1`` renamed to ``Property`` and
        ``Value``; or the string ``'None'`` when the page cannot be fetched
        or contains no parsable table.
    """
    # Local import: the file's import block does not provide StringIO.
    from io import StringIO

    try:
        # NOTE(security): verify=False disables TLS certificate validation.
        # Kept as in the original; review whether it is really required.
        # The timeout prevents one unresponsive page from hanging the crawl.
        r = requests.get(url, verify=False, timeout=30)
        # read_html parses every <table> of the page into a list of frames.
        # The markup is wrapped in StringIO because passing literal HTML
        # strings is deprecated in recent pandas versions.
        df_list = pd.read_html(StringIO(r.text))
        # DataFrame.append was removed in pandas 2.0; concat stacks the
        # tables the same way (row labels are kept as they were).
        stacked = pd.concat(df_list) if df_list else pd.DataFrame()
        return stacked.rename(columns={0: 'Property', 1: 'Value'})
    except Exception:
        # Best effort: network and parsing failures give the placeholder
        # instead of aborting the whole crawl.
        return 'None'

# Output column -> label of the scraped (Property, Value) row that feeds it.
_OUTPUT_COLUMN_SOURCES = {
    'Property Id': 'Id',
    'Locality': 'Locality',
    'Type of Property': 'Category',
    'Price': 'Price',
    'Number of Rooms': 'Number of bedrooms',
    'Area': 'Habitable surface',
    'Terrace': 'Terrace',
    'Terrace Area': 'Terrace 1 (surface)',
    'Garden': 'Garden',
    'Surface of the land': 'Ground surface',
    'Swimming Pool': 'Pool',
}


def _build_property_row(parts):
    """Merge (Property, Value) tables into one {output column: value} record."""
    # Helpers signal failure with np.nan or 'None'; only real tables that
    # carry both expected columns can contribute values.
    frames = [
        part[['Property', 'Value']]
        for part in parts
        if isinstance(part, pd.DataFrame)
        and {'Property', 'Value'}.issubset(part.columns)
    ]
    if not frames:
        return dict.fromkeys(_OUTPUT_COLUMN_SOURCES, np.nan)
    table = pd.concat(frames, ignore_index=True)
    # Rows without a label cannot be looked up; when a label occurs more
    # than once the first occurrence wins.
    table = table.dropna(subset=['Property'])
    table = table.drop_duplicates(subset='Property', keep='first')
    values = table.set_index('Property')['Value']
    return {
        column: values.get(source, np.nan)
        for column, source in _OUTPUT_COLUMN_SOURCES.items()
    }


#Prepare final data
def get_property_data(driver,prop_links):
    """Scrape every property page and assemble one row per property.

    Args:
        driver: Selenium WebDriver used to load each property page.
        prop_links: Iterable of property detail-page URLs.

    Returns:
        pandas.DataFrame with one row per link (index 0..n-1) and the columns
        'Property Id', 'Locality', 'Type of Property', 'Price',
        'Number of Rooms', 'Area', 'Terrace', 'Terrace Area', 'Garden',
        'Surface of the land' and 'Swimming Pool'. An attribute that could
        not be scraped is NaN. With no links the frame is empty but still
        has these columns.
    """
    rows = []
    for link in prop_links:
        soup = get_html_data(link, driver)
        rows.append(_build_property_row([
            get_propid(link),
            get_locality(soup),
            get_price(soup),
            get_others(link),
        ]))
    # Rows are built once at the end; DataFrame.append no longer exists in
    # pandas 2.x.
    return pd.DataFrame(rows, columns=list(_OUTPUT_COLUMN_SOURCES))

#Call functions to prepare data
try:
    prop_links = get_property_links(url, driver, pages=3)

    prop_data = get_property_data(driver,prop_links)
finally:
    # quit() ends the WebDriver session and its browser process, and runs
    # even when scraping fails, so no browser is left behind.
    driver.quit()

#Write data to csv
prop_data.to_csv('ImmoWeb_Property_Sale_Data.csv', index = False, encoding = "UTF-8")