In [1]:
import requests
import time, os
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

In [2]:
# Module-level accumulator: get_house_info appends one Advertisement here for
# every listing page it manages to parse.
objects_list = []

In [3]:
class BaseInfo:
    """Headline facts of a listing: location, property type, room counts and price.

    Every field defaults to the literal string 'None' (not the ``None`` object).
    """

    def __init__(self, city='None', district='None', neighborhood='None', prop_type='None', rooms='None', living_room='None', price='None'):
        # Location, from widest to narrowest.
        self.city, self.district, self.neighborhood = city, district, neighborhood
        # What is being sold and how big it is.
        self.prop_type = prop_type
        self.rooms, self.living_room = rooms, living_room
        self.price = price

    def print_info(self):
        """Print all fields on a single line as space-separated ``name:value`` pairs."""
        template = 'city:{} district:{} neighborhood:{} prop_type:{} rooms:{} living_room:{} price:{}'
        values = (
            self.city,
            self.district,
            self.neighborhood,
            self.prop_type,
            self.rooms,
            self.living_room,
            self.price,
        )
        print(template.format(*values))
        
class Advertisement:
    """Complete listing record: a base-info object plus the detail attributes of the advert.

    ``base_info`` must provide a ``print_info()`` method. Every other field
    defaults to the literal string 'None' (not the ``None`` object).
    """

    def __init__(self, base_info, last_edit='None', area='None', prop_floor='None', building_age='None',
                 heating_type='None', building_floors='None', credit_availability='None', furniture_availability='None',
                 bathrooms='None', building_type='None', building_status='None', usage_status='None', deed='None', dues='None',
                 swap='None', student_single='None', facade='None', fuel_type='None'):
        self.base_info = base_info
        # Listing date, size and position inside the building.
        self.last_edit, self.area = last_edit, area
        self.building_age, self.prop_floor = building_age, prop_floor
        # Building characteristics.
        self.heating_type, self.building_floors = heating_type, building_floors
        self.credit_availability = credit_availability
        self.furniture_availability = furniture_availability
        self.bathrooms = bathrooms
        self.building_type, self.building_status = building_type, building_status
        # Occupancy, legal and financial details.
        self.usage_status, self.deed, self.dues = usage_status, deed, dues
        self.swap, self.student_single = swap, student_single
        self.facade, self.fuel_type = facade, fuel_type

    def print_info(self):
        """Print the base info line, then one line with all detail fields as ``name:value`` pairs."""
        self.base_info.print_info()
        template = 'last_edit:{} area:{} prop_floor:{} building_age:{} heating_type:{} building_floors:{} credit_availability:{} furniture_availability:{} bathrooms:{} building_type:{} building_status:{} usage_status:{} deed:{} dues:{} swap:{} student_single:{} facade:{} fuel_type:{}'
        values = (
            self.last_edit, self.area, self.prop_floor, self.building_age,
            self.heating_type, self.building_floors,
            self.credit_availability, self.furniture_availability,
            self.bathrooms, self.building_type, self.building_status,
            self.usage_status, self.deed, self.dues, self.swap,
            self.student_single, self.facade, self.fuel_type,
        )
        print(template.format(*values))

In [4]:
chromedriver = './app/chromedriver' # path to the chromedriver executable
# Also publish the path under the 'webdriver.chrome.driver' environment key.
# The functions below pass `chromedriver` to webdriver.Chrome() directly.
# NOTE(review): it is unclear whether Selenium's Python bindings read this
# variable at all — confirm before relying on it.
os.environ['webdriver.chrome.driver'] = chromedriver

In [5]:
# Site root; listing hrefs and result-page paths are appended to it.
base_url = 'https://www.hurriyetemlak.com'
# Path of the Istanbul search results that get_pages paginates over
# (presumably for-sale listings, "satilik" — confirm against the site).
istanbul_sell_url = '/istanbul-satilik'

In [6]:
def parse_base_inf(ul_short_info_list, price):
    """Build a BaseInfo from a listing's short-info ``<ul>`` and its price element.

    The ``<li>`` items are read by position: 1 = city, 2 = district,
    3 = neighborhood, 5 = property type, 6 = room layout written as
    "rooms+living_room". Item 4 and anything after item 6 are ignored.

    Args:
        ul_short_info_list: Element exposing ``find_all('li')``; each item
            must expose ``.text``.
        price: Element whose ``.text`` holds the price string.

    Returns:
        BaseInfo with whitespace-stripped string fields. Items that are
        missing yield ''. When the room layout has no '+', the whole value is
        kept as ``rooms`` and ``living_room`` is ''.
    """
    items = [li.text.strip() for li in ul_short_info_list.find_all('li')]

    def item_at(position):
        # 1-based lookup that tolerates lists shorter than expected.
        return items[position - 1] if len(items) >= position else ''

    # partition() never raises, unlike unpacking split('+', 1), which blew up
    # with ValueError on layouts that have no '+'.
    rooms, _, living_room = item_at(6).partition('+')

    return BaseInfo(item_at(1), item_at(2), item_at(3), item_at(5),
                    rooms.strip(), living_room.strip(), price.text.strip())

In [7]:
# Label shown on the advert page -> name of the Advertisement field it fills.
_ADV_LABEL_TO_FIELD = {
    'Son Güncelleme Tarihi': 'last_edit',
    'Brüt / Net M2': 'area',
    'Bulunduğu Kat': 'prop_floor',
    'Bina Yaşı': 'building_age',
    'Isınma Tipi': 'heating_type',
    'Kat Sayısı': 'building_floors',
    'Krediye Uygunluk': 'credit_availability',
    'Eşya Durumu': 'furniture_availability',
    'Banyo Sayısı': 'bathrooms',
    'Yapı Tipi': 'building_type',
    'Yapının Durumu': 'building_status',
    'Kullanım Durumu': 'usage_status',
    'Tapu Durumu': 'deed',
    'Aidat': 'dues',
    'Takas': 'swap',
    'Öğrenciye / Bekara': 'student_single',
    'Cephe': 'facade',
    'Yakıt Tipi': 'fuel_type',
}


def parse_adv_info(ul_adv_info_list, base_info):
    """Build an Advertisement from the advert-info ``<ul>`` of a listing page.

    Each ``<li>`` is expected to hold two ``<span>`` elements: a label and its
    value. Known labels are mapped to Advertisement fields, unknown labels are
    ignored, and a label that appears more than once keeps its last value.

    Args:
        ul_adv_info_list: Element exposing ``find_all('li')``, or a falsy
            value when the page has no such list.
        base_info: Object stored unchanged as the advert's ``base_info``.

    Returns:
        Advertisement whose detail fields are stripped strings, '' for every
        field not present on the page.
    """
    fields = dict.fromkeys(_ADV_LABEL_TO_FIELD.values(), '')

    if ul_adv_info_list:
        for li in ul_adv_info_list.find_all('li'):
            spans = li.find_all('span')
            # Rows without a label/value pair previously raised IndexError.
            if len(spans) < 2:
                continue
            field = _ADV_LABEL_TO_FIELD.get(spans[0].text.strip())
            if field is not None:
                fields[field] = spans[1].text.strip()

    # Keyword arguments match the Advertisement.__init__ parameter names.
    return Advertisement(base_info, **fields)

In [8]:
def get_house_info(page_source):
    """Parse one listing page and append the resulting advert to ``objects_list``.

    Nothing is stored unless the page contains the price paragraph, the
    short-info list and the advert-info list.

    Args:
        page_source: HTML of the listing page.
    """
    document = BeautifulSoup(page_source, 'html5lib')
    price_tag = document.find('p', {'class': 'price'})
    short_info = document.find('ul', {'class': 'short-info-list'})
    adv_info = document.find('ul', {'class': 'adv-info-list'})

    # The base info needs both the short-info list and the price.
    if not (short_info and price_tag):
        return
    base_info = parse_base_inf(short_info, price_tag)

    # Listings without the detail table are dropped.
    if not adv_info:
        return
    objects_list.append(parse_adv_info(adv_info, base_info))

In [9]:
def get_house_page(props_list):
    """Open every listing in ``props_list`` in a browser and parse its page.

    Args:
        props_list: Iterable of anchor elements; each item's ``'href'`` is
            appended to ``base_url`` to form the listing URL.

    A fresh Chrome driver is started for the batch. It is always shut down,
    even if a page load or the parsing raises, so browser processes are not
    left behind.
    """
    driver = webdriver.Chrome(chromedriver)
    try:
        for anchor in props_list:
            driver.get(base_url + str(anchor['href']))
            get_house_info(driver.page_source)
    finally:
        # quit() ends the whole session, not just the current window.
        driver.quit()

In [10]:
def get_houses_list(page_source):
    """Collect the listing links on a results page and scrape each of them.

    Anchors with class ``img-link`` are gathered. Those whose href starts with
    ``https:`` are skipped, as are anchors without an href, because
    get_house_page prefixes every href with ``base_url``. The remaining
    anchors are passed to get_house_page.

    Args:
        page_source: HTML of one search-results page.
    """
    soup = BeautifulSoup(page_source, 'html5lib')
    anchors = soup.find_all('a', {'class': 'img-link'})

    # Build a new list instead of removing while iterating: in-place removal
    # skipped the element after each removed anchor, letting consecutive
    # absolute links through.
    props_list = [
        anchor for anchor in anchors
        if anchor.get('href') is not None and not anchor['href'].startswith('https:')
    ]

    get_house_page(props_list)

In [11]:
def get_pages(first_page=1201, last_page=1400):
    """Walk the Istanbul search-result pages and scrape every listing on them.

    Args:
        first_page: Number of the first results page to visit.
        last_page: Number of the last results page to visit (inclusive).
            The defaults reproduce the original fixed range 1201-1400.

    Each page is loaded in a Chrome driver and its HTML is handed to
    get_houses_list. The driver is always shut down at the end, including
    when a page fails.
    """
    driver = webdriver.Chrome(chromedriver)
    try:
        for page_number in range(first_page, last_page + 1):
            link = base_url + istanbul_sell_url + '/?page={}'.format(page_number)
            driver.get(link)
            get_houses_list(driver.page_source)
    finally:
        driver.quit()

In [12]:
# Run the scrape. Parsed adverts accumulate in objects_list as a side effect.
get_pages()

In [13]:
# Dump every scraped advert, followed by a separator line after each record.
separator = '--------------------------------'
for advert in objects_list:
    advert.print_info()
    print(separator)

city:İstanbul district:Eyüpsultan neighborhood:Akşemsettin prop_type:Daire rooms:4 living_room:2 price:670.000 TL
last_edit:9 Gün Önce area:210 m2 prop_floor:4. Kat building_age:7 Yaşında heating_type:Kombi building_floors:4 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:2 building_type:Betonarme building_status:İkinci El usage_status:Ev Sahibi Oturuyor deed:Kat Mülkiyeti dues:35 TL swap:Hayır student_single: facade:Doğu fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Silivri neighborhood:Selimpaşa prop_type:Çiftlik Evi rooms:15 living_room:2 price:17.500.000 TL
last_edit:9 Gün Önce area:850 m2 prop_floor:Villa Katı building_age:25 Yaşında heating_type:Kombi building_floors:4 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:5 building_type:Betonarme building_status:İkinci El usage_status:Ev Sahibi Oturuyor deed:Arsa dues: swap: student_single: facade: fuel_type:Doğalgaz
------------------------------

--------------------------------
city:İstanbul district:Beşiktaş neighborhood:Levazım prop_type:Residence rooms:4 living_room:1 price:15.000.000 TL
last_edit:19 Gün Önce area:330 m2 prop_floor:8. Kat building_age:4 Yaşında heating_type:Merkezi (Pay Ölçer) building_floors:18 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:4 building_type:Betonarme building_status:İkinci El usage_status:Boş deed:Kat Mülkiyeti dues: swap:Hayır student_single:Evet facade:Kuzey, fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Kartal neighborhood:Kordonboyu prop_type:Daire rooms:3 living_room:1 price:495.000 TL
last_edit:10 Gün Önce area:150 m2 prop_floor:1. Kat building_age:30 Yaşında heating_type:Kombi building_floors:5 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:1 building_type:Betonarme building_status:İkinci El usage_status:Kiracı Oturuyor deed:Kat Mülkiyeti dues:50 TL swap:Hayır student_single:Evet facade:Kuzey, 

last_edit:32 Gün Önce area:205 m2 prop_floor:Ara Kat building_age:4 Yaşında heating_type:Soba building_floors:9 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:2 building_type:Betonarme building_status: usage_status:Boş deed:Kat Mülkiyeti dues:600 TL swap:Hayır student_single: facade: fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Sarıyer neighborhood:Zekeriyaköy prop_type:Daire rooms:3 living_room:1 price:1.250.000 TL
last_edit:27 Gün Önce area:168 m2 prop_floor:Ara Kat building_age:15 Yaşında heating_type:Kombi building_floors:5 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:2 building_type:Betonarme building_status:İkinci El usage_status:Ev Sahibi Oturuyor deed:Kat İrtifakı dues:790 TL swap:Hayır student_single:Evet facade:Batı fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Bakırköy neighborhood:Yeşilyurt prop_type:Daire rooms:3 living_room:1 price:1.900.000 TL
last_e

--------------------------------
city:İstanbul district:Beyoğlu neighborhood:Cihangir prop_type:Bina rooms:2 living_room:1 price:1.550.000 TL
last_edit:21 Gün Önce area:110 m2 prop_floor:1. Kat building_age:50 Yaşında heating_type:Kombi building_floors:5 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:2 building_type:Kagir building_status:Sıfır usage_status:Boş deed:Kat Mülkiyeti dues:30 TL swap:Hayır student_single: facade:Kuzey fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Şişli neighborhood:Kuştepe prop_type:Residence rooms:2 living_room:1 price:7.500.000 TL
last_edit:6 Gün Önce area:254 m2 prop_floor:21 ve üzeri building_age:10 Yaşında heating_type:Merkezi building_floors:31 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:3 building_type: building_status:İkinci El usage_status:Kiracı Oturuyor deed: dues: swap: student_single: facade: fuel_type:
--------------------------------
city:İstanbul dis

city:İstanbul district:Kadıköy neighborhood:Zühtüpaşa prop_type:Daire rooms:2 living_room:1 price:550.000 TL
last_edit:Bugün area:90 m2 prop_floor:1. Kat building_age:41 Yaşında heating_type:Merkezi building_floors:4 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:1 building_type: building_status: usage_status: deed:Kat Mülkiyeti dues:200 TL swap: student_single: facade:Güney fuel_type:
--------------------------------
city:İstanbul district:Küçükçekmece neighborhood:Cennet prop_type:Daire rooms:1 living_room:1 price:350.000 TL
last_edit:11 Gün Önce area:80 m2 prop_floor:Yüksek Giriş building_age:2 Yaşında heating_type:Kombi building_floors:4 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:1 building_type: building_status: usage_status: deed: dues: swap: student_single: facade: fuel_type:
--------------------------------
city:İstanbul district:Beşiktaş neighborhood:Ulus prop_type:Daire rooms:5 living_room:2 price:21.000.000 TL

last_edit:6 Gün Önce area:75 m2 prop_floor:2. Kat building_age:3 Yaşında heating_type:Kombi building_floors:4 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:1 building_type:Betonarme building_status:İkinci El usage_status:Kiracı Oturuyor deed:Kat Mülkiyeti dues: swap:Hayır student_single:Evet facade:Doğu, fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Adalar neighborhood:Maden prop_type:Daire rooms:4 living_room:1 price:1.100.000 TL
last_edit:49 Gün Önce area:180 m2 prop_floor:Bahçe Katı building_age:45 Yaşında heating_type:Isıtma Yok building_floors:3 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:4 building_type:Yığma building_status:İkinci El usage_status:Ev Sahibi Oturuyor deed: dues:200 TL swap:Hayır student_single: facade:Kuzey, fuel_type:
--------------------------------
city:İstanbul district:Bahçelievler neighborhood:Siyavuşpaşa prop_type:Daire rooms:3 living_room:1 price:410.000 TL
last_

city:İstanbul district:Maltepe neighborhood:Feyzullah prop_type:Daire rooms:2 living_room:1 price:750.000 TL
last_edit:2 Gün Önce area:90 m2 prop_floor:1. Kat building_age:Sıfır Bina heating_type:Kombi building_floors:5 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:1 building_type:Betonarme building_status:Sıfır usage_status:Boş deed:Kat Mülkiyeti dues:100 TL swap:Hayır student_single:Evet facade: fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Kartal neighborhood:Hürriyet prop_type:Daire rooms:2 living_room:1 price:165.000 TL
last_edit:16 Gün Önce area:75 m2 prop_floor:Giriş Katı building_age:6 Yaşında heating_type:Kombi building_floors:5 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:1 building_type:Betonarme building_status:İkinci El usage_status:Boş deed:Kat İrtifakı dues:40 TL swap:Evet student_single:Evet facade: fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Beşi

city:İstanbul district:Pendik neighborhood:Ramazanoğlu prop_type:Müstakil Ev rooms:2 living_room:1 price:680.000 TL
last_edit:18 Gün Önce area:240 m2 prop_floor:Giriş Katı building_age:18 Yaşında heating_type:Kombi building_floors: credit_availability:Bilinmiyor furniture_availability:Eşyalı Değil bathrooms:1 building_type:Betonarme building_status:İkinci El usage_status:Ev Sahibi Oturuyor deed: dues: swap:Evet student_single: facade: fuel_type:Doğalgaz
--------------------------------
city:İstanbul district:Kadıköy neighborhood:Göztepe prop_type:Daire rooms:3 living_room:1 price:1.590.000 TL
last_edit:28 Gün Önce area:135 m2 prop_floor:7. Kat building_age:Sıfır Bina heating_type:Merkezi (Pay Ölçer) building_floors:8 Katlı credit_availability:Uygun furniture_availability:Eşyalı Değil bathrooms:2 building_type:Betonarme building_status:Sıfır usage_status:Boş deed:Kat İrtifakı dues: swap: student_single: facade:Güney fuel_type:Doğalgaz
--------------------------------
city:İstanbul distr

In [14]:
# Columns of the exported table, in output order. The first group is read from
# each advert's base_info, the second from the advert itself. The column names
# are the attribute names, so one loop replaces the per-field append lines.
# (The original dict literal listed 'prop_type' twice.)
base_columns = ['city', 'district', 'neighborhood', 'prop_type', 'rooms', 'living_room', 'price']
adv_columns = ['last_edit', 'area', 'prop_floor', 'building_age', 'heating_type',
               'building_floors', 'credit_availability', 'furniture_availability', 'bathrooms',
               'building_type', 'building_status', 'usage_status', 'deed', 'dues', 'swap',
               'student_single', 'facade', 'fuel_type']

props_dict = {column: [] for column in base_columns + adv_columns}
for prop in objects_list:
    for column in base_columns:
        props_dict[column].append(getattr(prop.base_info, column))
    for column in adv_columns:
        props_dict[column].append(getattr(prop, column))

# Print the length of every column.
for key in props_dict.keys():
    print('size: {}'.format(len(props_dict[key])))

size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800
size: 4800


In [15]:
# Create the output directory first: to_csv does not create missing folders,
# and failing here would throw away the whole scrape.
os.makedirs('./data', exist_ok=True)
dataframe = pd.DataFrame(props_dict)
# index=False: the row index is not written to the file.
dataframe.to_csv('./data/hurriyet7.csv',index=False)

In [16]:
# Read the exported CSV back and display its first rows.
dataframe = pd.read_csv('./data/hurriyet7.csv')
dataframe.head()

Unnamed: 0,city,district,neighborhood,prop_type,rooms,living_room,price,last_edit,area,prop_floor,...,bathrooms,building_type,building_status,usage_status,deed,dues,swap,student_single,facade,fuel_type
0,İstanbul,Eyüpsultan,Akşemsettin,Daire,4,2,670.000 TL,9 Gün Önce,210 m2,4. Kat,...,2,Betonarme,İkinci El,Ev Sahibi Oturuyor,Kat Mülkiyeti,35 TL,Hayır,,Doğu,Doğalgaz
1,İstanbul,Silivri,Selimpaşa,Çiftlik Evi,15,2,17.500.000 TL,9 Gün Önce,850 m2,Villa Katı,...,5,Betonarme,İkinci El,Ev Sahibi Oturuyor,Arsa,,,,,Doğalgaz
2,İstanbul,Silivri,Ortaköy,Villa,12,1,4.000.000 TL,7 Gün Önce,500 m2,Villa Katı,...,2,,,,,,,,,
3,İstanbul,Maltepe,Aydınevler,Daire,2,1,390.000 TL,7 Gün Önce,90 m2,2. Kat,...,1,Betonarme,Sıfır,Boş,Kat Mülkiyeti,120 TL,Hayır,Evet,,Doğalgaz
4,İstanbul,Fatih,Ayvansaray,Müstakil Ev,2,2,320.000 TL,34 Gün Önce,135 m2,,...,1,,,,,,,,,
