In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
def getPrice(flat_page):
    """Return the listing price shown on a flat page as an integer.

    Parameters
    ----------
    flat_page : object exposing a BeautifulSoup-style ``find(name, attrs=...)``
        Parsed listing page; the price is read from the
        ``div`` with class ``object_descr_price``.

    Returns
    -------
    int
        The last (up to) three digit groups of the element joined together,
        e.g. "6 500 000" -> 6500000.

    Raises
    ------
    ValueError
        If the element contains no digit group (for example when ``find``
        returned ``None``).
    """
    price_div = flat_page.find('div', attrs={'class': 'object_descr_price'})
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # that recent Python versions warn about.
    tokens = re.split(r'<div>|руб|\W', str(price_div))
    digit_groups = [token for token in tokens if token.isdigit()]
    if not digit_groups:
        raise ValueError("no price found on the page")
    # Only the trailing three groups are kept, which skips earlier numbers in
    # the markup; a price made of more than three groups would be truncated.
    return int("".join(digit_groups[-3:]))

In [3]:
def getCoords(flat_page):
    """Return ``(lat, lon)`` parsed from the map element of a flat page.

    The second child (``contents[1]``) of the ``div`` with class
    ``map_info_button_extend`` is converted to text and split on ``&amp``,
    ``center=`` and ``%2C``; the first two pieces that start with a digit are
    taken as latitude and longitude, in that order.

    Parameters
    ----------
    flat_page : object exposing a BeautifulSoup-style ``find(name, attrs=...)``

    Returns
    -------
    tuple of (float, float)

    Raises
    ------
    IndexError
        If fewer than two numeric pieces are present.
    ValueError
        If a numeric-looking piece cannot be converted to ``float``.
    """
    map_div = flat_page.find('div', attrs={'class': 'map_info_button_extend'})
    map_link = str(map_div.contents[1])
    pieces = re.split('&amp|center=|%2C', map_link)
    # re.split yields empty strings when the text starts or ends with a
    # delimiter or two delimiters are adjacent; slicing (piece[:1]) instead of
    # indexing (piece[0]) keeps those from raising IndexError.
    numeric = [piece for piece in pieces if piece[:1].isdigit()]
    lat = float(numeric[0])
    lon = float(numeric[1])
    return lat, lon

In [4]:
def getRoom(flat_page):
    """Return the part of the listing title that precedes the room marker.

    The text of the ``div`` with class ``object_descr_title`` (tags removed
    by ``html_stripper``) is split on hyphens and newlines. Pieces are
    collected until one containing ``'комн'`` is met; the collected text is
    returned as a string with all whitespace removed.
    """
    title_div = flat_page.find('div', attrs={'class':'object_descr_title'})
    title_text = html_stripper(title_div)

    leading_pieces = []
    for piece in re.split('-|\n', title_text):
        if 'комн' in piece:
            break
        leading_pieces.append(piece)

    # split() + join drops every whitespace character from the result.
    return "".join("".join(leading_pieces).split())

In [5]:
def getWalk(flat_page):
    """Return 1 if the metro comment contains "пешком", otherwise 0.

    The comment is the ``span`` with class ``object_item_metro_comment``;
    a missing element simply yields 0 because its text form lacks the word.
    """
    metro_comment = flat_page.find('span', attrs={'class':'object_item_metro_comment'})
    return int("пешком" in str(metro_comment))

In [6]:
def getS(table):
    """Return the first area value found in the stripped property table.

    The text is split on ``'площадь:\\n\\n'`` and ``'м2'``; the piece right
    after the first delimiter is converted to ``float`` with the decimal
    comma replaced by a dot.
    """
    pieces = re.split('площадь:\n\n|м2', table)
    area_text = pieces[1].replace(",", ".")
    return float(area_text)

In [7]:
def getLivingS(table):
    """Return the living area from the stripped property table as a float.

    The value is the text between ``'Жилая площадь:\\n\\n'`` and
    ``'м2\\n\\n\\n\\nПлощадь кухни'``, with the decimal comma replaced by a dot.
    """
    pieces = re.split('Жилая площадь:\n\n|м2\n\n\n\nПлощадь кухни', table)
    living_text = pieces[1].replace(",", ".")
    return float(living_text)

In [8]:
def getKitchenS(table):
    """Return the kitchen area from the stripped property table as a float.

    The value is the text between ``'кухни:\\n\\n'`` and ``'Балкон'``, read up
    to the ``'м2'`` unit with the decimal comma replaced by a dot. An en dash
    ("–") in place of the number is reported as 0.0.

    Raises
    ------
    IndexError
        If the ``'кухни:\\n\\n'`` delimiter is not present.
    ValueError
        If the extracted text is neither a number nor a dash.
    """
    raw = re.split("кухни:\n\n|Балкон", table)[1]
    # Parse the whole number rather than only its first character, so that
    # "10,5 м2" is read as 10.5 and not as 1.0.
    value = raw.split("м2")[0].strip().replace(",", ".")
    if value.startswith("–"):
        return 0.0
    return float(value)

In [9]:
def getTel(table):
    """Return 0 when the phone field of the table is exactly 'нет', else 1.

    The field is the text between ``'Телефон:\\n'`` and ``'\\n\\n\\nВид'``.
    """
    phone_field = re.split('Телефон:\n|\n\n\nВид', table)[1]
    return 0 if phone_field == 'нет' else 1

In [10]:
def getNew(table):
    """Return 0 when the house-type field contains 'вторичка', else 1.

    The field is the piece that follows ``'Тип дома:\\n\\n'`` when the table
    text is split on that marker and on ``', \\n'``.
    """
    house_type = re.split('Тип дома:\n\n|, \n', table)[1]
    return 0 if 'вторичка' in house_type else 1

In [11]:
def getBrick(table):
    """Return 1 if the table text mentions 'кирп', 'жб' or 'монолит', else 0."""
    markers = ('кирп', 'жб', 'монолит')
    return int(any(marker in table for marker in markers))

In [12]:
def getMetrDist(flat_page):
    """Return the number found in the metro comment of a flat page as a float.

    Every digit in the text form of the ``span`` with class
    ``object_item_metro_comment`` is concatenated and converted, so
    "15 мин. пешком" gives 15.0.

    Raises
    ------
    ValueError
        If the element contains no digits (for example when ``find``
        returned ``None``).
    """
    metro_comment = str(flat_page.find('span', attrs={'class': 'object_item_metro_comment'}))
    # Raw string: "\D" inside a plain literal is an invalid escape sequence
    # that recent Python versions warn about.
    digits = re.sub(r"\D", "", metro_comment)
    if not digits:
        raise ValueError("no metro distance found on the page")
    return float(digits)

In [13]:
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """Great-circle distance in metres between two points on the earth.

    All four arguments are decimal degrees; note the (lon, lat) argument
    order. The haversine formula is evaluated with a sphere radius of
    6367 km and the result is scaled to metres.
    """
    # Work in radians from here on.
    lam1, phi1, lam2, phi2 = (radians(deg) for deg in (lon1, lat1, lon2, lat2))

    half_dphi = (phi2 - phi1) / 2
    half_dlam = (lam2 - lam1) / 2

    # Haversine of the central angle between the two points.
    hav = sin(half_dphi)**2 + cos(phi1) * cos(phi2) * sin(half_dlam)**2
    central_angle = 2 * asin(sqrt(hav))

    # Radius in km times the angle gives km; the last factor converts to metres.
    return 6367 * central_angle * 1000.0

def getDist(lat1, lon1):
    """Return the haversine distance in metres from a point to a fixed centre.

    The centre is hard-coded at latitude 55.75370903771494 and longitude
    37.61981338262558 (presumably the city centre used as the reference
    point; confirm against the data source).
    """
    centre = {'lat': 55.75370903771494, 'lon': 37.61981338262558}
    # haversine takes (lon, lat) pairs, the reverse of this function's order.
    return haversine(lon1, lat1, centre['lon'], centre['lat'])

In [14]:
def getBallod(table):
    """Return 1 if the balcony field mentions 'бал' or 'лод', else 0.

    The field is the text between ``'Балкон:\\n'`` and ``'\\n\\n\\nЛифт'``.
    """
    balcony_field = re.split('Балкон:\n|\n\n\nЛифт', table)[1]
    mentioned = 'бал' in balcony_field or 'лод' in balcony_field
    return 1 if mentioned else 0

In [15]:
def getFloorAndTotalFloor(table):
    """Return the floor numbers found in the table as a list of ints.

    The text between ``'Этаж'`` and ``'Тип'`` is reduced to digits and
    slashes and split on ``'/'``, so "5 / 9" gives ``[5, 9]``. A field
    without a slash gives a one-element list.

    Raises
    ------
    IndexError
        If neither delimiter is present in ``table``.
    ValueError
        If a piece between slashes is empty or not a valid integer.
    """
    floor_field = re.split('Этаж|Тип', table)[1]
    # Raw string avoids the invalid '\/' escape of a plain literal. The class
    # keeps digits, backslashes and slashes, exactly as before; a separate
    # '|_' alternative is unnecessary because the negated class already
    # matches underscores.
    cleaned = re.sub(r'[^0-9\\/]', '', floor_field)
    return [int(part) for part in cleaned.split('/')]

In [16]:
def html_stripper(text):
    """Return ``str(text)`` with every ``<...>`` tag-like span removed."""
    tag_pattern = re.compile('<[^<]+?>')
    as_text = str(text)
    return tag_pattern.sub('', as_text)

In [17]:
# Search-results URL template; the trailing `{}` is a placeholder that is
# filled with the page number via str.format.
orehovo = 'http://www.cian.ru/cat.php?deal_type=sale&district%5B0%5D=95&engine_version=2&offer_type=flat&p={}'

In [18]:
# Collect the numeric listing ids from search-result pages 1..29.
links = []
for page in range(1, 30):
    page_url =  orehovo.format(page)

    # Without a timeout a stalled connection would hang this cell forever.
    search_page = requests.get(page_url, timeout=30)
    search_page = search_page.content
    search_page = BeautifulSoup(search_page, 'lxml')

    flat_urls = search_page.findAll('div', attrs = {'ng-class':"{'serp-item_removed': offer.remove.state, 'serp-item_popup-opened': isPopupOpen}"})
    # Splitting the markup on the listing URL prefix and on the text that
    # follows the id leaves each id as a stand-alone piece. The dots are
    # escaped so they match literally instead of matching any character.
    flat_urls = re.split(r'http://www\.cian\.ru/sale/flat/|/" ng-class="', str(flat_urls))

    for link in flat_urls:
        # Only the purely numeric pieces are listing ids.
        if link.isdigit():
            links.append(link)

In [19]:
# Download every collected listing and turn it into a dict of features.
# Listings that fail to download or parse are recorded in `errors` and skipped.
flats = []
errors = []
for i, link in enumerate(links):
    # Built outside the try block so the error handler can always refer to it.
    flat_url = 'http://www.cian.ru/sale/flat/' + str(link) + '/'
    print(flat_url)
    try:
        # Without a timeout a stalled connection would hang the whole loop.
        flat_page = requests.get(flat_url, timeout=30)
        flat_page = flat_page.content
        flat_page = BeautifulSoup(flat_page, 'lxml')
        table = flat_page.find('table', attrs = {'class':'object_descr_props'})
        table = html_stripper(table)
        flatStats = {'district':'Orekhovo-Borisovo Yuzhnoye'}
        flatStats['N'] = i
        flatStats['Rooms'] = getRoom(flat_page)
        flatStats['Price'] = getPrice(flat_page)
        flatStats['Totsp'] = getS(table)
        flatStats['Livsp'] = getLivingS(table)
        flatStats['Kitsp'] = getKitchenS(table)
        flatStats['Metrdist'] = getMetrDist(flat_page)
        flatStats['Walk'] = getWalk(flat_page)
        flatStats['Brick'] = getBrick(table)
        flatStats['Tel'] = getTel(table)
        flatStats['Bal'] = getBallod(table)
        # Parse the floor field once and reuse the result for both columns.
        floors = getFloorAndTotalFloor(table)
        flatStats['Floor'] = floors[0]
        flatStats['Nfloors'] = floors[1]
        flatStats['New'] = getNew(table)
        coords = getCoords(flat_page)
        flatStats['Dist'] = getDist(coords[0], coords[1])
        flats.append(flatStats.copy())
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the
        # loop; the cause is printed so failures can be diagnosed afterwards.
        print("Error at " + flat_url + ": " + repr(exc))
        errors.append(flat_url)

http://www.cian.ru/sale/flat/151065385/
http://www.cian.ru/sale/flat/151037285/
http://www.cian.ru/sale/flat/147023453/
http://www.cian.ru/sale/flat/141527552/
http://www.cian.ru/sale/flat/150703591/
http://www.cian.ru/sale/flat/51252264/
http://www.cian.ru/sale/flat/51250999/
http://www.cian.ru/sale/flat/51251559/
http://www.cian.ru/sale/flat/150924306/
http://www.cian.ru/sale/flat/147494757/
http://www.cian.ru/sale/flat/149429092/
http://www.cian.ru/sale/flat/147898966/
http://www.cian.ru/sale/flat/148566952/
http://www.cian.ru/sale/flat/51251534/
http://www.cian.ru/sale/flat/149341092/
http://www.cian.ru/sale/flat/150456303/
http://www.cian.ru/sale/flat/150226105/
http://www.cian.ru/sale/flat/150855351/
http://www.cian.ru/sale/flat/149837534/
http://www.cian.ru/sale/flat/149016772/
http://www.cian.ru/sale/flat/151080346/
http://www.cian.ru/sale/flat/149580164/
http://www.cian.ru/sale/flat/149595900/
http://www.cian.ru/sale/flat/149352714/
http://www.cian.ru/sale/flat/149482802/
http

In [25]:
import csv

def export_dict_list_to_csv(data, filename):
    """Write a list of dictionaries to ``filename`` as CSV.

    The header row is the sorted key list of the first dictionary; every
    dictionary is written as one row in that column order. All dictionaries
    are assumed to share the keys of the first one.

    Parameters
    ----------
    data : list of dict
        Rows to export. An empty list produces an empty file.
    filename : str
        Path of the file to (over)write.

    Raises
    ------
    KeyError
        If a dictionary lacks one of the header keys.
    """
    # newline='' lets the csv module control line endings itself (otherwise
    # extra blank lines appear on platforms that translate '\n'); an explicit
    # encoding keeps non-ASCII values independent of the locale default.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        if not data:
            return

        headers = sorted(data[0])
        # Build every row before writing so a bad record cannot leave a
        # half-written file behind.
        rows = [[record[h] for h in headers] for record in data]

        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(rows)

In [26]:
# Save the scraped flats to the relative path "flats.csv".
export_dict_list_to_csv(flats, "flats.csv")