In [1]:
import json
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup


# Column store for the scraped listings: one list per field, one entry per house.
# The key order is significant: it is the column order of the exported sheet.
house_information = {
    field: []
    for field in [
        'house_title',              # listing title
        'total_price',              # total price
        'total_price_unit',         # unit of the total price
        'unit_price',               # unit price
        'unit_price_unit',          # unit of the unit price
        'community_name',           # residential community name
        'administrative_region',    # administrative district
        'area_name',                # area name
        'house_type',               # layout (rooms / halls)
        'house_floor',              # floor
        'house_area',               # building area
        'house_structure',          # layout structure
        'construction_type',        # building type
        'house_direction',          # orientation
        'construction_structure',   # building structure
        'decoration_status',        # decoration status
        'elevator_house_ratio',     # elevator-to-household ratio
        'elevator_status',          # elevator available
        'listing_time',             # listing date
        'transaction_rights',       # transaction ownership type
        'last_transaction_time',    # last transaction
        'house_purpose',            # house usage
        'house_year_limit',         # years of ownership
        'belongings',               # property-right ownership
        'mortgage_status',          # mortgage information
        'house_backup',             # title-deed copy
        'house_type_distribution',  # room-by-room breakdown
        'longitude',
        'latitude',
        'subway',
        'subway_distance',
        'bus',
        'bus_distance',
        'kindergarten',
        'kindergarten_distance',
        'primary_school',
        'primary_school_distance',
        'middle_school',
        'middle_school_distance',
        'university',
        'university_distance',
        'hospital',
        'hospital_distance',
        'pharmacy',
        'pharmacy_distance',
        'shop',
        'shop_distance',
        'supermarket',
        'supermarket_distance',
        'market',
        'market_distance',
        'bank',
        'bank_distance',
        'ATM',
        'ATM_distance',
        'restaurant',
        'restaurant_distance',
        'cafe',
        'cafe_distance',
        'park',
        'park_distance',
        'cinema',
        'cinema_distance',
        'fitness_center',
        'fitness_center_distance',
        'gym',
        'gym_distance',
        'house_link',               # listing URL
    ]
}

# Template for the location-derived ("dynamic") fields of one house. After the two
# coordinates, the keys come in (name, name + '_distance') pairs.
dynamic_data = dict.fromkeys(
    [
        # coordinates
        'longitude', 'latitude',
        # transport
        'subway', 'subway_distance',
        'bus', 'bus_distance',
        # education
        'kindergarten', 'kindergarten_distance',
        'primary_school', 'primary_school_distance',
        'middle_school', 'middle_school_distance',
        'university', 'university_distance',
        # health care
        'hospital', 'hospital_distance',
        'pharmacy', 'pharmacy_distance',
        # shopping
        'shop', 'shop_distance',
        'supermarket', 'supermarket_distance',
        'market', 'market_distance',
        # daily life
        'bank', 'bank_distance',
        'ATM', 'ATM_distance',
        'restaurant', 'restaurant_distance',
        'cafe', 'cafe_distance',
        # leisure
        'park', 'park_distance',
        'cinema', 'cinema_distance',
        'fitness_center', 'fitness_center_distance',
        'gym', 'gym_distance',
    ],
    '',
)

# Chinese column headers of the exported sheet, in the same order as the keys of
# `house_information`.
columns = [
    # listing, price and location
    '房屋标题', '房屋总价', '房价单位', '单价', '单价单位', '小区名称', '行政区', '区域名称',
    # basic attributes
    '房屋户型', '所在楼层', '建筑面积', '户型结构', '建筑类型', '房屋朝向', '建筑结构', '装修情况',
    '梯户比例', '配备电梯',
    # transaction attributes
    '挂牌时间', '交易权属', '上次交易', '房屋用途', '房屋年限', '产权所属', '抵押信息', '房本备件',
    '户型分布',
    # coordinates
    '经度', '纬度',
    # nearest points of interest and their distances
    '地铁站', '距地铁站距离',
    '汽车站', '距汽车站距离',
    '幼儿园', '距幼儿园距离',
    '小学', '距小学距离',
    '初中', '距初中距离',
    '大学', '距大学距离',
    '医院', '距医院距离',
    '药店', '距药店距离',
    '商场', '距商场距离',
    '超市', '距超市距离',
    '市场', '距市场距离',
    '银行', '距银行距离',
    'ATM', '距ATM距离',
    '餐厅', '距餐厅距离',
    '咖啡厅', '距咖啡厅距离',
    '公园', '距公园距离',
    '电影院', '距电影院距离',
    '健身房', '距健身房距离',
    '体院馆', '距体院馆距离',
    # listing URL
    '房屋链接',
]

# Chinese headers of the location-derived columns only (coordinates plus the
# nearest point of interest of each kind and its distance).
dynamic_data_cn = [
    '经度', '纬度',
    '地铁站', '距地铁站距离',
    '汽车站', '距汽车站距离',
    '幼儿园', '距幼儿园距离',
    '小学', '距小学距离',
    '初中', '距初中距离',
    '大学', '距大学距离',
    '医院', '距医院距离',
    '药店', '距药店距离',
    '商场', '距商场距离',
    '超市', '距超市距离',
    '市场', '距市场距离',
    '银行', '距银行距离',
    'ATM', '距ATM距离',
    '餐厅', '距餐厅距离',
    '咖啡厅', '距咖啡厅距离',
    '公园', '距公园距离',
    '电影院', '距电影院距离',
    '健身房', '距健身房距离',
    '体院馆', '距体院馆距离',
]

In [2]:
def load_amap_dynamic_data(address, city_name, amap_key, dynamic_data = dynamic_data):
    """Geocode an address with the Amap REST API and find the nearest POI of each kind.

    Parameters
    ----------
    address : str
        Free-text address to geocode.
    city_name : str
        City used to disambiguate the address.
    amap_key : str
        Amap web-service API key.
    dynamic_data : dict
        Template whose keys define the output fields: the first two keys are the
        coordinates, the rest come in (name, name + '_distance') pairs. Only the
        keys are read; the template itself is not modified.

    Returns
    -------
    dict
        A new dict with the template's keys. Coordinates and POI names/distances
        are the strings returned by the API; a POI kind with no match keeps ''.

    Raises
    ------
    KeyError, IndexError
        If the geocoding response carries no 'geocodes' entry (for example an
        unknown address or an API-level error response).
    """
    # Let requests build and percent-encode the query string, so addresses that
    # contain '&', '#' or spaces cannot corrupt the request.
    geo_response = requests.get(
        'https://restapi.amap.com/v3/geocode/geo',
        params={'key': amap_key, 'address': address, 'city': city_name},
    ).json()
    location = geo_response['geocodes'][0]['location']

    # Return a fresh dict per call: handing back the shared template would make
    # every result an alias of the same object (and leak values between houses).
    result = dict.fromkeys(dynamic_data, '')

    # Amap encodes a location as "longitude,latitude" (longitude first).
    longitude, latitude = location.split(',')
    result['longitude'] = longitude
    result['latitude'] = latitude

    # One search keyword per POI field, in the same order as the template's keys.
    key_word_list = ['地铁', '公交', '幼儿园', '小学', '初中', '大学',
                     '医院', '药店', '商场', '超市', '市场', '银行', 'ATM',
                     '餐厅', '咖啡馆', '公园', '电影院', '健身房', '体育馆']

    # Skip the two coordinate keys, then take every other key (the POI names;
    # the keys in between are the matching '_distance' fields).
    poi_keys = list(dynamic_data)[2::2]

    for dynamic_data_key, key_word in zip(poi_keys, key_word_list):
        place_response = requests.get(
            'https://restapi.amap.com/v3/place/around',
            params={'key': amap_key, 'location': location, 'keywords': key_word, 'extensions': 'all'},
        ).json()

        # No POI of this kind nearby: leave both fields empty instead of crashing.
        pois = place_response.get('pois') or []
        if pois:
            result[dynamic_data_key] = pois[0]['name']
            result[dynamic_data_key + '_distance'] = pois[0]['distance']

    return result

In [3]:
# (field key, page label) pairs of the basic-attribute list on a detail page.
_BASE_FIELD_LABELS = (
    ('house_type', '房屋户型'),
    ('house_floor', '所在楼层'),
    ('house_area', '建筑面积'),
    ('house_structure', '户型结构'),
    ('construction_type', '建筑类型'),
    ('house_direction', '房屋朝向'),
    ('construction_structure', '建筑结构'),
    ('decoration_status', '装修情况'),
    ('elevator_house_ratio', '梯户比例'),
    ('elevator_status', '配备电梯'),
)

# (field key, page label) pairs of the transaction list whose value is the text
# node right after the label <span>. The mortgage field is handled separately.
_TRANSACTION_FIELD_LABELS = (
    ('listing_time', '挂牌时间'),
    ('transaction_rights', '交易权属'),
    ('last_transaction_time', '上次交易'),
    ('house_purpose', '房屋用途'),
    ('house_year_limit', '房屋年限'),
    ('belongings', '产权所属'),
    ('house_backup', '房本备件'),
)


def _labelled_items(section):
    """Map the label text of every <li> in `section` to the <li> tag itself."""
    items = {}
    if section is None:
        return items
    for item in section.find_all('li'):
        label = item.find('span')
        if label is not None:
            items[label.getText().strip()] = item
    return items


def _label_value(items, label):
    """Return the stripped text following the label <span>, or '' if unavailable."""
    item = items.get(label)
    if item is None:
        return ''
    value = item.find('span').next_sibling
    # next_sibling may be missing or another tag; only a text node carries a value.
    return value.strip() if isinstance(value, str) else ''


def _parse_house_detail(house_soup):
    """Extract every detail-page field of one listing into a {field key: value} dict."""
    record = {}

    # Price section
    price_section = house_soup.find('div', class_='price')
    record['total_price'] = price_section.find('span', class_='total').getText().strip()
    record['total_price_unit'] = price_section.find('span', class_='unit').getText().strip()
    record['unit_price'] = price_section.find('span', class_='unitPriceValue').getText().strip()
    record['unit_price_unit'] = price_section.find('i').getText().strip()

    # Community section
    community_section = house_soup.find('div', class_='communityName')
    record['community_name'] = community_section.find('a', class_='info').getText().strip()

    # Area section: first link is the administrative region, second the area name
    area_section = house_soup.find('div', class_='areaName')
    area_names = area_section.find('span', class_='info').find_all('a')
    record['administrative_region'] = area_names[0].getText().strip()
    record['area_name'] = area_names[1].getText().strip()

    # Intro content: first <ul> holds basic attributes, second the transaction ones
    intro_content_section = house_soup.find('div', class_='introContent')
    sections = intro_content_section.find_all('ul')
    base_items = _labelled_items(sections[0] if len(sections) > 0 else None)
    transaction_items = _labelled_items(sections[1] if len(sections) > 1 else None)

    # Look every field up by its label, so a missing or reordered entry yields ''
    # for that field only and cannot shift the values of the fields after it.
    for key, label in _BASE_FIELD_LABELS:
        record[key] = _label_value(base_items, label)
    for key, label in _TRANSACTION_FIELD_LABELS:
        record[key] = _label_value(transaction_items, label)

    # The mortgage value lives in the last <span> of its entry, not in a text node.
    mortgage_item = transaction_items.get('抵押信息')
    if mortgage_item is not None:
        record['mortgage_status'] = mortgage_item.find_all('span')[-1].getText()
    else:
        record['mortgage_status'] = ''

    # Room breakdown: one 'col|col|...|' string per table row
    house_type_distribution = []
    house_type_distribution_section = house_soup.find('div', class_='des')
    if house_type_distribution_section is not None:
        for row in house_type_distribution_section.find_all('div', class_='row'):
            nodes = row.find_all('div', class_='col')
            house_type_distribution.append(''.join(node.getText().strip() + '|' for node in nodes))
    record['house_type_distribution'] = house_type_distribution

    return record


def load_pages(url, city_name, amap_key, dynamic_data = dynamic_data, load_dynamic = True, verbose = 0):
    """Scrape one listing page and append every house on it to `house_information`.

    For each house panel on the listing page, the detail page is downloaded and
    parsed. All fields of a house are appended together once the house has been
    fully processed, so an error in the middle of a house cannot leave the
    columns of `house_information` with different lengths.

    Parameters
    ----------
    url : str
        URL of the listing page.
    city_name : str
        City passed to the geocoder when `load_dynamic` is true.
    amap_key : str
        Amap API key used when `load_dynamic` is true.
    dynamic_data : dict
        Values stored in the location-derived columns when `load_dynamic` is false.
    load_dynamic : bool
        If true, query `load_amap_dynamic_data` for every house.
    verbose : int
        >= 1 prints index, title and link of each house; >= 2 also prints the
        time spent on the Amap lookups.

    Returns
    -------
    dict
        The module-level `house_information` accumulator (not a copy).
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    # load page from url
    page = requests.get(url, headers = headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    house_list = soup.find('ul', class_='sellListContent')
    if house_list is None:
        # No listing container on the page: nothing to add.
        print('No house list found on', url)
        return house_information

    # get each panel of house page
    houses = house_list.find_all('li', {'class': 'clear'})

    for url_index, house in enumerate(houses, start = 1):

        # title and link come from the same anchor of the listing panel
        title_link = house.find('div', class_='title').find('a', class_='VIEWDATA')
        house_title = title_link.getText().strip()
        house_link = title_link.get('href')

        # open house link to get more information
        house_page = requests.get(house_link, headers = headers)
        house_soup = BeautifulSoup(house_page.text, 'html.parser')

        record = {'house_title': house_title, 'house_link': house_link}
        record.update(_parse_house_detail(house_soup))

        if verbose >= 1:
            print(url_index, house_title, house_link)

        if load_dynamic:
            start_time = time.time()
            address = record['administrative_region'] + ' ' + record['community_name'] + ' ' + record['area_name']
            dynamic_data = load_amap_dynamic_data(address, city_name, amap_key)
            end_time = time.time()
            if verbose >= 2:
                print('dynamic data takes:', end_time - start_time)

        for key in dynamic_data.keys():
            record[key] = dynamic_data[key]

        # Commit the complete row in one go.
        for key, value in record.items():
            house_information[key].append(value)

    return house_information

### Single thread

In [None]:
def single_thread(city_name, amap_key, city_code, excel_name, total_page = 100, get_dynamic_information = True, verbose = 1):
    """Crawl listing pages 1..total_page one after another and export them to Excel.

    The module-level `house_information` and `dynamic_data` dicts are emptied in
    place first (every value becomes a new empty list), then each page is scraped
    with `load_pages` and the accumulated data is written to `excel_name` with
    the headers from `columns`.
    """
    # Reset the shared accumulators in place so load_pages sees the same dicts.
    for accumulator in (house_information, dynamic_data):
        for key in list(accumulator):
            accumulator[key] = []

    page_url_prefix = 'https://' + city_code + '.ke.com/ershoufang/pg'
    page_number = 1
    while page_number <= total_page:
        print('Page:', page_number)
        started = time.time()
        url = page_url_prefix + str(page_number)
        print(url)
        load_pages(url, city_name, amap_key, dynamic_data, get_dynamic_information, verbose)
        elapsed = time.time() - started
        print('Page %d takes: %f' % (page_number, elapsed))
        page_number += 1

    # One column per accumulator key, relabelled with the Chinese headers.
    houses = pd.DataFrame(house_information)
    houses.columns = columns
    houses.to_excel(excel_name, index=False)

In [None]:
# Run configuration for the single-process crawl of the city-wide listing pages.
city_name = '上海'
city_code = 'sh'
excel_name = city_name + '.xlsx'
# The Amap key is a credential: take it from the AMAP_KEY environment variable
# instead of committing it. It is only used when get_dynamic_information is True.
amap_key = os.environ.get('AMAP_KEY', '')
get_dynamic_information = False
# verbose = 0: display nothing; verbose = 1: display title & url; verbose = 2: display title & url and dynamic data cost time.
verbose = 1
total_page = 100


single_thread(city_name, amap_key, city_code, excel_name, total_page = total_page, get_dynamic_information = get_dynamic_information, verbose = verbose)

In [4]:
from pathlib import Path

def single_thread_nearly_all(city_name, amap_key, city_code, excel_name, get_dynamic_information = True, verbose = 1):
    """Crawl every area linked from the city's landing page, one Excel file per area.

    The area slugs are read from the landing page. For each area the number of
    result pages is read from the pager element, every page is scraped with
    `load_pages`, and the result is written to '<area>_<excel_name>'. Areas whose
    output file already exists are skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    city_name : str
        City passed to the geocoder when dynamic information is requested.
    amap_key : str
        Amap API key used when dynamic information is requested.
    city_code : str
        Sub-domain of the city on ke.com (e.g. 'sh').
    excel_name : str
        Suffix of the per-area output file names.
    get_dynamic_information : bool
        Forwarded to `load_pages` as `load_dynamic`.
    verbose : int
        Forwarded to `load_pages`.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    total_index = 1
    base_url = 'https://' + city_code + '.ke.com/ershoufang/'

    # load the landing page
    page = requests.get(base_url, headers = headers)
    soup = BeautifulSoup(page.text, 'html.parser')

    # the area filter block holds one link per area
    house_list = soup.find('div', {'data-role': 'ershoufang'})

    # an href looks like '/ershoufang/<area>/': keep the <area> slug
    area_links = [
        area_link.get('href').split('/')[-2]
        for area_link in house_list.find_all('a', {'class': 'CLICKDATA'})
    ]

    for area_link in area_links:

        current_file = Path(area_link + '_' + excel_name)

        # already exported by a previous run
        if current_file.is_file():
            continue

        # initial house_information and dynamic_data
        for key in house_information.keys():
            house_information[key] = []
        for key in dynamic_data.keys():
            dynamic_data[key] = []

        # get total pages
        area_page = requests.get(base_url + area_link, headers = headers)
        area_soup = BeautifulSoup(area_page.text, 'html.parser')

        total_page_section = area_soup.find('div', {'class':'page-box house-lst-page-box'})
        if total_page_section is None:
            total_page = 0
        else:
            # 'page-data' comes from a downloaded page: parse it as JSON rather
            # than executing it with eval().
            total_page = json.loads(total_page_section.get('page-data'))['totalPage']

        # get each page details
        for i in range(1, total_page + 1):

            print('Area: %s Page: %d Total Record No: %d' % (area_link, i, total_index))
            total_index += 1
            start_time = time.time()
            url = base_url + area_link + '/pg' + str(i)

            load_pages(url, city_name, amap_key, dynamic_data, get_dynamic_information, verbose)
            end_time = time.time()
            print('Page %d takes: %f' % (i, end_time-start_time))

        # convert dictionary into pandas data frame
        houses = pd.DataFrame.from_dict(house_information)
        houses.columns = columns
        houses.to_excel(current_file, index=False)

In [None]:
# Run configuration for the per-area crawl.
city_name = '上海'
city_code = 'sh'
excel_name = city_name + '.xlsx'
# The Amap key is a credential: take it from the AMAP_KEY environment variable
# instead of committing it. It is only used when get_dynamic_information is True.
amap_key = os.environ.get('AMAP_KEY', '')
get_dynamic_information = False
# verbose = 0: display nothing; verbose = 1: display title & url; verbose = 2: display title & url and dynamic data cost time.
verbose = 1


single_thread_nearly_all(city_name, amap_key, city_code, excel_name, get_dynamic_information = get_dynamic_information, verbose = verbose)

Area: qingpu Page: 1 Total Record No: 1
1 满五唯一|无抵押无户口|产权清晰|业主急售 https://sh.ke.com/ershoufang/107102846226.html
2 春江三月公寓二期 3室2厅 南 https://sh.ke.com/ershoufang/107102845185.html
3 春江三月公寓二期 2室1厅 南 https://sh.ke.com/ershoufang/107102845175.html
4 青浦城区，全明户型，精装三房，诚意出售，低总价 https://sh.ke.com/ershoufang/107102844399.html
5 佳乐苑 3室2厅 南 北 https://sh.ke.com/ershoufang/107102841433.html
6 满五唯一，花园洋房，拎包入住带车位，南北通 https://sh.ke.com/ershoufang/107102833209.html
7 景观楼层   业主自住保养好，清爽两房 https://sh.ke.com/ershoufang/107102833205.html
8 高楼层，两室两厅一厨一卫，两房朝南，视野开阔价格低 https://sh.ke.com/ershoufang/107102830505.html
9 桂花园(青浦) 3室1厅 南 https://sh.ke.com/ershoufang/107102829303.html
10 满两年，小区四周依水还绕，位置安静，拎包入住 https://sh.ke.com/ershoufang/107102829132.html
11 店长推荐，必看好房，动静分明，闹中取静 https://sh.ke.com/ershoufang/107102828993.html
12 二房全明户型，一手动迁，随时可签 https://sh.ke.com/ershoufang/107102827631.html
13 近地铁，业主诚心出售，南北户型，视野采好 https://sh.ke.com/ershoufang/107102826451.html
14 精致社区，优美环境，买两房享受美好人生 https://sh.ke.com/ershoufang/107102825093

25 外滩旁 满五唯一 精装修 正气三开间朝南 全明 https://sh.ke.com/ershoufang/107102709915.html
26 地铁口次新房，总价低小面积两房，未住过人随时可看 https://sh.ke.com/ershoufang/107102709843.html
27 西虹桥，品质两居，万科物业，带产权车位 https://sh.ke.com/ershoufang/107102709785.html
28 锦绣逸庭(公寓) 2室1厅 南 北 https://sh.ke.com/ershoufang/107102709631.html
29 春江三月公寓二期 2室2厅 南 https://sh.ke.com/ershoufang/107102708847.html
30 众里寻他千百度，蓦然回首，此房就在灯火阑珊处 https://sh.ke.com/ershoufang/107102708102.html
Page 4 takes: 107.002340
Area: qingpu Page: 5 Total Record No: 5
1 汇金路 地铁  带装修  总价低 https://sh.ke.com/ershoufang/107102707921.html
2 业主诚意出售，小区环境好，居住舒适 https://sh.ke.com/ershoufang/107102704497.html
3 崧泽华城德康雅苑 3室1厅 南 https://sh.ke.com/ershoufang/107102704125.html
4 南北通风 三面采光 北临河 原始动迁税少 https://sh.ke.com/ershoufang/107102701771.html
5 满五唯一|精装修|大四房|中间楼层 https://sh.ke.com/ershoufang/107102699654.html
6 高楼层，采光好，通风好，诚意出售 https://sh.ke.com/ershoufang/107102698949.html
7 御澜湾 3室2厅 南 https://sh.ke.com/ershoufang/107102697983.html
8 国展百老汇在侧，双地铁，通达2、10、17号线 https://sh.ke.com/ersh

18 华骥苑飞机户型三房两卫，户型方正空间大。 https://sh.ke.com/ershoufang/107102559277.html
19 欣沁苑中区中高楼层精装三开间朝南两房 https://sh.ke.com/ershoufang/107102559237.html
20 精装大两房 南北双阳台 通风采光好 中间楼层 近地铁 https://sh.ke.com/ershoufang/107102556964.html
21 逸泰三居室，远眺宝龙，满足您三代同堂的梦想 https://sh.ke.com/ershoufang/107102548283.html
22 精装修，采光好，诚意出售，随时可看，通风好 https://sh.ke.com/ershoufang/107102546926.html
23 春江三月公寓二期 2室1厅 南 https://sh.ke.com/ershoufang/107102539972.html
24 正虹桥丨满五唯一丨电梯精装三房丨动静分离丨采光充足 https://sh.ke.com/ershoufang/107102538697.html
25 玫瑰湾正气小三房，诚售，看房方便，税费少 https://sh.ke.com/ershoufang/107102535344.html
26 中信泰富花园洋房精装修拎包入住 https://sh.ke.com/ershoufang/107102526401.html
27 秀景苑二期中间楼层纯边套，采光充足，视野开阔。 https://sh.ke.com/ershoufang/107102524304.html
28 3房2卫*明亮温馨，自住保养好。 https://sh.ke.com/ershoufang/107102522842.html
29 总价低，近地铁，看房方便，配套全，无遮挡，诚意出售 https://sh.ke.com/ershoufang/107102522796.html
30 一手动迁税费少，双南户型，景观楼层 https://sh.ke.com/ershoufang/107102522356.html
Page 8 takes: 119.885414
Area: qingpu Page: 9 Total Record No: 9
1 一手动迁税费少，

10 随时可看、业主诚心出售、前面无楼栋遮挡 https://sh.ke.com/ershoufang/107102376749.html
11 房东诚心出售简单装修两房 全明正两房！！！ https://sh.ke.com/ershoufang/107102376603.html
12 精装修，中1央空调加地暖，采光好，看房随时 https://sh.ke.com/ershoufang/107102376348.html
13 一手动迁，税费少，交通方便，楼层好 https://sh.ke.com/ershoufang/107102371712.html
14 次新房，蛙城之上，生活配套齐全，诚意出售 https://sh.ke.com/ershoufang/107102370656.html
15 低价边套三房，位置楼层佳，采光通风好 https://sh.ke.com/ershoufang/107102370856.html
16 春江三月公寓二期 2室1厅 南 北 https://sh.ke.com/ershoufang/107102369855.html
17 春江三月公寓二期 2室1厅 南 https://sh.ke.com/ershoufang/107102369850.html
18 采光好、纯西边套、随时可看、业主诚售 https://sh.ke.com/ershoufang/107102368129.html
19 近地铁、出门万达茂、东渡哇城、房东诚意出售 https://sh.ke.com/ershoufang/107102367018.html
20 双桥公寓（一区） 2室2厅 南 https://sh.ke.com/ershoufang/107102366964.html
21 中间楼层 采光充足精装修大三房，地铁700米，近公园 https://sh.ke.com/ershoufang/107102366385.html
22 精装修 拎包入住 全明户型 楼层好 诚意出售 https://sh.ke.com/ershoufang/107102360035.html
23 直视青浦环城水系公园，标准洋房小区 https://sh.ke.com/ershoufang/107102355537.html
24 悦湖郡 2室2厅 南 https

In [None]:
pudong_上海.xlsx
minhang_上海.xlsx
baoshan_上海.xlsx
xuhui_上海.xlsx
putuo_上海.xlsx
yangpu_上海.xlsx
changning_上海.xlsx
songjiang_上海.xlsx
jiading_上海.xlsx
huangpu_上海.xlsx
jingan_上海.xlsx
hongkou_上海.xlsx
qingpu_上海.xlsx
fengxian_上海.xlsx
jinshan_上海.xlsx
chongming_上海.xlsx
shanghaizhoubian_上海.xlsx

### Multi-thread

In [None]:
from multiprocessing import Pool, Value
from itertools import repeat

def f(page_number, city_name, city_code, amap_key, dynamic_data, get_dynamic_information, verbose):
    """Pool worker: scrape one listing page and return only that page's records.

    `load_pages` appends to the process-global `house_information` and returns
    that same dict. A pool worker handles several pages, so the accumulator is
    emptied before each page and a copy is returned; otherwise later pages would
    repeat the earlier ones and all results of a worker would alias one object.

    Returns a dict mapping each field key to the list of values of this page.
    Relies on the module-level shared `counter` for the progress output.
    """
    start_time = time.time()
    url = 'https://' + city_code + '.ke.com/ershoufang/pg' + str(page_number)
    print('URL Start:', url)

    # start from an empty accumulator so the result covers this page only
    for key in house_information.keys():
        house_information[key] = []

    result = load_pages(url, city_name, amap_key, dynamic_data, get_dynamic_information, verbose)
    page_result = {key: list(values) for key, values in result.items()}

    end_time = time.time()
    # '+=' on a shared Value is a read followed by a write; hold its lock so
    # concurrent workers cannot lose an increment.
    with counter.get_lock():
        print('No: %d URL %s takes: %f' % (counter.value, url, end_time - start_time))
        counter.value += 1
    return page_result

def multi_thread(city_name, city_code, excel_name, amap_key, get_dynamic_information = True, start_page = 1, end_page = 100, pool_workers = 8, verbose = 0):
    """Crawl listing pages start_page..end_page with a process pool and export to Excel.

    Each page is handled by the worker function `f`; the per-page dicts it returns
    are concatenated field by field and written to `excel_name` with the headers
    from `columns`. When dynamic information is not requested, the (empty)
    location-derived columns are dropped from the output.

    Parameters
    ----------
    city_name, city_code, amap_key, get_dynamic_information, verbose
        Forwarded to `f` for every page.
    excel_name : str
        Output file.
    start_page, end_page : int
        Inclusive range of listing pages.
    pool_workers : int
        Number of worker processes.
    """
    # initial house_information and dynamic_data
    for key in house_information.keys():
        house_information[key] = []
    for key in dynamic_data.keys():
        dynamic_data[key] = []

    # multiprocessing
    start_time = time.time()
    parameters = zip(
        range(start_page, end_page+1),
        repeat(city_name),
        repeat(city_code),
        repeat(amap_key),
        repeat(dynamic_data),
        repeat(get_dynamic_information),
        repeat(verbose))

    # The context manager shuts the worker processes down once the (blocking)
    # starmap call has returned, instead of leaving them alive in the kernel.
    with Pool(pool_workers) as pool:
        pool_result = pool.starmap(f, parameters)
    end_time = time.time()
    print('Total Time:', end_time-start_time)

    # concatenate the per-page results field by field
    merged = {key: [] for key in house_information.keys()}
    for case in pool_result:
        for key in case.keys():
            merged[key].extend(case[key])

    # convert dictionary into pandas data frame
    houses = pd.DataFrame.from_dict(merged)
    houses.columns = columns

    # remove empty dynamic columns
    if not get_dynamic_information:
        houses = houses.drop(columns=dynamic_data_cn)

    # save result back to file
    houses.to_excel(excel_name, index=False)

In [None]:
# Run configuration for the multi-process crawl.
start_page = 1
end_page = 100
pool_workers = 25
city_name = '上海'
city_code = 'sh'
excel_name = 'shanghai.xlsx'
# The Amap key is a credential: take it from the AMAP_KEY environment variable
# instead of committing it. It is only used when get_dynamic_information is True.
amap_key = os.environ.get('AMAP_KEY', '')
get_dynamic_information = False
# verbose = 0: display nothing; verbose = 1: display title & url; verbose = 2: display title & url and dynamic data cost time.
verbose = 1
# output counter shared with the worker processes
counter = Value('i', start_page)

multi_thread(city_name, city_code, excel_name, amap_key, pool_workers = pool_workers, get_dynamic_information = get_dynamic_information, start_page = start_page, end_page = end_page, verbose = verbose)


### Single Thread Dynamic Field Imputation

In [None]:
def load_source(excel_name, dynamic_data_cn = dynamic_data_cn):
    """Load a previously exported sheet and relabel its columns with the English keys.

    Location-derived columns that are absent from the file are added as empty
    (None) columns; columns that are already present keep their data. The frame
    is then put into the order of `columns` and its headers are replaced by the
    keys of `house_information`.

    Parameters
    ----------
    excel_name : str
        Path of the Excel file to read.
    dynamic_data_cn : list of str
        Chinese headers of the location-derived columns.

    Returns
    -------
    pandas.DataFrame
        One column per key of `house_information`, in that order.

    Raises
    ------
    KeyError
        If a header listed in `columns` other than the location-derived ones is
        missing from the file.
    """
    source = pd.read_excel(excel_name)

    # Add only the columns that are really missing, so values filled in by an
    # earlier imputation run are never wiped out.
    for key in dynamic_data_cn:
        if key not in source.columns:
            source[key] = None

    # Enforce the canonical order (listing link last) before the positional rename.
    source = source[columns]

    # Change to English columns
    source.columns = list(house_information)

    return source


In [None]:
# The free quota is limited per account and per day (about 50 pages); when a run
# is cut off, set `start` to the position where it stopped and run again.
start = 444
end = 475
city_name = '深圳'
# The Amap key is a credential: read it from the AMAP_KEY environment variable
# instead of committing it to the notebook.
amap_key = os.environ['AMAP_KEY']
excel_name = city_name + '.xlsx'

source = load_source(excel_name)
for index, row in source.iloc[start: end].iterrows():

    start_time = time.time()

    # Alternative, more specific query:
    #     address = row['administrative_region'] + ' ' + row['community_name'] + ' ' + row['area_name']
    address = row['community_name']
    print(address)
    # Use a dedicated name so the module-level `dynamic_data` template is not replaced.
    house_dynamic_data = load_amap_dynamic_data(address, city_name, amap_key)

    print('No. %d %s' % (index, row['house_link']), end= ' ')
    for key in house_dynamic_data.keys():
        source.at[index, key] = house_dynamic_data[key]
    end_time = time.time()
    print('takes: %f' % (end_time - start_time))

In [None]:
# Show every column when a frame is displayed.
pd.options.display.max_columns = None

# Put the Chinese headers back, save, and read the file again so the rows shown
# below are what was actually written.
source.columns = columns
source.to_excel(excel_name, index=False)
source = pd.read_excel(excel_name)
source.iloc[380:400]

### Multi Thread Dynamic Field Imputation

In [None]:
def load_source(excel_name, dynamic_data_cn = dynamic_data_cn):
    """Load a previously exported sheet and relabel its columns with the English keys.

    Location-derived columns that are absent from the file are added as empty
    (None) columns; columns that are already present keep their data. The frame
    is then put into the order of `columns` and its headers are replaced by the
    keys of `house_information`.

    Parameters
    ----------
    excel_name : str
        Path of the Excel file to read.
    dynamic_data_cn : list of str
        Chinese headers of the location-derived columns.

    Returns
    -------
    pandas.DataFrame
        One column per key of `house_information`, in that order.

    Raises
    ------
    KeyError
        If a header listed in `columns` other than the location-derived ones is
        missing from the file.
    """
    source = pd.read_excel(excel_name)

    # Add only the columns that are really missing, so values filled in by an
    # earlier imputation run are never wiped out.
    for key in dynamic_data_cn:
        if key not in source.columns:
            source[key] = None

    # Enforce the canonical order (listing link last) before the positional rename.
    source = source[columns]

    # Change to English columns
    source.columns = list(house_information)

    return source

In [None]:
from multiprocessing import Pool, Value
from itertools import repeat

# Rows [start, end) of the sheet are geocoded in this run.
start = 0
end = start + 50
pool_workers = 20
city_name = '北京'
excel_name = 'beijing.xlsx'
# output counter shared with the worker processes
counter = Value('i', start)
# The Amap key is a credential: read it from the AMAP_KEY environment variable
# instead of committing it to the notebook.
amap_key = os.environ['AMAP_KEY']
source = load_source(excel_name)

# Extract address
address_list = []
for index, row in source.iloc[start: end].iterrows():
    address = row['administrative_region'] + ' ' + row['community_name']  + ' ' + row['area_name']
    address_list.append(address)

def f(address, city_name, amap_key):
    """Pool worker: look up the location-derived fields of one address.

    Returns a dict of its own for every call, so the results a worker produces
    for different addresses never share one object. Relies on the module-level
    `counter` and `end` for the progress output.
    """
    start_time = time.time()
    # dict(...) snapshots the values in case the lookup hands back a shared dict
    dynamic_data = dict(load_amap_dynamic_data(address, city_name, amap_key))
    end_time = time.time()
    # '+=' on a shared Value is a read followed by a write; hold its lock so
    # concurrent workers cannot lose an increment.
    with counter.get_lock():
        print('%d / %d %s takes %f' % (counter.value, end, address, end_time - start_time))
        counter.value += 1
    return dynamic_data

# multiprocessing
start_time = time.time()
parameters = zip(
    address_list,
    repeat(city_name),
    repeat(amap_key))

# The context manager shuts the worker processes down once the (blocking)
# starmap call has returned.
with Pool(pool_workers) as pool:
    pool_result = pool.starmap(f, parameters)
end_time = time.time()
print('Total Time:', end_time-start_time)

# assign pool result back to data frame: pair every result with the row it was
# computed for, which also works when the sheet has fewer than `end` rows
for index, result in zip(source.index[start: end], pool_result):
    for key in result.keys():
        source.at[index, key] = result[key]

# save result to file
source.columns = columns
source.to_excel(excel_name, index=False)

In [None]:
# Display the rows processed in this run plus the ten rows after them,
# without truncating columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
source.iloc[start:end + 10]

In [None]:
# Display the whole frame without truncating columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

source

### 地铁站坐标获取

In [None]:
import requests
import pandas as pd

import os

# Collect name and coordinates of the POIs returned for the keyword '地铁站'
# (subway station) in one city and save them as CSV.
offset = 50     # results per page
init = 1        # first page number
city_name = '上海'
# The Amap key is a credential: read it from the AMAP_KEY environment variable
# instead of committing it to the notebook.
amap_key = os.environ['AMAP_KEY']
place_text_url = 'https://restapi.amap.com/v3/place/text'

subways = {
    'name': [],
    'location': []
}

# Query parameters shared by all pages; requests builds and encodes the URL.
query = {
    'key': amap_key,
    'keywords': '地铁站',
    'city': city_name,
    'offset': offset,
    'extensions': 'all',
}

# The first request is only used to learn how many results (and pages) exist.
field_page = requests.get(place_text_url, params=dict(query, page=init))
result_json = field_page.json()
count = int(result_json['count']) // offset + init


for page in range(init, count + 1):

    # Print the page number only: the request URL contains the API key.
    print(page)
    field_page = requests.get(place_text_url, params=dict(query, page=page))
    result_json = field_page.json()
    for poi in result_json['pois']:
        subways['name'].append(poi['name'])
        subways['location'].append(poi['location'])


subway = pd.DataFrame.from_dict(subways)
# Strip the literal '(地铁站)' suffix. regex=False makes this a plain substring
# replacement on every pandas version, with no regex escaping involved.
subway['name'] = subway['name'].str.replace('(地铁站)', '', regex=False)
subway.to_csv(city_name + 'subway.csv', index=False)