In [11]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time


In [2]:
# HTTP headers passed to requests.get(); only a desktop-browser user-agent is set.
headers = {
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/78.0.3904.108 Safari/537.36"
    ),
}

# Column order used when the scraped rows are turned into a DataFrame.
# Every name here is a key of the dict built by get_info().
columns_index = [
    "标题", "总价", "小区名",
    "区", "街", "门牌号",
    "建造年代", "产权性质", "产权年限",
    "卧室数", "客厅数", "卫生间数",
    "建筑面积", "房屋朝向",
    "楼层高低", "总楼层",
    "配套电梯", "唯一住房",
    "房屋单价（每平方）", "参考首付",
    "装修程度", "房本年限", "是否一手",
]

In [3]:
def get_link(url, timeout=10):
    """Scrape every listing referenced on one result page.

    The page at ``url`` is downloaded and parsed, the listing elements are
    located with a CSS selector, and ``get_info`` is called once per element
    with that element's ``href`` attribute.

    Parameters
    ----------
    url : str
        Address of the listing-index page to download.
    timeout : float, optional
        Seconds ``requests`` waits for the server before raising
        ``requests.exceptions.Timeout``. Without a timeout a stalled
        connection would block the notebook indefinitely. This value only
        applies to the index page; ``get_info`` is called with its own
        defaults.

    Returns
    -------
    list of dict
        One row dict (as returned by ``get_info``) per matched element, in
        page order. Empty when the selector matches nothing.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")  # 'lxml'

    # NOTE(review): the selector ends in `span`, yet `href` is normally an
    # attribute of the enclosing `a` -- confirm against the live page markup.
    # If the attribute is absent, `.get("href")` yields None and the request
    # inside get_info() will fail.
    listing_nodes = soup.select(
        "body > div.contain-mod > div.g-module > div.m-main > ul > li > a > div.details > p.title > span")

    return [get_info(node.get("href")) for node in listing_nodes]


def get_info(href, timeout=10):
    """Download one listing detail page and extract its fields.

    Parameters
    ----------
    href : str
        Address of the listing detail page.
    timeout : float, optional
        Seconds ``requests`` waits for the server before raising
        ``requests.exceptions.Timeout``; prevents a stalled connection from
        hanging the crawl.

    Returns
    -------
    dict
        A row keyed by the names listed in ``columns_index``. Values come
        from the ``get_*`` helpers, called in the order the keys appear
        below.

    Raises
    ------
    IndexError
        Propagated from the helpers when an expected element is missing, or
        raised here when the location / unit-type / floor lists are shorter
        than the number of fields read from them.
    """
    response = requests.get(href, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    row = {}
    row["标题"] = get_title(soup)
    row["总价"] = get_price(soup)
    row["小区名"] = get_community(soup)

    # get_location() returns the address levels; the first three are kept.
    location = get_location(soup)
    row["区"] = location[0]
    row["街"] = location[1]
    row["门牌号"] = location[2]

    row["建造年代"] = get_building_year(soup)
    row["产权性质"] = get_permission_type(soup)
    row["产权年限"] = get_permission_duration(soup)

    # get_unit_type() returns the integers found in the layout text,
    # read here as bedrooms, living rooms and bathrooms.
    unit_type = get_unit_type(soup)
    row["卧室数"] = unit_type[0]
    row["客厅数"] = unit_type[1]
    row["卫生间数"] = unit_type[2]

    row["建筑面积"] = get_area(soup)
    row["房屋朝向"] = get_towards(soup)

    # get_floor_info() returns [floor level label, total floor count].
    floor = get_floor_info(soup)
    row["楼层高低"] = floor[0]
    row["总楼层"] = floor[1]

    row["配套电梯"] = get_elevator(soup)
    row["唯一住房"] = get_sole_house(soup)
    row["房屋单价（每平方）"] = get_unit_price(soup)
    row["参考首付"] = get_down_payment(soup)
    row["装修程度"] = get_decoration(soup)
    row["房本年限"] = get_permission_owning_time(soup)
    row["是否一手"] = get_first_hand(soup)

    return row


def get_title(soup):
    """Return the stripped text of the page's title ``h3`` element.

    Raises IndexError when the selector matches nothing.
    """
    heading = soup.select("#content > div.clearfix.title-guarantee > h3")[0]
    return heading.get_text().strip()


def get_price(soup):
    """Return the total price shown in the basic-info header as an int.

    Raises IndexError when the price element is missing and ValueError when
    its text is not an integer literal.
    """
    price_nodes = soup.select(
        "#content > div.wrapper > div.wrapper-lf > div.clearfix > div.basic-info.clearfix > span.light.info-tag > em")
    price_text = price_nodes[0].get_text().strip()
    return int(price_text)


def get_community(soup):
    """Return the stripped link text of the 1st house-info item (community name).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(1) > div.houseInfo-content > a"
    )
    anchor = soup.select(selector)[0]
    return anchor.get_text().strip()


def get_location(soup):
    """Return the address levels from the 4th house-info item.

    The element text has every whitespace run collapsed to one space, is
    split on the full-width dash "－", and each piece is stripped.

    Returns
    -------
    list of str
        The address levels in page order. ``get_info`` reads the first three
        as district, street and house number.

    Raises
    ------
    IndexError
        When the selector matches nothing.
    """
    raw_text = soup.select(
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div > div.houseInfo-wrap > ul > li:nth-child(4) > div.houseInfo-content > p")[0].get_text()
    # Raw string: a plain '\s' is an invalid escape sequence and triggers a
    # SyntaxWarning on recent Python versions.
    collapsed = re.sub(r'\s+', ' ', raw_text)
    return [level.strip() for level in collapsed.split("－")]


def get_building_year(soup):
    """Return the first number in the 7th house-info item as an int (build year).

    Raises IndexError when the element is missing or contains no digits.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(7) > div.houseInfo-content"
    )
    year_text = soup.select(selector)[0].get_text().strip()
    numbers = re.findall(r"\d+\.?\d*", year_text)
    return int(numbers[0])


def get_permission_type(soup):
    """Return the stripped text of the 10th house-info item (property-right type).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(10) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()


def get_permission_duration(soup):
    """Return the 13th house-info item as an int after removing every '年'.

    Raises IndexError when the element is missing and ValueError when the
    remaining text is not an integer literal.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(13) > div.houseInfo-content"
    )
    duration_text = soup.select(selector)[0].get_text().strip()
    return int(duration_text.replace('年', ''))


def get_unit_type(soup):
    """Return every number in the 2nd house-info item as a list of ints.

    The caller reads the result as [room, living_room, toilet].

    Raises IndexError when the element is missing and ValueError when a
    matched number contains a decimal point.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(2) > div.houseInfo-content"
    )
    layout_text = soup.select(selector)[0].get_text().strip()
    return list(map(int, re.findall(r"\d+\.?\d*", layout_text)))


def get_area(soup):
    """Return the first number in the 5th house-info item as a float (floor area).

    Raises IndexError when the element is missing or contains no digits.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(5) > div.houseInfo-content"
    )
    area_text = soup.select(selector)[0].get_text().strip()
    first_number = re.findall(r"\d+\.?\d*", area_text)[0]
    return float(first_number)


def get_towards(soup):
    """Return the stripped text of the 8th house-info item (orientation).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(8) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()


def get_floor_info(soup):
    """Return ``[floor_type, total_floor]`` parsed from the 11th house-info item.

    ``total_floor`` is the first number in the text converted to int;
    ``floor_type`` is whatever precedes the first "层" (the whole text when
    that character is absent).

    Raises IndexError when the element is missing or contains no digits.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(11) > div.houseInfo-content"
    )
    floor_text = soup.select(selector)[0].get_text().strip()
    storey_count = int(re.findall(r"\d+\.?\d*", floor_text)[0])
    level_label, _, _ = floor_text.partition("层")
    return [level_label, storey_count]


def get_elevator(soup):
    """Return the stripped text of the 14th house-info item (elevator field).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(14) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()


def get_sole_house(soup):
    """Return the stripped text of the 17th house-info item (sole-residence field).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(17) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()


def get_unit_price(soup):
    """Return the first number in the 3rd house-info item as a float (unit price).

    Raises IndexError when the element is missing or contains no digits.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(3) > div.houseInfo-content"
    )
    price_text = soup.select(selector)[0].get_text().strip()
    first_number = re.findall(r"\d+\.?\d*", price_text)[0]
    return float(first_number)


def get_down_payment(soup):
    """Return the first number in the 6th house-info item as a float (down payment).

    Raises IndexError when the element is missing or contains no digits.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(6) > div.houseInfo-content"
    )
    payment_text = soup.select(selector)[0].get_text().strip()
    first_number = re.findall(r"\d+\.?\d*", payment_text)[0]
    return float(first_number)


def get_decoration(soup):
    """Return the stripped text of the 12th house-info item (decoration level).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(12) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()


def get_permission_owning_time(soup):
    """Return the stripped text of the 15th house-info item (title-deed age).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(15) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()


def get_first_hand(soup):
    """Return the stripped text of the 18th house-info item (first-hand flag).

    Raises IndexError when the selector matches nothing.
    """
    selector = (
        "#content > div.wrapper > div.wrapper-lf > div.houseInfoBox > div"
        " > div.houseInfo-wrap > ul > li:nth-child(18) > div.houseInfo-content"
    )
    cell = soup.select(selector)[0]
    return cell.get_text().strip()

In [4]:
# Smoke test: scrape the first result page and display the parsed rows.
get_link(url="https://shanghai.anjuke.com/community/props/sale/735/p1/#filtersort")

[{'标题': '明星品质小区,低于同户型100万的新出神房,业主降价急售的',
  '总价': 1121,
  '小区名': '上海苏堤春晓名苑',
  '区': '普陀',
  '街': '长寿路',
  '门牌号': '长寿路800弄',
  '建造年代': 2006,
  '产权性质': '普通住宅',
  '产权年限': 70,
  '卧室数': 3,
  '客厅数': 2,
  '卫生间数': 2,
  '建筑面积': 178.7,
  '房屋朝向': '南北',
  '楼层高低': '高',
  '总楼层': 18,
  '配套电梯': '有',
  '唯一住房': '是',
  '房屋单价（每平方）': 62748.0,
  '参考首付': 336.3,
  '装修程度': '精装修',
  '房本年限': '满五年',
  '是否一手': '否'},
 {'标题': '苏堤春晓 品质小区 高区全南的大三房 业主诚售 税费各付',
  '总价': 925,
  '小区名': '上海苏堤春晓名苑',
  '区': '普陀',
  '街': '长寿路',
  '门牌号': '长寿路800弄',
  '建造年代': 2006,
  '产权性质': '普通住宅',
  '产权年限': 70,
  '卧室数': 3,
  '客厅数': 2,
  '卫生间数': 2,
  '建筑面积': 146.0,
  '房屋朝向': '南北',
  '楼层高低': '中',
  '总楼层': 18,
  '配套电梯': '有',
  '唯一住房': '是',
  '房屋单价（每平方）': 63374.0,
  '参考首付': 277.5,
  '装修程度': '精装修',
  '房本年限': '满五年',
  '是否一手': '否'},
 {'标题': '苏堤春晓 繁华地段交通配套成熟 家乐福地铁站0距离 生活方便',
  '总价': 888,
  '小区名': '上海苏堤春晓名苑',
  '区': '普陀',
  '街': '长寿路',
  '门牌号': '长寿路800弄',
  '建造年代': 2003,
  '产权性质': '普通住宅',
  '产权年限': 70,
  '卧室数': 2,
  '客厅数': 2,
  '卫生间数': 1,
  '建筑面积': 104.

In [5]:
# Result pages for one community (Shanghai Sudi Chunxiao Mingyuan);
# `{}` is replaced by the page number.
urls = "https://shanghai.anjuke.com/community/props/sale/735/p{}/#filtersort"
# The total number of pages is not detected, so crawl the first 10 for now.

rows_list = []
for i in range(1, 11):
    url = urls.format(i)
    rows_list += get_link(url)
    print(i)  # progress indicator: page just finished
    # Pause between pages. The original passed the undefined name `l`
    # (letter ell), which raises NameError on a fresh kernel; one second is
    # the presumed intent.
    time.sleep(1)

1
2
3
4
5
6
7
8
9
10


Unnamed: 0,标题,总价,小区名,区,街,门牌号,建造年代,产权性质,产权年限,卧室数,...,房屋朝向,楼层高低,总楼层,配套电梯,唯一住房,房屋单价（每平方）,参考首付,装修程度,房本年限,是否一手
0,"明星品质小区,低于同户型100万的新出神房,业主降价急售的",1121,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2006,普通住宅,70,3,...,南北,高,18,有,是,62748.0,336.3,精装修,满五年,否
1,苏堤春晓 繁华地段交通配套成熟 家乐福地铁站0距离 生活方便,888,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2003,普通住宅,70,2,...,南,低,32,有,是,84709.0,267.0,精装修,满五年,否
2,苏堤950万的三房，超 级稀 缺的房源，豪华装修，诚售,950,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2006,普通住宅,70,3,...,南,中,23,有,是,69343.0,285.0,精装修,满五年,否
3,苏堤春晓，明星小区，人车分流，带100平空中花园，带产权车位,1200,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2005,公寓,70,3,...,南,中,32,有,是,92088.0,360.0,精装修,满五年,否
4,1250万买顶楼复式房，豪华装修，带露台，接受10个月置换！,1250,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2005,公寓,70,4,...,南,高,18,有,是,65907.0,375.0,精装修,满五年,否
5,急售！！急售！！低于市场价100万急售！！！产权车位 税各付,880,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2005,普通住宅,70,3,...,南北,高,23,有,是,60303.0,264.0,豪华装修,满五年,否
6,空中私家花园送120平大露台，苏堤春晓人车分流，中区景观房,1238,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2006,普通住宅,70,3,...,南北,中,33,有,是,95019.0,371.4,精装修,满五年,否
7,品质小区 人车分流 全南户型 带连体双阳台 采光充足 地铁口,805,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2005,普通住宅,70,2,...,南,低,32,有,是,77070.0,241.5,精装修,满五年,否
8,房东税费自己承担，南北通中高区三房，错过这个价格真的没有了,702,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2006,普通住宅,70,3,...,南北,高,31,有,是,60000.0,210.6,精装修,满五年,否
9,急售，低于市场价，联体朝南双阳台，人车分流，地铁口拎包即住,850,上海苏堤春晓名苑,普陀,长寿路,长寿路800弄,2005,公寓,70,2,...,南,中,26,有,是,83333.0,255.0,精装修,满五年,否


In [10]:
# Build the table in the column order given by columns_index and write it
# to output.xlsx in the working directory.
pd.DataFrame(
    data=rows_list,
    columns=columns_index,
).to_excel(excel_writer="output.xlsx")