# 链家房源爬取

In [1]:
import time
import csv
import requests
from lxml import etree

# Constants
# User-Agent header value that get_data installs on its HTTP session.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
# URL prefix of the listing pages; get_data appends the 1-based page number.
BASE_URL = "https://bj.lianjia.com/zufang/chaoyang/pg"
# Output path that the script entry point passes to save_data.
CSV_FILE = "朝阳区租房信息.csv"


def _parse_listing(item):
    """Extract the fields of one listing from its card element.

    Args:
        item: lxml element matched by
            ``//div[@class="content__list--item--main"]``.

    Returns:
        A list ``[link, street, community, size, price, orientation,
        layout]``. The order matches the CSV header written by
        ``save_data``. Any field that cannot be located is the placeholder
        string "未知".
    """
    hrefs = item.xpath('.//p[@class="content__list--item--title"]/a/@href')
    link = 'https://bj.lianjia.com' + hrefs[0] if hrefs else "未知"

    # The second link text in the description line is used as the street.
    des_links = item.xpath('.//p[@class="content__list--item--des"]/a/text()')
    street = des_links[1] if len(des_links) > 1 else "未知"

    # The first @title in the description line is the community (小区) name.
    titles = item.xpath('.//p[@class="content__list--item--des"]/a/@title')
    community = titles[0] if titles else "未知"

    prices = item.xpath('.//span[@class="content__list--item-price"]/em/text()')
    price = prices[0] if prices else "未知"

    # Loose text nodes of the description line carry size, facing and layout.
    # NOTE(review): text() nodes normally contain no markup, so stripping
    # '<i>/</i>' is presumably a no-op; kept to preserve existing output.
    fragments = item.xpath('.//p[@class="content__list--item--des"]/text()')
    size = next(
        (s.replace('<i>/</i>', '').replace('㎡', '').strip()
         for s in fragments if '㎡' in s),
        "未知",
    )
    # Room layout, e.g. "2室1厅1卫".
    layout = next(
        (s.replace('<i>/</i>', '').strip()
         for s in fragments if '室' in s and '厅' in s),
        "未知",
    )
    # Compass facing, e.g. "南" or "南北".
    orientation = next(
        (s.replace('<i>/</i>', '').replace(' ', '').strip()
         for s in fragments if any(ch in s for ch in '南北东西')),
        "未知",
    )

    return [link, street, community, size, price, orientation, layout]


def get_data(base_url, page_num):
    """Scrape listing pages 1..page_num and return the complete rows.

    Args:
        base_url: URL prefix; the 1-based page number is appended to it.
        page_num: Number of pages to fetch, starting from page 1.

    Returns:
        A list of rows, each
        ``[link, street, community, size, price, orientation, layout]``.
        Rows in which any field is the placeholder "未知" are dropped.
        Pages whose download fails are skipped.
    """
    data_list = []
    # The context manager releases the session's pooled connections on exit,
    # including when an unexpected error propagates out of the loop.
    with requests.Session() as session:
        session.headers.update({"User-Agent": USER_AGENT})
        for page in range(1, page_num + 1):
            print(f"准备爬取第 {page} 页......")
            count = 0
            html = ask_url(base_url + str(page), session)
            if not html:
                print(f"跳过第 {page} 页，无法获取页面内容。")
                continue
            tree = etree.HTML(html)
            items = tree.xpath('//div[@class="content__list--item--main"]')

            for item in items:
                # Best-effort per listing: one malformed card must not abort
                # the rest of the page, so log the error and move on.
                try:
                    data = _parse_listing(item)
                except Exception as e:
                    print(f"处理房源信息时出错: {e}")
                    continue

                if "未知" not in data:
                    print(data)
                    data_list.append(data)
                    count += 1

            print(f"本页共爬取到 {count} 条有效数据。")
            print("---------------------------")

    return data_list


def ask_url(url, session, delay=10):
    """Download *url* through *session* and return the body as text.

    Args:
        url: Address of the page to fetch.
        session: Object exposing ``get(url, timeout=...)``; ``get_data``
            passes a ``requests.Session``.
        delay: Seconds to pause after a successful download before
            returning. Defaults to 10, the value that used to be
            hard-coded. Values <= 0 skip the pause.

    Returns:
        The response text, or "" when the request raises a
        ``requests`` exception (including a 4xx/5xx status reported by
        ``raise_for_status``).
    """
    print("正在获取页面内容......")
    try:
        response = session.get(url, timeout=10)
        # Turns 4xx/5xx responses into an HTTPError handled below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求失败: {e}")
        return ""
    else:
        html = response.text
        # Throttle between successful page loads; failures return at once,
        # as before.
        if delay > 0:
            time.sleep(delay)
        return html


def save_data(data_list, save_path):
    """Write the rows to a CSV file, keeping one row per URL.

    Args:
        data_list: Rows whose first element is the listing URL. When a URL
            occurs more than once, only its first row is kept.
        save_path: Destination file. It is overwritten and encoded as
            UTF-8 with a BOM.
    """
    header = ["URL", "街道", "小区", "面积", "价格", "朝向", "户型"]

    # dicts preserve insertion order, so setdefault keeps the first row seen
    # for each URL and the original ordering of those rows.
    first_by_url = {}
    for row in data_list:
        first_by_url.setdefault(row[0], row)
    rows = list(first_by_url.values())

    print(f"去重后还有 {len(rows)} 条数据。")
    print("正在保存到 CSV 文件中......")
    with open(save_path, "w", encoding='utf-8-sig', newline='') as out:
        csv.writer(out).writerows([header, *rows])
    print("数据已成功保存到 CSV 文件。")


if __name__ == '__main__':
    # Scrape the first 5 listing pages, report how many complete rows were
    # collected, then let save_data deduplicate them by URL and write the CSV.
    datalist = get_data(BASE_URL, 5)
    print(f"共爬取到 {len(datalist)} 条有效数据。")
    save_data(datalist, CSV_FILE)


准备爬取第 1 页......
正在获取页面内容......
['https://bj.lianjia.com/zufang/BJ2017854914570485760.html', '西坝河', '柳芳北街9号院', '65.46', '7000', '南', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2017856979782860800.html', '劲松', '广和东里', '50.12', '4800', '南', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2017470177750286336.html', '四惠', '通惠家园', '74.93', '6200', '南', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2017836510342021120.html', '常营', '筑福家园', '77.43', '5900', '南', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2015340131656925184.html', '高碑店', '南太平庄北巷', '64.36', '3500', '南北', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2017145847808000000.html', '国展', '左家庄东里', '63.09', '6200', '南', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ1840010743072686080.html', '朝青', '晴悦家园', '80.48', '6800', '南', '2室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2011289186778218496.html', '工体', '杰座大厦', '71.56', '9000', '东', '1室1厅1卫']
['https://bj.lianjia.com/zufang/BJ2017169778463997952.html', '望京', '融科橄榄城一期', '153.91', '19000', '南北', '