## 中国历年主要城市房价数据分析

In [40]:
import requests
from lxml import etree
import pandas as pd
import re

## 获取房价数据

In [14]:
def get_year_price(year, timeout=10):
    """Scrape the Anjuke national house-price listing page for one year.

    Parameters
    ----------
    year : int or str
        Year interpolated into the listing URL.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``title`` and ``price``, holding the text nodes matched
        by the two XPath queries below. The frame is empty when the page
        contains no matching nodes.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so a failed request is
        reported instead of silently yielding an empty frame.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    url = f'https://www.anjuke.com/fangjia/quanguo{year}/'

    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status rather than parsing an error page.
    response.raise_for_status()

    tree = etree.HTML(response.text)

    # Both queries target the first "fjlist-box boxstyle1" container:
    # <b> holds the entry title, <span> holds the price text.
    title_ls = tree.xpath('//div[@class="fjlist-box boxstyle1"][1]//ul//li//a/b/text()')
    price_ls = tree.xpath('//div[@class="fjlist-box boxstyle1"][1]//ul//li//a/span/text()')

    return pd.DataFrame({'title': title_ls, 'price': price_ls})

In [15]:
# Sample year, used by the next cell to preview the URL built inside get_year_price.
year = 2020

In [19]:
f'https://www.anjuke.com/fangjia/quanguo{year}/'

'https://www.anjuke.com/fangjia/quanguo2020/'

In [16]:
# Fetch the 2020 listing; performs a live HTTP request to anjuke.com.
df_2020 = get_year_price(2020)

In [18]:
df_2020

Unnamed: 0,title,price


抓取安居客时 IP 被封，因此上面的结果为空。只能改为手动从 58 同城整理房价数据：虽然数值不够准确，但足以观察整体趋势。

## 58同城房价数据

In [29]:
# The workbook is read one sheet per year, 2010 through 2020; sheet names are the year as text.
year_ls = [*range(2010, 2021)]
sheet_name = list(map(str, year_ls))

# A list of sheet names makes read_excel return a dict: sheet name -> DataFrame.
df = pd.read_excel('house_price_history.xlsx', sheet_name=sheet_name)

In [37]:
# Merge the per-year sheets into one long frame.
ls = []

for sheet in sheet_name:
    # Work on a copy: renaming/adding columns directly on the frame stored in
    # `df` would leave it with two columns, so re-running this cell would fail
    # on the single-name column assignment below.
    tmp = df[sheet].copy()
    tmp.columns = ['info']     # each sheet is expected to hold exactly one column
    tmp['year'] = int(sheet)   # the sheet name is the year
    ls.append(tmp)

# Row labels of each sheet are kept as-is, so the index repeats across years.
df1 = pd.concat(ls,axis=0)

In [51]:
# Extract city and price from the raw text, e.g. "北京房价24535元/㎡0.81% ↑".
# Raw-string patterns avoid the invalid "\d" escape of a plain string literal.

# City: everything before "房价".
df1['city'] = df1['info'].str.extract(r'(.*)房价', expand=False)

# Price: the first run of digits immediately followed by "元", as an integer.
df1['price'] = df1['info'].str.extract(r'(\d+)元', expand=False).astype(int)

In [52]:
df1

Unnamed: 0,info,year,city,price
0,北京房价24535元/㎡0.81% ↑,2010,北京,24535
1,上海房价22311元/㎡1.68% ↑,2010,上海,22311
2,杭州房价21683元/㎡0.66% ↑,2010,杭州,21683
3,深圳房价16511元/㎡3.33% ↑,2010,深圳,16511
4,青岛房价15525元/㎡8.89% ↑,2010,青岛,15525
...,...,...,...,...
95,长治房价8953元/㎡0.42% ↑,2020,长治,8953
96,德州房价8865元/㎡0.30% ↓,2020,德州,8865
97,贵阳房价8864元/㎡0.31% ↓,2020,贵阳,8864
98,连云港房价8851元/㎡1.02% ↑,2020,连云港,8851


In [53]:
# Persist the cleaned 58.com data; index=False leaves the row index out of the CSV.
df1.to_csv('clean_house_price.csv',index=False)

## 安居客房价数据

In [77]:
# The workbook is read one sheet per year, 2012 through 2021; sheet names are the year as text.
year_ls = [*range(2012, 2022)]
sheet_name = list(map(str, year_ls))

# A list of sheet names makes read_excel return a dict: sheet name -> DataFrame.
df = pd.read_excel('house_price_ajk.xlsx', sheet_name=sheet_name)

In [78]:
# Merge the per-year sheets into one long frame.
ls = []

for sheet in sheet_name:
    # Work on a copy: renaming/adding columns directly on the frame stored in
    # `df` would leave it with two columns, so re-running this cell would fail
    # on the single-name column assignment below.
    tmp = df[sheet].copy()
    tmp.columns = ['info']     # each sheet is expected to hold exactly one column
    tmp['year'] = int(sheet)   # the sheet name is the year
    ls.append(tmp)

# Row labels of each sheet are kept as-is, so the index repeats across years.
df_price = pd.concat(ls,axis=0)

In [79]:
# Extract city and price from the raw text, e.g. "2012年北京房价30158元/㎡2.15%↑".
# Raw-string patterns avoid the invalid "\d" escape of a plain string literal.

# City: the text between the first "年" and "房价".
df_price['city'] = df_price['info'].str.extract(r'年(.*)房价', expand=False)

# Price: the first run of digits immediately followed by "元", as an integer.
df_price['price'] = df_price['info'].str.extract(r'(\d+)元', expand=False).astype(int)

In [80]:
df_price

Unnamed: 0,info,year,city,price
0,2012年北京房价30158元/㎡2.15%↑,2012,北京,30158
1,2012年上海房价23428元/㎡1.4%↑,2012,上海,23428
2,2012年深圳房价19298元/㎡0.8%↑,2012,深圳,19298
3,2012年杭州房价18713元/㎡1.77%↑,2012,杭州,18713
4,2012年三亚房价18187元/㎡6.48%↑,2012,三亚,18187
...,...,...,...,...
45,2021年宁德房价12628元/㎡0.34%↑,2021,宁德,12628
46,2021年泉州房价12527元/㎡0.11%↑,2021,泉州,12527
47,2021年莆田房价12513元/㎡0.03%↑,2021,莆田,12513
48,2021年拉萨房价12492元/㎡0.01%↑,2021,拉萨,12492


In [62]:
# Persist the cleaned Anjuke data; index=False leaves the row index out of the CSV.
df_price.to_csv('clean_house_price_ajk.csv',index=False)

## 收入数据处理

In [81]:
# Load the per-city income table (one row per city, one column per year — see the preview below).
# NOTE: this rebinds `df`, which until now held the dict of Anjuke sheets.
df = pd.read_excel('city_income.xlsx')

In [82]:
df.head()

Unnamed: 0,地区,2019年,2018年,2017年,2016年,2015年,2014年,2013年,2012年,2011年,2010年
0,北京,173205,149843,134994,122749,113073,103400,93997,85306,75835,65683.0
1,天津,111602,103931,96965,87806,81486,73839,68864,65398,55636,52963.0
2,石家庄,79581,75114,67880,61189,54441,48272,43712,38426,35132,31459.0
3,太原,82860,80825,72114,64820,60516,57771,51161,48905,44868,38839.0
4,呼和浩特,84105,71387,63084,56213,53698,50469,48797,65637,42797,37694.0


In [83]:
# Reshape the wide income table (one column per year) into long form:
# one row per (city, year label) pair.
# set_index is used without inplace=True so `df` itself is left untouched and
# this cell can be re-run; with inplace the second run fails because '地区'
# is no longer a column.
df_income = df.set_index('地区').stack().to_frame().reset_index()
df_income.columns = ['city','year','income']

In [91]:
# Year labels look like "2019年": keep the leading four characters as an int.
# Casting to str first keeps the cell re-runnable once the column already holds ints.
df_income['year'] = df_income['year'].astype(str).str[:4].astype(int)

In [92]:
df_income

Unnamed: 0,city,year,income
0,北京,2019,173205.0
1,北京,2018,149843.0
2,北京,2017,134994.0
3,北京,2016,122749.0
4,北京,2015,113073.0
...,...,...,...
354,乌鲁木齐,2014,61617.0
355,乌鲁木齐,2013,56680.0
356,乌鲁木齐,2012,51135.0
357,乌鲁木齐,2011,47178.0


In [90]:
df_price

Unnamed: 0,info,year,city,price
0,2012年北京房价30158元/㎡2.15%↑,2012,北京,30158
1,2012年上海房价23428元/㎡1.4%↑,2012,上海,23428
2,2012年深圳房价19298元/㎡0.8%↑,2012,深圳,19298
3,2012年杭州房价18713元/㎡1.77%↑,2012,杭州,18713
4,2012年三亚房价18187元/㎡6.48%↑,2012,三亚,18187
...,...,...,...,...
45,2021年宁德房价12628元/㎡0.34%↑,2021,宁德,12628
46,2021年泉州房价12527元/㎡0.11%↑,2021,泉州,12527
47,2021年莆田房价12513元/㎡0.03%↑,2021,莆田,12513
48,2021年拉萨房价12492元/㎡0.01%↑,2021,拉萨,12492


In [97]:
# Attach income to every price row by (year, city). A left join keeps all
# price rows; rows with no matching income entry get NaN in `income`
# (e.g. 三亚, and every 2021 row, in the output below).
df_merge = pd.merge(df_price,df_income,on=['year','city'],how='left')

In [98]:
df_merge

Unnamed: 0,info,year,city,price,income
0,2012年北京房价30158元/㎡2.15%↑,2012,北京,30158,85306.0
1,2012年上海房价23428元/㎡1.4%↑,2012,上海,23428,80191.0
2,2012年深圳房价19298元/㎡0.8%↑,2012,深圳,19298,59010.0
3,2012年杭州房价18713元/㎡1.77%↑,2012,杭州,18713,56418.0
4,2012年三亚房价18187元/㎡6.48%↑,2012,三亚,18187,
...,...,...,...,...,...
495,2021年宁德房价12628元/㎡0.34%↑,2021,宁德,12628,
496,2021年泉州房价12527元/㎡0.11%↑,2021,泉州,12527,
497,2021年莆田房价12513元/㎡0.03%↑,2021,莆田,12513,
498,2021年拉萨房价12492元/㎡0.01%↑,2021,拉萨,12492,


In [99]:
# Persist the merged price + income table; index=False leaves the row index out of the CSV.
df_merge.to_csv('clean_house_price_income.csv',index=False)