In [1]:
import pandas as pd
import numpy as np
import re
import os
import datetime
from dateutil.parser import parse

In [2]:
import plotly.plotly as py
import plotly.graph_objs as go

In [3]:
file_ls = [i for i in os.listdir('.') if i.endswith('xlsx')]

In [4]:
df_ls = [pd.read_excel(i) for i in file_ls]

In [5]:
data = pd.concat(df_ls)

In [6]:
data.drop_duplicates(subset='title',inplace=True)

In [7]:
data.shape

(8805, 5)

In [8]:
data.head(2)

Unnamed: 0,author,href,last_time,response,title
0,果样1990,https://www.douban.com/group/topic/117028713/,05-16 00:42,18.0,长宁区昭化路粉色三室一厅招室友（限女生）近2/3/4/11号线 5.17可住
1,纸短情长,https://www.douban.com/group/topic/116792744/,05-16 00:42,1715.0,大华颐和华城精装两室一厅一卫带阳台即急租


### 按发帖人发帖量排序

In [9]:
group_author = data.groupby('author').size().sort_values(ascending=False)

In [10]:
author_bar = [go.Bar(x=group_author.index,y=group_author.values)]
py.iplot(author_bar,filename='author-post')

### 抽取出疑似中介，并排除掉

In [11]:
middle = group_author[group_author>5].index
data = data[~data['author'].isin(middle)]

data.shape

(6275, 5)

### 取最新回复日期为5月份的

In [12]:
data['time'] = data['last_time'].apply(lambda x:parse(x).date())
data = data[data['time']>=datetime.date(2018,5,1)]

In [13]:
data.shape

(4454, 6)

### 每天都有多少帖子

In [64]:
group = data.groupby('time').size()

In [65]:
day_bar = [go.Bar(x=group.index,y=group.values)]
py.iplot(day_bar,filename='post-byday')

### 排除掉已租的、次卧、主卧

In [14]:
exp = '已租|已出租|次卧|主卧|单间|找室友|合租'
data['exp'] = data['title'].apply(lambda x: 1 if re.search(exp,x) else 0)

data = data[data['exp']==0]

### 匹配两室一厅且在漕河泾附近的房子

In [31]:
total_mask = '整租|(两|二|2).{0,1}（室|房）'
region_mask = '漕河泾|上海南站|桂林路|宜山路|漕宝路'
region2 = '陕西南路|徐家汇|上海体育馆|衡山路|常熟路|黄陂南路|东安路|虹桥路|漕溪路|嘉善路'

In [32]:
data['mask1'] = data['title'].apply(lambda x: 1 if re.search(total_mask,x) else 0)
data['mask2'] = data['title'].apply(lambda x: 1 if re.search(region_mask,x) else 0)
data['mask3'] = data['title'].apply(lambda x: 1 if re.search(region2,x) else 0)


In [33]:
data[data['mask3']==1]

Unnamed: 0,author,href,last_time,response,title,time,exp,mask1,mask2,mask3
34,大槐树,https://www.douban.com/group/topic/115947283/,05-16 00:39,1354.0,❤❤这间房竟敢建到地铁里！！❤❤近⑨号线直达徐家汇漕河泾❤❤非中介个人房源！！！,2018-05-16,0,0,1,1
64,何处惹尘埃,https://www.douban.com/group/topic/116851881/,05-16 00:32,51.0,徐家汇，天钥桥路，电梯大2房，90平带储物间，朝南急租，8000,2018-05-16,0,0,0,1
93,J.W.,https://www.douban.com/group/topic/117163327/,05-16 00:21,,求租，虹桥路10号线沿线,2018-05-16,0,0,0,1
112,Hu,https://www.douban.com/group/topic/116807302/,05-16 00:13,33.0,九亭9号线（至地铁站2分钟，徐家汇30分钟，限男生）1980,2018-05-16,0,0,0,1
221,CC,https://www.douban.com/group/topic/116637855/,05-15 23:32,3.0,1.9.11.4号线徐家汇地铁站三分钟超大精装房间出租,2018-05-15,0,0,0,1
238,空,https://www.douban.com/group/topic/117068865/,05-15 23:23,5.0,徐汇交大妹子诚觅一室友租徐家汇两室户,2018-05-15,0,0,0,1
280,美美,https://www.douban.com/group/topic/117038454/,05-15 23:13,10.0,1/9/11/4/7号线徐家汇零陵小区南卧找舍友，限女生～,2018-05-15,0,0,0,1
329,长空,https://www.douban.com/group/topic/117159836/,05-15 22:55,1.0,徐家汇、衡山路1室户（独立卫浴、厨房）4390元每月，5月底起租,2018-05-15,0,1,0,1
353,落枫,https://www.douban.com/group/topic/114983387/,05-15 22:49,148.0,【3-4-10虹桥路】两室一厅 2100,2018-05-15,0,0,0,1
364,米花,https://www.douban.com/group/topic/117125505/,05-15 22:46,6.0,10号线宋园路3/4号线虹桥路 朝南一室户,2018-05-15,0,0,0,1


In [34]:
data[(data['mask3']==1)&(data['mask1']==1)]

Unnamed: 0,author,href,last_time,response,title,time,exp,mask1,mask2,mask3
329,长空,https://www.douban.com/group/topic/117159836/,05-15 22:55,1.0,徐家汇、衡山路1室户（独立卫浴、厨房）4390元每月，5月底起租,2018-05-15,0,1,0,1
413,秋风悲画扇,https://www.douban.com/group/topic/116114444/,05-15 22:28,1742.0,【品牌公寓 首月免租】女神社区，直达徐家汇，整租一房带独立厨卫，价格1800,2018-05-15,0,1,0,1
1548,柯南特穆尔,https://www.douban.com/group/topic/117048189/,05-15 11:06,3.0,【个人转租，和房东签合同】徐家汇中心地段，两室一厅一厨一卫整租,2018-05-15,0,1,0,1
1613,发如雪,https://www.douban.com/group/topic/116933203/,05-15 10:22,263.0,8/11号线地铁直达！整租一室户，低至1500，15分钟直达徐家汇，人民广场！！！,2018-05-15,0,1,0,1
1638,那一抹亮光,https://www.douban.com/group/topic/113750911/,05-15 10:04,724.0,7号线/11号线整租一室户1700起 25分钟到长寿路 静安寺 曹杨路 徐家汇,2018-05-15,0,1,0,1
1989,小星星,https://www.douban.com/group/topic/116949924/,05-14 21:17,4.0,【整租】枫林新村，近大木桥路和东安路地铁站，精装朝南，家具齐全！4000/月！,2018-05-14,0,1,0,1
16,芳芳,https://www.douban.com/group/topic/116237587/,05-16 00:42,98.0,徐家汇漕河泾九号线佘山近地铁口整租两房出租，房东直租！超低价2900,2018-05-16,0,1,1,1
399,发如雪,https://www.douban.com/group/topic/116932827/,05-15 23:19,131.0,8/11号线地铁直达！整租一室户！低至1500，15分钟到达徐家汇，人民广场！！,2018-05-15,0,1,0,1
