In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 分析空置情况

## 数据加载

In [158]:
vacancy = pd.read_excel("./data/vacancy.xlsx", parse_dates=["起始日期"])
vacancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719 entries, 0 to 718
Data columns (total 22 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   序号      719 non-null    int64         
 1   管理主体    719 non-null    object        
 2   项目名称    719 non-null    object        
 3   大厦名称    719 non-null    object        
 4   房号      719 non-null    object        
 5   用途      719 non-null    object        
 6   状态      708 non-null    object        
 7   面积      719 non-null    float64       
 8   使用率     708 non-null    float64       
 9   预计单价    719 non-null    float64       
 10  预计月租金   719 non-null    float64       
 11  起始日期    719 non-null    datetime64[ns]
 12  空置天数    719 non-null    int64         
 13  空置月数    718 non-null    float64       
 14  空置时长    719 non-null    object        
 15  责任人     526 non-null    object        
 16  空置原因    150 non-null    object        
 17  解决措施    94 non-null     object        
 18  计划解决时间  27

## 数据清洗

In [204]:
vacancy_simp = vacancy.iloc[:, [2, 3, 4, 7, 8, 9, 11, 15]].copy()
vacancy_simp

Unnamed: 0,项目名称,大厦名称,房号,面积,使用率,预计单价,起始日期,责任人
0,蛇口网谷,万融大厦,A座303/305,866.39,0.700,120.0,2019-04-01,曹泽宇
1,蛇口网谷,万融大厦,A401-402,1893.60,0.700,135.0,2021-04-01,曹泽宇
2,蛇口网谷,万融大厦,BG01/G02/G06/G07/G08/G09/G10/G11/G12,986.06,0.750,240.0,2020-09-01,商业小组
3,蛇口网谷,万融大厦,A401-402,1893.60,0.700,135.0,2021-04-01,张可言
4,蛇口网谷,万融大厦,B座106A,211.90,0.750,135.0,2021-06-01,曹泽宇
...,...,...,...,...,...,...,...,...
714,招商局芯云谷,6#孵化器,504,187.01,0.705,41.0,2019-11-26,
715,招商局芯云谷,6#孵化器,505,176.08,0.705,41.0,2019-11-26,
716,招商局芯云谷,6#孵化器,506,260.13,0.705,41.0,2019-11-26,
717,招商局芯云谷,6#孵化器,507,180.38,0.705,41.0,2019-11-26,


计算空置天数和空置月数. Pandas 专门提供了 Time deltas  来处理日期时间差. 两个日期时间相减会得到一个 dtype 是 `timedelta64[ns]`  对象来处理日期时间差的问题.

In [213]:
# 截止 2021 年 7 月 31 日 与空置起始日期间的差
vacant_timedelta = pd.to_datetime("2021-08-01") - vacancy_simp["起始日期"]
# 空置天数
vacant_days = (vacant_timedelta / np.timedelta64(1, "D")).astype("int")
# 空置月数
vacant_month = round(vacant_timedelta / np.timedelta64(1, "M"), 2)
vacancy_simp["vacant_days"] = vacant_days
vacancy_simp["vacant_month"] = vacant_month
vacancy_simp.head()

Unnamed: 0,项目名称,大厦名称,房号,面积,使用率,预计单价,起始日期,责任人,vacant_days,vacant_month
0,蛇口网谷,万融大厦,A座303/305,866.39,0.7,120.0,2019-04-01,曹泽宇,853,28.03
1,蛇口网谷,万融大厦,A401-402,1893.6,0.7,135.0,2021-04-01,曹泽宇,122,4.01
2,蛇口网谷,万融大厦,BG01/G02/G06/G07/G08/G09/G10/G11/G12,986.06,0.75,240.0,2020-09-01,商业小组,334,10.97
3,蛇口网谷,万融大厦,A401-402,1893.6,0.7,135.0,2021-04-01,张可言,122,4.01
4,蛇口网谷,万融大厦,B座106A,211.9,0.75,135.0,2021-06-01,曹泽宇,61,2.0


In [214]:
def rolling_window(window=2, data=[]):
    if len(data) <= window:
        return [data]
    else:
        return [data[:window]] + rolling_window(window, data[1:])
    
def bins_label(head='', tail='', template='{0} - {1}', bins=[]):
    return [head] + [template.format(i[0], i[1]) for i in rolling_window(data=bins)[1:-1]] + [tail]

将"vacant_month"空置月份进行分箱操作, 创建"light_bins"列来表示红黄灯, 以及创建"vacancy_bins"列来表示空置时长

In [215]:
vacancy_month_bins = [0, 3, 6, 12, np.iinfo(np.int8).max]
ligth_labels = ['绿灯', '黄灯', '红灯', '红灯']
month_labels = bins_label(head='3个月内', tail='12个月以上', template='{0}到{1}个月', bins=vacancy_month_bins)

vacancy_simp['light_bins'] = pd.cut(vacancy_simp["vacant_month"], vacancy_month_bins, labels=ligth_labels, ordered=False)
vacancy_simp['vacancy_bins'] = pd.cut(vacancy_simp["vacant_month"], vacancy_month_bins, labels=month_labels)

vacancy_simp.head()

Unnamed: 0,项目名称,大厦名称,房号,面积,使用率,预计单价,起始日期,责任人,vacant_days,vacant_month,light_bins,vacancy_bins
0,蛇口网谷,万融大厦,A座303/305,866.39,0.7,120.0,2019-04-01,曹泽宇,853,28.03,红灯,12个月以上
1,蛇口网谷,万融大厦,A401-402,1893.6,0.7,135.0,2021-04-01,曹泽宇,122,4.01,黄灯,3到6个月
2,蛇口网谷,万融大厦,BG01/G02/G06/G07/G08/G09/G10/G11/G12,986.06,0.75,240.0,2020-09-01,商业小组,334,10.97,红灯,6到12个月
3,蛇口网谷,万融大厦,A401-402,1893.6,0.7,135.0,2021-04-01,张可言,122,4.01,黄灯,3到6个月
4,蛇口网谷,万融大厦,B座106A,211.9,0.75,135.0,2021-06-01,曹泽宇,61,2.0,绿灯,3个月内


将面积进行分箱操作, 创建"area_bins"列

In [216]:
area_bins = [0, 100, 200, 300, 400, 500, 1000, 1500, 3000, np.iinfo(np.int16).max]
area_bins_labels = bins_label(head='100平以内', tail='3000平以上', template='{0}至{1}平', bins=area_bins)

vacancy_simp['area_bins'] = pd.cut(vacancy_simp['面积'], area_bins, labels=area_bins_labels)

vacancy_simp.head()

Unnamed: 0,项目名称,大厦名称,房号,面积,使用率,预计单价,起始日期,责任人,vacant_days,vacant_month,light_bins,vacancy_bins,area_bins
0,蛇口网谷,万融大厦,A座303/305,866.39,0.7,120.0,2019-04-01,曹泽宇,853,28.03,红灯,12个月以上,500至1000平
1,蛇口网谷,万融大厦,A401-402,1893.6,0.7,135.0,2021-04-01,曹泽宇,122,4.01,黄灯,3到6个月,1500至3000平
2,蛇口网谷,万融大厦,BG01/G02/G06/G07/G08/G09/G10/G11/G12,986.06,0.75,240.0,2020-09-01,商业小组,334,10.97,红灯,6到12个月,500至1000平
3,蛇口网谷,万融大厦,A401-402,1893.6,0.7,135.0,2021-04-01,张可言,122,4.01,黄灯,3到6个月,1500至3000平
4,蛇口网谷,万融大厦,B座106A,211.9,0.75,135.0,2021-06-01,曹泽宇,61,2.0,绿灯,3个月内,200至300平


## 数据分析

### 从面积的维度分析

分析面积与空置情况的关系, 首先分析红灯情况下, 哪些面积分段红灯比较多

In [291]:
# 按 面积分箱+灯分箱 分组
grouped_area_ligth = vacancy_simp.groupby(['area_bins', 'light_bins'])
# 分组统计记录数. 使用 agg 聚合函数后会返回一个 df
df_grouped_area_ligth_by_size = grouped_area_ligth.agg({'vacant_month': 'size'})
# 计算每组中各种灯的占比
# 难点: 对于多级分组如何求出下一级的分组在上一级分组中的占比
# 技巧: 使用 groupby() 函数的 level 参数, 再结合 apply() 函数来实现
df_grouped_area_ligth_by_size_percent = df_grouped_area_ligth_by_size.groupby(level=0).apply(lambda x: x / x.sum())
# rename cols
df_grouped_area_ligth_by_size_percent = df_grouped_area_ligth_by_size_percent.rename(columns={
    'vacant_month': 'percent'
})

df_grouped_area_ligth_by_size_percent

Unnamed: 0_level_0,Unnamed: 1_level_0,percent
area_bins,light_bins,Unnamed: 2_level_1
100平以内,红灯,0.743842
100平以内,绿灯,0.08867
100平以内,黄灯,0.167488
100至200平,红灯,0.343972
100至200平,绿灯,0.634752
100至200平,黄灯,0.021277
200至300平,红灯,0.701493
200至300平,绿灯,0.164179
200至300平,黄灯,0.134328
300至400平,红灯,0.678571


In [292]:
df_grouped_percent_red_light = df_grouped_area_ligth_by_size_percent.query('light_bins == "红灯"')
df_grouped_percent_red_light.sort_values(by='percent', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,percent
area_bins,light_bins,Unnamed: 2_level_1
500至1000平,红灯,0.888889
400至500平,红灯,0.870968
1000至1500平,红灯,0.769231
3000平以上,红灯,0.75
100平以内,红灯,0.743842
200至300平,红灯,0.701493
1500至3000平,红灯,0.684211
300至400平,红灯,0.678571
100至200平,红灯,0.343972


In [197]:
grouped_area_bins_light.head()

Unnamed: 0,项目名称,大厦名称,房号,面积,使用率,预计单价,起始日期,空置天数,空置月数,责任人,vacant_days,vacant_month,light,vacancy,area_bins
0,蛇口网谷,万融大厦,A座303/305,866.39,0.700,120.0,2019-04-01,876,28.0,曹泽宇,887,29.15,红灯,12个月以上,500至1000平
1,蛇口网谷,万融大厦,A401-402,1893.60,0.700,135.0,2021-04-01,145,4.0,曹泽宇,156,5.13,黄灯,3到6个月,1500至3000平
2,蛇口网谷,万融大厦,BG01/G02/G06/G07/G08/G09/G10/G11/G12,986.06,0.750,240.0,2020-09-01,357,11.0,商业小组,368,12.10,红灯,12个月以上,500至1000平
3,蛇口网谷,万融大厦,A401-402,1893.60,0.700,135.0,2021-04-01,124,4.0,张可言,156,5.13,黄灯,3到6个月,1500至3000平
4,蛇口网谷,万融大厦,B座106A,211.90,0.750,135.0,2021-06-01,84,2.0,曹泽宇,95,3.13,黄灯,3到6个月,200至300平
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706,招商局芯云谷,6#孵化器,403,85.03,0.705,40.0,2019-11-26,637,20.0,,648,21.29,红灯,12个月以上,100平以内
709,招商局芯云谷,6#孵化器,406,260.13,0.705,40.0,2019-11-26,637,20.0,,648,21.29,红灯,12个月以上,200至300平
712,招商局芯云谷,6#孵化器,502,224.26,0.705,41.0,2019-11-26,637,20.0,,648,21.29,红灯,12个月以上,200至300平
713,招商局芯云谷,6#孵化器,503,83.37,0.705,41.0,2019-11-26,637,20.0,,648,21.29,红灯,12个月以上,100平以内


In [271]:
vacancy_simp.groupby(['项目名称','大厦名称'])['vacant_days'].nlargest()

项目名称  大厦名称       
W6仓   前海易港中心  87     197
              88     197
              89     197
              90     197
              91     197
                    ... 
高铁网谷  高铁大厦    226    639
              227    639
              228    639
              229    639
              261    273
Name: vacant_days, Length: 222, dtype: int64

In [279]:
vacancy_simp.sort_values(by='vacant_days', ascending=False).groupby(['项目名称'], as_index=False).first()

Unnamed: 0,项目名称,大厦名称,房号,面积,使用率,预计单价,起始日期,责任人,vacant_days,vacant_month,light_bins,vacancy_bins,area_bins
0,W6仓,前海易港中心,302库,2139.21,1.0,38.0,2021-01-16,张志宇,197,6.47,红灯,6到12个月,1500至3000平
1,W6辅助楼,前海易港中心,2、3、4、8层,4946.82,0.7,48.0,2018-07-01,张志宇,1127,37.03,红灯,12个月以上,3000平以上
2,东湖网谷,7号楼,206,158.26,0.7114,58.0,2020-11-01,王琼,273,8.97,红灯,6到12个月,100至200平
3,仙东网谷,仙东网谷,3-8楼部分房源,2141.66,0.63,36.0,2020-01-01,方亮,578,18.99,红灯,12个月以上,1500至3000平
4,光明科技园,A3,04C-5,143.97,0.75,60.0,2020-01-01,黄丁可,578,18.99,红灯,12个月以上,100至200平
5,南昌东湖意库,整体项目,整体项目,12626.08,,50.0,2021-05-31,,62,2.04,绿灯,3个月内,3000平以上
6,南海意库,1栋,103B,244.09,0.76,240.0,2020-12-24,商业小组,220,7.23,红灯,6到12个月,200至300平
7,庐州意库,庐州意库,6#401-1,358.58,0.952117,70.0,2021-07-01,魏从林,31,1.02,绿灯,3个月内,300至400平
8,招商局芯云谷,3#人才公寓,613,80.56,0.7502,24.826216,2019-11-26,全员,614,20.17,红灯,12个月以上,100平以内
9,文化艺术中心,/,212,399.48,0.586,124.0,2019-06-01,林巧,792,26.02,红灯,12个月以上,300至400平


In [299]:
import pandas._testing as tm

frame = tm.makeTimeDataFrame(3)
frame

Unnamed: 0,A,B,C,D
2000-01-03,0.397228,0.021911,0.481974,-0.218343
2000-01-04,-0.122126,-0.051248,-1.237669,-3.057918
2000-01-05,0.573423,1.296903,-0.126138,-0.409968


In [300]:
frame.shape

(3, 4)

In [303]:
frame.to_numpy().ravel('F')

array([ 0.3972278 , -0.12212637,  0.57342319,  0.02191096, -0.05124767,
        1.29690319,  0.48197359, -1.23766935, -0.12613828, -0.2183427 ,
       -3.05791823, -0.40996776])

In [305]:
np.asarray(frame.columns).repeat(3)

array(['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'],
      dtype=object)

In [306]:
np.tile(np.asarray(frame.index), 4)

array(['2000-01-03T00:00:00.000000000', '2000-01-04T00:00:00.000000000',
       '2000-01-05T00:00:00.000000000', '2000-01-03T00:00:00.000000000',
       '2000-01-04T00:00:00.000000000', '2000-01-05T00:00:00.000000000',
       '2000-01-03T00:00:00.000000000', '2000-01-04T00:00:00.000000000',
       '2000-01-05T00:00:00.000000000', '2000-01-03T00:00:00.000000000',
       '2000-01-04T00:00:00.000000000', '2000-01-05T00:00:00.000000000'],
      dtype='datetime64[ns]')