### 现在我们有2015到2017年25万条911的紧急电话的数据，请统计出出这些数据中不同类型的紧急情况的次数，如果我们还想统计出不同月份不同类型紧急电话的次数的变化情况，应该怎么做呢？


In [1]:
from matplotlib import pyplot as plt
from matplotlib import rc
import pandas as pd
import numpy as np

In [4]:
file_path = './911.csv'

In [5]:
df = pd.read_csv(file_path)
print(df.head(1))

         lat        lng                                               desc  \
0  40.297876 -75.581294  REINDEER CT & DEAD END;  NEW HANOVER; Station ...   

       zip                   title            timeStamp          twp  \
0  19525.0  EMS: BACK PAINS/INJURY  2015-12-10 17:10:52  NEW HANOVER   

                     addr  e  
0  REINDEER CT & DEAD END  1  


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249737 entries, 0 to 249736
Data columns (total 9 columns):
lat          249737 non-null float64
lng          249737 non-null float64
desc         249737 non-null object
zip          219391 non-null float64
title        249737 non-null object
timeStamp    249737 non-null object
twp          249644 non-null object
addr         249737 non-null object
e            249737 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 17.1+ MB
None


In [28]:
#获取分类
title = df['title'].str.split(': ').tolist()
cate_list = list(set([i[0] for i in title]))
print(cate_list)

['Traffic', 'EMS', 'Fire']


In [31]:
#构造全为0的数组
zero_df = pd.DataFrame(np.zeros((df.shape[0],len(cate_list))),columns = cate_list)

#赋值   通过遍历title数据量大，速度慢，因此使用contains（）来获得包含当前分类的位置
for cate in cate_list:
    zero_df[cate][df['title'].str.contains(cate)] = 1
print(zero_df)

# #直接遍历效率慢
# for i in range(df.shape[0]):
#     zero_df.loc[i,title[i][0]] = 1
# print(zero_df)

        Traffic  EMS  Fire
0           0.0  1.0   0.0
1           0.0  1.0   0.0
2           0.0  0.0   1.0
3           0.0  1.0   0.0
4           0.0  1.0   0.0
...         ...  ...   ...
249732      0.0  1.0   0.0
249733      0.0  1.0   0.0
249734      0.0  1.0   0.0
249735      0.0  0.0   1.0
249736      1.0  0.0   0.0

[249737 rows x 3 columns]


In [33]:
sum_ret = zero_df.sum(axis=0)
print(sum_ret)

Traffic     87465.0
EMS        124844.0
Fire        37432.0
dtype: float64


### 如果我们还想统计出不同月份不同类型紧急电话的次数的变化情况

In [37]:
cate_list = [i[0] for i in title]
df['cate'] = pd.DataFrame(np.array(cate_list).reshape((df.shape[0],1)))
print(df.head(5))

         lat        lng                                               desc  \
0  40.297876 -75.581294  REINDEER CT & DEAD END;  NEW HANOVER; Station ...   
1  40.258061 -75.264680  BRIAR PATH & WHITEMARSH LN;  HATFIELD TOWNSHIP...   
2  40.121182 -75.351975  HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...   
3  40.116153 -75.343513  AIRY ST & SWEDE ST;  NORRISTOWN; Station 308A;...   
4  40.251492 -75.603350  CHERRYWOOD CT & DEAD END;  LOWER POTTSGROVE; S...   

       zip                    title            timeStamp                twp  \
0  19525.0   EMS: BACK PAINS/INJURY  2015-12-10 17:10:52        NEW HANOVER   
1  19446.0  EMS: DIABETIC EMERGENCY  2015-12-10 17:29:21  HATFIELD TOWNSHIP   
2  19401.0      Fire: GAS-ODOR/LEAK  2015-12-10 14:39:21         NORRISTOWN   
3  19401.0   EMS: CARDIAC EMERGENCY  2015-12-10 16:47:36         NORRISTOWN   
4      NaN           EMS: DIZZINESS  2015-12-10 16:56:52   LOWER POTTSGROVE   

                         addr  e  cate  
0      REINDEER

In [38]:
print(df.groupby(by='cate').count()['title'])

cate
EMS        124840
Fire        37432
Traffic     87465
Name: title, dtype: int64
