# pandas高级应用

In [1]:
import numpy as np
import pandas as pd

-----分类数据-----

In [45]:
# Seed NumPy's global RNG so the 1000-point normal sample is reproducible.
np.random.seed(12345)
draws = np.random.randn(1000)
draws[:6]  # bare mid-cell expression: evaluated, but only the cell's last expression is shown

# Split the sample into four quantile bins; pd.qcut on an array returns a
# pandas.Categorical, whose integer codes index into the given labels.
bins = pd.qcut(draws, q=4, labels='Q1 Q2 Q3 Q4'.split())
bins.codes[:10]

array([1, 2, 1, 1, 3, 3, 2, 2, 3, 3], dtype=int8)

In [84]:
# Wrap the bin assignments in a named Series so the group key becomes a
# 'quartile' column after reset_index().
bins = pd.Series(bins, name='quartile')

# Per-bin count / max / min of the draws, with the group key moved back
# into an ordinary column.
results = (
    pd.Series(draws)
    .groupby(bins)
    .aggregate(['count', 'max', 'min'])
    .reset_index()
)
results

Unnamed: 0,quartile,count,max,min
0,Q1,250,-0.685484,-2.949343
1,Q2,250,-0.010115,-0.683066
2,Q3,250,0.628894,-0.010032
3,Q4,250,3.927528,0.634238


In [93]:
# 如果你是在一个特定数据集上做大量分析，将其转换为分类可以极大地提高效率
N = 10000000
draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N //4))
# 将标签转换为分类：
%time categories=labels.astype('category')
categories.memory_usage()


Wall time: 340 ms


In [96]:
# Eight values cycling through 'a'..'d', stored with categorical dtype.
cat_s = pd.Series(list('abcd') * 2, dtype='category')
cat_s  # bare mid-cell expression: evaluated, but only the cell's last expression is shown

# Expand the categorical into one indicator column per category.
pd.get_dummies(cat_s)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


-----GroupBy高级应用-----

In [98]:
# Fifteen one-minute timestamps, each paired with a running integer value.
N = 15
times = pd.date_range(start='2017-05-20 00:00', periods=N, freq='1min')
df = pd.DataFrame(dict(time=times, value=np.arange(N)))

# Index by time and count the rows falling into each 5-minute window.
(
    df.set_index('time')
    .resample('5min')
    .count()
)

Unnamed: 0_level_0,value
time,Unnamed: 1_level_1
2017-05-20 00:00:00,5
2017-05-20 00:05:00,5
2017-05-20 00:10:00,5


In [123]:
# To apply the same 5-minute resampling separately for each 'key' value,
# group by the key together with a time-based grouper.
# pd.Grouper(freq=...) replaces the deprecated pd.TimeGrouper used before,
# which is no longer available in current pandas releases.
df2 = pd.DataFrame({
    'time': times.repeat(3),            # each timestamp appears once per key
    'key': np.tile(['a', 'b', 'c'], N),  # keys cycle a, b, c for every timestamp
    'value': np.arange(N * 3.),          # float range, so the sums below are floats
})
time_key = pd.Grouper(freq='5min')
# The Grouper bins on the index, so 'time' must be set as the index first.
df2.set_index('time').groupby(['key', time_key]).sum().reset_index()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,key,time,value
0,a,2017-05-20 00:00:00,30.0
1,a,2017-05-20 00:05:00,105.0
2,a,2017-05-20 00:10:00,180.0
3,b,2017-05-20 00:00:00,35.0
4,b,2017-05-20 00:05:00,110.0
5,b,2017-05-20 00:10:00,185.0
6,c,2017-05-20 00:00:00,40.0
7,c,2017-05-20 00:05:00,115.0
8,c,2017-05-20 00:10:00,190.0
