## 数据透视:`pivot()`

测试`pivot_table()`：

In [35]:
import numpy as np
import pandas as pd

date = pd.date_range('20220101', periods=8)

types = ['A', 'B', 'C', 'D']
df_pivot_table = pd.DataFrame({
    'date' : date,
    'class1' : [types[x] for x in np.random.randint(0, 4, size=8)],
    'class2' : [types[x] for x in np.random.randint(0, 4, size=8)],
    'score' : np.random.random(8)
})

df_pivot_table


Unnamed: 0,date,class1,class2,score
0,2022-01-01,A,B,0.235793
1,2022-01-02,C,D,0.697392
2,2022-01-03,C,C,0.878962
3,2022-01-04,C,D,0.947072
4,2022-01-05,A,D,0.924215
5,2022-01-06,A,D,0.092037
6,2022-01-07,B,B,0.3768
7,2022-01-08,C,B,0.953593


In [37]:
# 使用`pivot_table`的时候其实需要指定对应的聚合函数，默认为`mean`
df_pivot_table.pivot_table(index='class1', columns='class2', values='score')

class2,B,C,D
class1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.235793,,0.508126
B,0.3768,,
C,0.953593,0.878962,0.822232


测试`pivot()`：

In [8]:
import numpy as np
import pandas as pd

date = pd.date_range('20220101', periods=6)

types = ['A', 'B', 'C', 'D']
df = pd.DataFrame({
    'date' : date,
    'variables' : [types[x] for x in np.random.randint(0, 4, size=6)],
    'values' : np.random.random(6)
})

df

Unnamed: 0,date,variables,values
0,2022-01-01,A,0.94876
1,2022-01-02,B,0.105496
2,2022-01-03,A,0.634017
3,2022-01-04,A,0.523807
4,2022-01-05,B,0.855407
5,2022-01-06,B,0.596774


In [11]:
df.pivot(index='date', columns='variables', values='values')

variables,A,B
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-01,0.94876,
2022-01-02,,0.105496
2022-01-03,0.634017,
2022-01-04,0.523807,
2022-01-05,,0.855407
2022-01-06,,0.596774


## melt()

In [22]:
import numpy as np
import pandas as pd

date = pd.date_range('20220101', periods=6)

types = ['A', 'B', 'C', 'D']
df = pd.DataFrame({
    'date' : date,
    'class' : [types[x] for x in np.random.randint(0, 4, size=6)],
    'score' : np.random.random(6)
})

df

Unnamed: 0,date,class,score
0,2022-01-01,B,0.434302
1,2022-01-02,D,0.259947
2,2022-01-03,C,0.632063
3,2022-01-04,D,0.215588
4,2022-01-05,B,0.408237
5,2022-01-06,B,0.568689


In [24]:
df_melt = df.melt(id_vars=['date'], value_vars=['class', 'score'])
df_melt

Unnamed: 0,date,variable,value
0,2022-01-01,class,B
1,2022-01-02,class,D
2,2022-01-03,class,C
3,2022-01-04,class,D
4,2022-01-05,class,B
5,2022-01-06,class,B
6,2022-01-01,score,0.434302
7,2022-01-02,score,0.259947
8,2022-01-03,score,0.632063
9,2022-01-04,score,0.215588


In [25]:
df_melt.pivot(index='date', columns='variable', values='value')

variable,class,score
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-01,B,0.434302
2022-01-02,D,0.259947
2022-01-03,C,0.632063
2022-01-04,D,0.215588
2022-01-05,B,0.408237
2022-01-06,B,0.568689


## 堆叠：stack()/unstack()

In [12]:
import numpy as np
import pandas as pd

date = pd.date_range('20220101', periods=6)

types = ['A', 'B', 'C', 'D']
df = pd.DataFrame({
    'date' : date,
    'variables' : [types[x] for x in np.random.randint(0, 4, size=6)],
    'values' : np.random.random(6)
})

df

Unnamed: 0,date,variables,values
0,2022-01-01,B,0.784695
1,2022-01-02,A,0.348717
2,2022-01-03,B,0.53727
3,2022-01-04,D,0.384949
4,2022-01-05,A,0.286118
5,2022-01-06,A,0.335596


In [13]:
df_stack = df.stack()
df_stack

0  date         2022-01-01 00:00:00
   variables                      B
   values                  0.784695
1  date         2022-01-02 00:00:00
   variables                      A
   values                  0.348717
2  date         2022-01-03 00:00:00
   variables                      B
   values                   0.53727
3  date         2022-01-04 00:00:00
   variables                      D
   values                  0.384949
4  date         2022-01-05 00:00:00
   variables                      A
   values                  0.286118
5  date         2022-01-06 00:00:00
   variables                      A
   values                  0.335596
dtype: object

In [17]:
# df_stack 为单索引，压缩为`Series`类型
df_stack.info()

<class 'pandas.core.series.Series'>
MultiIndex: 18 entries, (0, 'date') to (5, 'values')
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
18 non-null     object
dtypes: object(1)
memory usage: 476.0+ bytes


In [14]:
df_stack.unstack()

Unnamed: 0,date,variables,values
0,2022-01-01 00:00:00,B,0.784695
1,2022-01-02 00:00:00,A,0.348717
2,2022-01-03 00:00:00,B,0.53727
3,2022-01-04 00:00:00,D,0.384949
4,2022-01-05 00:00:00,A,0.286118
5,2022-01-06 00:00:00,A,0.335596
