**reshaping=转换表或者向量(DataFrame/Series)的结构，让其适合进行数据分析**

+ pivot
+ pivot_table
+ stack
+ unstack

### pivot

pivot函数有三个参数：

+ index：用作结果行索引的列
+ columns：其取值会展开成结果列名的列
+ values：用来填充单元格的列（省略时对其余所有列进行pivot）



In [19]:
from collections import OrderedDict
from pandas import DataFrame
import pandas as pd
import numpy as np

In [20]:
# Demo data: one row per (Item, CType) pair, priced in two currencies (USD and EU).
table = OrderedDict([
    ('Item', ['Item0', 'Item0', 'Item1', 'Item1']),
    ('CType', ['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD', list(range(1, 5))),
    ('EU', list(range(1, 5))),
])
df = DataFrame(table)
df

Unnamed: 0,Item,CType,USD,EU
0,Item0,Gold,1,1
1,Item0,Bronze,2,2
2,Item1,Gold,3,3
3,Item1,Silver,4,4


In [21]:
# Pivot: one row per Item, one column per CType, cells filled from USD.
p=df.pivot(index='Item',columns='CType',values='USD')
p  # p carries no EU information; in that sense pivoting simplifies the original data

CType,Bronze,Gold,Silver
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Item0,2.0,1.0,
Item1,,3.0,4.0


In [19]:
# Original DataFrame: look up the USD value of the Gold row for Item0
print (df[(df.Item=='Item0') & (df.CType=='Gold')].USD.values)

[1]


In [20]:
# Pivoted DataFrame: the same lookup is a row filter followed by a column access
print (p[p.index=='Item0'].Gold.values)

[ 1.]


### 多列pivot

In [22]:
# No `values` argument: every remaining column (USD and EU here) is pivoted.
p = df.pivot(index='Item', columns='CType')
p # the columns are now a MultiIndex, i.e. hierarchical columns

Unnamed: 0_level_0,USD,USD,USD,EU,EU,EU
CType,Bronze,Gold,Silver,Bronze,Gold,Silver
Item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Item0,2.0,1.0,,2.0,1.0,
Item1,,3.0,4.0,,3.0,4.0


In [23]:
# Lookup on the original (un-pivoted) frame, for comparison with the pivoted lookup.
print(df[(df.Item=='Item0') & (df.CType=='Gold')].USD.values)

[1]


In [24]:
# With hierarchical columns, select the USD sub-table first, then the row and the column.
print(p.USD[p.USD.index=='Item0'].Gold.values)

[ 1.]


### pivot常见错误：同一组index/columns组合对应多行数据

In [40]:
# Rows 0 and 2 share the same (Item, CType) pair: (Item0, Gold).
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item0', 'Item1']),
    ('CType',['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD',  [1, 2, 3, 4]),
    ('EU',   [1, 2, 3, 4])
))
df = DataFrame(table)
print(df)
# Intentional failure: pivot cannot place two USD values into the single
# (Item0, Gold) cell, so this call raises ValueError.
p = df.pivot(index='Item', columns='CType', values='USD')

    Item   CType  USD  EU
0  Item0    Gold    1   1
1  Item0  Bronze    2   2
2  Item0    Gold    3   3
3  Item1  Silver    4   4


ValueError: Index contains duplicate entries, cannot reshape

### pivot table

In [41]:
# pivot_table aggregates duplicate (Item, CType) rows instead of raising;
# with no aggfunc given the cells hold the mean (Item0/Gold: mean of 1 and 3).
p=df.pivot_table(index='Item',columns='CType',values='USD')
p

CType,Bronze,Gold,Silver
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Item0,2.0,2.0,
Item1,,,4.0


In [42]:
# Display the source rows to compare them with the aggregated pivot table.
df

Unnamed: 0,Item,CType,USD,EU
0,Item0,Gold,1,1
1,Item0,Bronze,2,2
2,Item0,Gold,3,3
3,Item1,Silver,4,4


In [46]:
# Rows 0 and 2 share the (Item0, Gold) pair, so pivot_table has to aggregate them.
table = OrderedDict((
    ("Item", ['Item0', 'Item0', 'Item0', 'Item1']),
    ('CType', ['Gold', 'Bronze', 'Gold', 'Silver']),
    ('USD', [1, 2, 3, 4]),
    ('EU', [1.1, 2.2, 3, 4.4]),
))
df = DataFrame(table)
# aggfunc names the aggregation applied to each (Item, CType) group. The string
# "min" is used instead of np.min because recent pandas versions warn when a
# NumPy reducer is passed as a callable.
p = df.pivot_table(index='Item', columns='CType', values='USD', aggfunc='min')
p

CType,Bronze,Gold,Silver
Item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Item0,2.0,1.0,
Item1,,,4.0


### stack/unstack

pivot实际上是stack/unstack这类重塑操作的一种特例（相当于先set_index再unstack）。stack意味着把最内层的列索引变成最内层的行索引，而unstack则是把最内层的行索引变成最内层的列索引。

![](./img/stack_unstack.png)

In [47]:
# Row Multi-Index
row_idx_arr = list(zip(['r0', 'r0'], ['r00', 'r01']))
row_idx = pd.MultiIndex.from_tuples(row_idx_arr)
print(row_idx)
# Column Multi-Index
col_idx_arr = list(zip(['c0', 'c0', 'c1'], ['c00', 'c01', 'c10']))
col_idx = pd.MultiIndex.from_tuples(col_idx_arr)
print(col_idx)
# Create the DataFrame: every cell holds its own (row, column) position, so each
# cell stays identifiable after stack/unstack. The cells are built directly
# because DataFrame.applymap is deprecated in recent pandas releases.
df = DataFrame(
    [[(r, c) for c in range(3)] for r in range(2)],
    index=row_idx,
    columns=col_idx,
)
df

MultiIndex(levels=[['r0'], ['r00', 'r01']],
           labels=[[0, 0], [0, 1]])
MultiIndex(levels=[['c0', 'c1'], ['c00', 'c01', 'c10']],
           labels=[[0, 0, 1], [0, 1, 2]])


Unnamed: 0_level_0,Unnamed: 1_level_0,c0,c0,c1
Unnamed: 0_level_1,Unnamed: 1_level_1,c00,c01,c10
r0,r00,"(0, 0)","(0, 1)","(0, 2)"
r0,r01,"(1, 0)","(1, 1)","(1, 2)"


In [48]:
# stack: the innermost column level (c00/c01/c10) becomes the innermost row level.
p=df.stack()
p

Unnamed: 0,Unnamed: 1,Unnamed: 2,c0,c1
r0,r00,c00,"(0, 0)",
r0,r00,c01,"(0, 1)",
r0,r00,c10,,"(0, 2)"
r0,r01,c00,"(1, 0)",
r0,r01,c01,"(1, 1)",
r0,r01,c10,,"(1, 2)"


In [50]:
# unstack: the innermost row level moves back into the columns. The result holds
# every c0/c1 x c00/c01/c10 combination, with empty cells where the original
# frame had no such column.
p.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,c0,c0,c0,c1,c1,c1
Unnamed: 0_level_1,Unnamed: 1_level_1,c00,c01,c10,c00,c01,c10
r0,r00,"(0, 0)","(0, 1)",,,,"(0, 2)"
r0,r01,"(1, 0)","(1, 1)",,,,"(1, 2)"


### pivot案例

In [51]:
# Load the sales-funnel example workbook and preview its first rows.
df = pd.read_excel("../data/sales-funnel.xlsx")
df.head()

Unnamed: 0,Account,Name,Rep,Manager,Product,Quantity,Price,Status
0,714466,Trantow-Barrows,Craig Booker,Debra Henley,CPU,1,30000,presented
1,714466,Trantow-Barrows,Craig Booker,Debra Henley,Software,1,10000,presented
2,714466,Trantow-Barrows,Craig Booker,Debra Henley,Maintenance,2,5000,pending
3,737550,"Fritsch, Russel and Anderson",Craig Booker,Debra Henley,CPU,1,35000,declined
4,146832,Kiehn-Spinka,Daniel Hilton,Debra Henley,CPU,2,65000,won


In [52]:
# Inspect the column dtypes; Status is a plain object column at this point.
df.dtypes

Account      int64
Name        object
Rep         object
Manager     object
Product     object
Quantity     int64
Price        int64
Status      object
dtype: object

In [53]:
# Turn Status into a categorical whose category order follows the sales funnel,
# so grouped output lists won/pending/presented/declined rather than alphabetical order.
df["Status"] = df["Status"].astype("category")
# set_categories returns a new Series; the result is assigned back because the
# `inplace` argument is no longer available in current pandas.
df["Status"] = df["Status"].cat.set_categories(["won", "pending", "presented", "declined"])

In [54]:
# Check the dtypes again; Status should now be reported as category.
df.dtypes

Account        int64
Name          object
Rep           object
Manager       object
Product       object
Quantity       int64
Price          int64
Status      category
dtype: object

In [6]:
# With no aggfunc given, pivot_table averages Price for each (Manager, Rep) pair.
pd.pivot_table(df,index=["Manager","Rep"],values=["Price"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,20000
Debra Henley,Daniel Hilton,38333
Debra Henley,John Smith,20000
Fred Anderson,Cedric Moss,27500
Fred Anderson,Wendy Yule,44250


In [7]:
# Total Price per (Manager, Rep). The aggregation is given by name ("sum") because
# recent pandas versions warn when a NumPy reducer such as np.sum is passed as a callable.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price"], aggfunc="sum")

Unnamed: 0_level_0,Unnamed: 1_level_0,Price
Manager,Rep,Unnamed: 2_level_1
Debra Henley,Craig Booker,80000
Debra Henley,Daniel Hilton,115000
Debra Henley,John Smith,40000
Fred Anderson,Cedric Moss,110000
Fred Anderson,Wendy Yule,177000


In [8]:
# Several aggregations at once: each one becomes its own top-level column group
# (mean Price and row count). "mean" is given by name to avoid the deprecation
# warning for NumPy callables; the builtin len is unaffected.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price"], aggfunc=["mean", len])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,len
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price
Manager,Rep,Unnamed: 2_level_2,Unnamed: 3_level_2
Debra Henley,Craig Booker,20000,4
Debra Henley,Daniel Hilton,38333,3
Debra Henley,John Smith,20000,2
Fred Anderson,Cedric Moss,27500,4
Fred Anderson,Wendy Yule,44250,4


In [9]:
# columns=["Product"] spreads the Price totals over one column per Product;
# (Manager, Rep, Product) combinations without rows come out as NaN.
# "sum" is given by name to avoid the deprecation warning for NumPy callables.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price"],
               columns=["Product"], aggfunc=["sum"])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price,Price,Price
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Debra Henley,Craig Booker,65000.0,5000.0,,10000.0
Debra Henley,Daniel Hilton,105000.0,,,10000.0
Debra Henley,John Smith,35000.0,5000.0,,
Fred Anderson,Cedric Moss,95000.0,5000.0,,10000.0
Fred Anderson,Wendy Yule,165000.0,7000.0,5000.0,


In [10]:
# fill_value=0 replaces the missing (Manager, Rep, Product) combinations with 0.
# "sum" is given by name to avoid the deprecation warning for NumPy callables.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price"],
               columns=["Product"], aggfunc=["sum"], fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price,Price,Price
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3
Debra Henley,Craig Booker,65000,5000,0,10000
Debra Henley,Daniel Hilton,105000,0,0,10000
Debra Henley,John Smith,35000,5000,0,0
Fred Anderson,Cedric Moss,95000,5000,0,10000
Fred Anderson,Wendy Yule,165000,7000,5000,0


In [11]:
# Aggregate two value columns (Price and Quantity) in the same table.
# "sum" is given by name to avoid the deprecation warning for NumPy callables.
pd.pivot_table(df, index=["Manager", "Rep"], values=["Price", "Quantity"],
               columns=["Product"], aggfunc=["sum"], fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Price,Price,Price,Price,Quantity,Quantity,Quantity,Quantity
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Rep,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3
Debra Henley,Craig Booker,65000,5000,0,10000,2,2,0,1
Debra Henley,Daniel Hilton,105000,0,0,10000,4,0,0,1
Debra Henley,John Smith,35000,5000,0,0,1,2,0,0
Fred Anderson,Cedric Moss,95000,5000,0,10000,3,1,0,1
Fred Anderson,Wendy Yule,165000,7000,5000,0,7,3,2,0


In [12]:
# Putting Product in the index instead of the columns gives one row per
# (Manager, Rep, Product) combination.
# "sum" is given by name to avoid the deprecation warning for NumPy callables.
pd.pivot_table(df, index=["Manager", "Rep", "Product"],
               values=["Price", "Quantity"], aggfunc=["sum"], fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Price,Quantity
Manager,Rep,Product,Unnamed: 3_level_2,Unnamed: 4_level_2
Debra Henley,Craig Booker,CPU,65000,2
Debra Henley,Craig Booker,Maintenance,5000,2
Debra Henley,Craig Booker,Software,10000,1
Debra Henley,Daniel Hilton,CPU,105000,4
Debra Henley,Daniel Hilton,Software,10000,1
Debra Henley,John Smith,CPU,35000,1
Debra Henley,John Smith,Maintenance,5000,2
Fred Anderson,Cedric Moss,CPU,95000,3
Fred Anderson,Cedric Moss,Maintenance,5000,1
Fred Anderson,Cedric Moss,Software,10000,1


In [13]:
# Sum and mean side by side; margins=True asks pandas to add grand totals.
# Aggregations are given by name to avoid the deprecation warning for NumPy callables.
pd.pivot_table(df, index=["Manager", "Rep", "Product"],
               values=["Price", "Quantity"],
               aggfunc=["sum", "mean"], fill_value=0, margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum,sum,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Price,Quantity,Price,Quantity
Manager,Rep,Product,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Debra Henley,Craig Booker,CPU,65000.0,2.0,32500.0,1.0
Debra Henley,Craig Booker,Maintenance,5000.0,2.0,5000.0,2.0
Debra Henley,Craig Booker,Software,10000.0,1.0,10000.0,1.0
Debra Henley,Daniel Hilton,CPU,105000.0,4.0,52500.0,2.0
Debra Henley,Daniel Hilton,Software,10000.0,1.0,10000.0,1.0
Debra Henley,John Smith,CPU,35000.0,1.0,35000.0,1.0
Debra Henley,John Smith,Maintenance,5000.0,2.0,5000.0,2.0
Fred Anderson,Cedric Moss,CPU,95000.0,3.0,47500.0,1.5
Fred Anderson,Cedric Moss,Maintenance,5000.0,1.0,5000.0,1.0
Fred Anderson,Cedric Moss,Software,10000.0,1.0,10000.0,1.0


In [14]:
# Total Price per (Manager, Status); margins=True appends the "All" grand-total row.
# "sum" is given by name to avoid the deprecation warning for NumPy callables.
pd.pivot_table(df, index=["Manager", "Status"], values=["Price"],
               aggfunc=["sum"], fill_value=0, margins=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,Price
Manager,Status,Unnamed: 2_level_2
Debra Henley,won,65000.0
Debra Henley,pending,50000.0
Debra Henley,presented,50000.0
Debra Henley,declined,70000.0
Fred Anderson,won,172000.0
Fred Anderson,pending,5000.0
Fred Anderson,presented,45000.0
Fred Anderson,declined,65000.0
All,,522000.0


In [15]:
# A dict maps each value column to its own aggregation: row count for Quantity,
# total for Price. "sum" is given by name to avoid the deprecation warning for
# NumPy callables.
pd.pivot_table(df, index=["Manager", "Status"], columns=["Product"],
               values=["Quantity", "Price"],
               aggfunc={"Quantity": len, "Price": "sum"}, fill_value=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Quantity,Quantity,Quantity,Price,Price,Price,Price
Unnamed: 0_level_1,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Debra Henley,won,1,0,0,0,65000,0,0,0
Debra Henley,pending,1,2,0,0,40000,10000,0,0
Debra Henley,presented,1,0,0,2,30000,0,0,20000
Debra Henley,declined,2,0,0,0,70000,0,0,0
Fred Anderson,won,2,1,0,0,165000,7000,0,0
Fred Anderson,pending,0,1,0,0,0,5000,0,0
Fred Anderson,presented,1,0,1,1,30000,0,5000,10000
Fred Anderson,declined,1,0,0,0,65000,0,0,0


In [55]:
# A dict value may itself be a list, giving several aggregations for one column
# (sum and mean of Price here). Aggregations are given by name to avoid the
# deprecation warning for NumPy callables.
table = pd.pivot_table(df, index=["Manager", "Status"], columns=["Product"],
                       values=["Quantity", "Price"],
                       aggfunc={"Quantity": len, "Price": ["sum", "mean"]},
                       fill_value=0)
table

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Quantity,Quantity,Quantity,Price,Price,Price,Price,Price,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,len,len,len,len,mean,mean,mean,mean,sum,sum,sum,sum
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Status,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Debra Henley,won,1,0,0,0,65000,0,0,0,65000,0,0,0
Debra Henley,pending,1,2,0,0,40000,5000,0,0,40000,10000,0,0
Debra Henley,presented,1,0,0,2,30000,0,0,10000,30000,0,0,20000
Debra Henley,declined,2,0,0,0,35000,0,0,0,70000,0,0,0
Fred Anderson,won,2,1,0,0,82500,7000,0,0,165000,7000,0,0
Fred Anderson,pending,0,1,0,0,0,5000,0,0,0,5000,0,0
Fred Anderson,presented,1,0,1,1,30000,0,5000,10000,30000,0,5000,10000
Fred Anderson,declined,1,0,0,0,65000,0,0,0,65000,0,0,0


In [17]:
# query() can filter on an index level by name: keep only Debra Henley's rows.
table.query('Manager == ["Debra Henley"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Quantity,Quantity,Quantity,Price,Price,Price,Price,Price,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,len,len,len,len,mean,mean,mean,mean,sum,sum,sum,sum
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Status,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Debra Henley,won,1,0,0,0,65000,0,0,0,65000,0,0,0
Debra Henley,pending,1,2,0,0,40000,5000,0,0,40000,10000,0,0
Debra Henley,presented,1,0,0,2,30000,0,0,10000,30000,0,0,20000
Debra Henley,declined,2,0,0,0,35000,0,0,0,70000,0,0,0


In [18]:
# Filter on the Status index level: keep only the pending and won rows.
table.query('Status == ["pending","won"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,Quantity,Quantity,Quantity,Quantity,Price,Price,Price,Price,Price,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,len,len,len,len,mean,mean,mean,mean,sum,sum,sum,sum
Unnamed: 0_level_2,Product,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software,CPU,Maintenance,Monitor,Software
Manager,Status,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
Debra Henley,won,1,0,0,0,65000,0,0,0,65000,0,0,0
Debra Henley,pending,1,2,0,0,40000,5000,0,0,40000,10000,0,0
Fred Anderson,won,2,1,0,0,82500,7000,0,0,165000,7000,0,0
Fred Anderson,pending,0,1,0,0,0,5000,0,0,0,5000,0,0
