# Reshaping
Reference: https://pandas.pydata.org/docs/user_guide/reshaping.html

In [38]:
import pandas as pd
from pandas import DataFrame as DF
import numpy as np

# 1. Long to wide (pivot)

In [39]:
# Data generation (no need to understand this chunk of code)
np.random.seed(0)


def unpivot(frame):
    """Turn a wide frame into long-format (date, variable, value) records.

    Parameters
    ----------
    frame : DataFrame
        Wide table; its index supplies the 'date' values and its column
        labels supply the 'variable' values.

    Returns
    -------
    DataFrame
        One row per cell of ``frame``, with columns 'date', 'variable' and
        'value'. Cells are emitted column by column (column-major order).
    """
    n_rows, n_cols = frame.shape
    data = {'value': frame.values.ravel('F'),
            'variable': np.asarray(frame.columns).repeat(n_rows),
            'date': np.tile(np.asarray(frame.index), n_cols)}
    return pd.DataFrame(data, columns=['date', 'variable', 'value'])


# pandas.util.testing (and its makeTimeDataFrame helper) is deprecated and
# missing from recent pandas releases, so the 3x4 business-day frame is built
# explicitly. Drawing a (4, 3) block and transposing fills the columns A..D
# one after another, consuming the seeded random stream column by column.
df = unpivot(pd.DataFrame(np.random.randn(4, 3).T,
                          index=pd.bdate_range('2000-01-03', periods=3),
                          columns=list('ABCD')))
df

Unnamed: 0,date,variable,value
0,2000-01-03,A,1.764052
1,2000-01-04,A,0.400157
2,2000-01-05,A,0.978738
3,2000-01-03,B,2.240893
4,2000-01-04,B,1.867558
5,2000-01-05,B,-0.977278
6,2000-01-03,C,0.950088
7,2000-01-04,C,-0.151357
8,2000-01-05,C,-0.103219
9,2000-01-03,D,0.410599


In [40]:
# one column with values
df.pivot(index='date', columns='variable', values='value')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,1.764052,2.240893,0.950088,0.410599
2000-01-04,0.400157,1.867558,-0.151357,0.144044
2000-01-05,0.978738,-0.977278,-0.103219,1.454274


In [41]:
df.drop(0).pivot(index='date', columns='variable', values='value')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,,2.240893,0.950088,0.410599
2000-01-04,0.400157,1.867558,-0.151357,0.144044
2000-01-05,0.978738,-0.977278,-0.103219,1.454274


In [42]:
# multiple columns with values
df['value2'] = df['value'] * 2
df

Unnamed: 0,date,variable,value,value2
0,2000-01-03,A,1.764052,3.528105
1,2000-01-04,A,0.400157,0.800314
2,2000-01-05,A,0.978738,1.957476
3,2000-01-03,B,2.240893,4.481786
4,2000-01-04,B,1.867558,3.735116
5,2000-01-05,B,-0.977278,-1.954556
6,2000-01-03,C,0.950088,1.900177
7,2000-01-04,C,-0.151357,-0.302714
8,2000-01-05,C,-0.103219,-0.206438
9,2000-01-03,D,0.410599,0.821197


In [43]:
df2 = df.pivot(index='date', columns='variable')
df2

Unnamed: 0_level_0,value,value,value,value,value2,value2,value2,value2
variable,A,B,C,D,A,B,C,D
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2000-01-03,1.764052,2.240893,0.950088,0.410599,3.528105,4.481786,1.900177,0.821197
2000-01-04,0.400157,1.867558,-0.151357,0.144044,0.800314,3.735116,-0.302714,0.288087
2000-01-05,0.978738,-0.977278,-0.103219,1.454274,1.957476,-1.954556,-0.206438,2.908547


In [44]:
df2['value2']

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,3.528105,4.481786,1.900177,0.821197
2000-01-04,0.800314,3.735116,-0.302714,0.288087
2000-01-05,1.957476,-1.954556,-0.206438,2.908547


# 2. Wide to long (melt)

In [45]:
# Small wide table: one row per person, two identifier columns
# ('first', 'last') and two measurement columns ('height', 'weight').
cheese = pd.DataFrame(dict(
    first=['John', 'Mary'],
    last=['Doe', 'Bo'],
    height=[5.5, 6.0],
    weight=[130, 150],
))
cheese

Unnamed: 0,first,height,last,weight
0,John,5.5,Doe,130
1,Mary,6.0,Bo,150


In [46]:
# every column not listed in id_vars is unpivoted: its name goes into the
# 'variable' column and its values go into the 'value' column
cheese.melt(id_vars=['first', 'last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [47]:
cheese.melt(id_vars=['first'], value_vars=['height','weight'])

Unnamed: 0,first,variable,value
0,John,height,5.5
1,Mary,height,6.0
2,John,weight,130.0
3,Mary,weight,150.0


In [48]:
cheese.melt?

In [49]:
# change the names of resulting columns in the data
cheese2 = cheese.melt(id_vars=['first', 'last'], var_name='attribute', value_name='measured')
cheese2

Unnamed: 0,first,last,attribute,measured
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


### Can you pivot back to the original wide-format data?

In [50]:
cheese2.pivot(index='first',columns='attribute',values='measured')

attribute,height,weight
first,Unnamed: 1_level_1,Unnamed: 2_level_1
John,5.5,130.0
Mary,6.0,150.0


In [51]:
df2 = df.pivot(index='date', columns='variable', values='value')
df2

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,1.764052,2.240893,0.950088,0.410599
2000-01-04,0.400157,1.867558,-0.151357,0.144044
2000-01-05,0.978738,-0.977278,-0.103219,1.454274


In [52]:
df2.melt()

Unnamed: 0,variable,value
0,A,1.764052
1,A,0.400157
2,A,0.978738
3,B,2.240893
4,B,1.867558
5,B,-0.977278
6,C,0.950088
7,C,-0.151357
8,C,-0.103219
9,D,0.410599


In [53]:
# melt() does not carry the index into its result (see the previous output),
# so copy the dates into an ordinary column and keep it as an identifier
df2['date'] = df2.index
df2.melt(id_vars='date')

Unnamed: 0,date,variable,value
0,2000-01-03,A,1.764052
1,2000-01-04,A,0.400157
2,2000-01-05,A,0.978738
3,2000-01-03,B,2.240893
4,2000-01-04,B,1.867558
5,2000-01-05,B,-0.977278
6,2000-01-03,C,0.950088
7,2000-01-04,C,-0.151357
8,2000-01-05,C,-0.103219
9,2000-01-03,D,0.410599


# 3. Stacking and unstacking
Stacking pivots a level of the column labels into the row index.
Unstacking does the reverse: it pivots a level of the row index into the column labels.
## 3.1 One Level Label on Column

In [54]:
# All (first, second) label pairs, with the outer label varying slowest.
tuples = [(outer, inner)
          for outer in ('bar', 'baz', 'foo', 'qux')
          for inner in ('one', 'two')]
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [55]:
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second'])

In [56]:
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.761038,0.121675
bar,two,0.443863,0.333674
baz,one,1.494079,-0.205158
baz,two,0.313068,-0.854096
foo,one,-2.55299,0.653619
foo,two,0.864436,-0.742165
qux,one,2.269755,-1.454366
qux,two,0.045759,-0.187184


In [57]:
df2 = df[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.761038,0.121675
bar,two,0.443863,0.333674
baz,one,1.494079,-0.205158
baz,two,0.313068,-0.854096


In [61]:
stacked = df2.stack()
stacked

first  second   
bar    one     A    0.761038
               B    0.121675
       two     A    0.443863
               B    0.333674
baz    one     A    1.494079
               B   -0.205158
       two     A    0.313068
               B   -0.854096
dtype: float64

In [62]:
stacked.index

MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two'], ['A', 'B']],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1], [0, 0, 1, 1, 0, 0, 1, 1], [0, 1, 0, 1, 0, 1, 0, 1]],
           names=['first', 'second', None])

In [69]:
stacked

first  second   
bar    one     A    0.761038
               B    0.121675
       two     A    0.443863
               B    0.333674
baz    one     A    1.494079
               B   -0.205158
       two     A    0.313068
               B   -0.854096
dtype: float64

In [68]:
stacked.unstack()


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.761038,0.121675
bar,two,0.443863,0.333674
baz,one,1.494079,-0.205158
baz,two,0.313068,-0.854096


In [73]:
# explicitly specify which level of index to use as column names
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,0.761038,0.443863
bar,B,0.121675,0.333674
baz,A,1.494079,0.313068
baz,B,-0.205158,-0.854096


In [75]:
# a level can also be selected by its name instead of its position
stacked.unstack('first')

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,0.761038,1.494079
one,B,0.121675,-0.205158
two,A,0.443863,0.313068
two,B,0.333674,-0.854096


## 3.2 Multiple Level Labels on Columns

In [76]:
# Three-level column index (experiment, animal, hair length).
# The pasted IPython "....:" continuation prompts were removed so the cell
# is valid Python outside IPython as well.
columns = pd.MultiIndex.from_tuples(
    [
        ('A', 'cat', 'long'), ('B', 'cat', 'long'),
        ('A', 'dog', 'short'), ('B', 'dog', 'short'),
    ],
    names=['exp', 'animal', 'hair_length'],
)
columns

MultiIndex(levels=[['A', 'B'], ['cat', 'dog'], ['long', 'short']],
           labels=[[0, 1, 0, 1], [0, 0, 1, 1], [0, 0, 1, 1]],
           names=['exp', 'animal', 'hair_length'])

In [77]:
np.random.seed(0)
df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
df

exp,A,B,A,B
animal,cat,cat,dog,dog
hair_length,long,long,short,short
0,1.764052,0.400157,0.978738,2.240893
1,1.867558,-0.977278,0.950088,-0.151357
2,-0.103219,0.410599,0.144044,1.454274
3,0.761038,0.121675,0.443863,0.333674


In [78]:
df.stack(level=['animal', 'hair_length'])

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
Unnamed: 0_level_1,animal,hair_length,Unnamed: 3_level_1,Unnamed: 4_level_1
0,cat,long,1.764052,0.400157
0,dog,short,0.978738,2.240893
1,cat,long,1.867558,-0.977278
1,dog,short,0.950088,-0.151357
2,cat,long,-0.103219,0.410599
2,dog,short,0.144044,1.454274
3,cat,long,0.761038,0.121675
3,dog,short,0.443863,0.333674


In [79]:
df.stack(level=[2,0])

Unnamed: 0_level_0,Unnamed: 1_level_0,animal,cat,dog
Unnamed: 0_level_1,hair_length,exp,Unnamed: 3_level_1,Unnamed: 4_level_1
0,long,A,1.764052,
0,long,B,0.400157,
0,short,A,,0.978738
0,short,B,,2.240893
1,long,A,1.867558,
1,long,B,-0.977278,
1,short,A,,0.950088
1,short,B,,-0.151357
2,long,A,-0.103219,
2,long,B,0.410599,


In [80]:
# Frame with a two-level column index (exp, animal) and a two-level row
# index (first, second). The pasted IPython "....:" continuation prompts were
# removed so the cell is valid Python outside IPython as well.
columns = pd.MultiIndex.from_tuples(
    [('A', 'cat'), ('B', 'dog'), ('B', 'cat'), ('A', 'dog')],
    names=['exp', 'animal'],
)
index = pd.MultiIndex.from_product(
    [('bar', 'baz', 'foo', 'qux'), ('one', 'two')],
    names=['first', 'second'],
)
# Re-seed so the random values are reproducible from this cell onwards.
np.random.seed(0)
df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns)
df

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,1.764052,0.400157,0.978738,2.240893
bar,two,1.867558,-0.977278,0.950088,-0.151357
baz,one,-0.103219,0.410599,0.144044,1.454274
baz,two,0.761038,0.121675,0.443863,0.333674
foo,one,1.494079,-0.205158,0.313068,-0.854096
foo,two,-2.55299,0.653619,0.864436,-0.742165
qux,one,2.269755,-1.454366,0.045759,-0.187184
qux,two,1.532779,1.469359,0.154947,0.378163


In [81]:
df2 = df.iloc[[0, 1, 2, 4, 5, 7]]
df2

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,1.764052,0.400157,0.978738,2.240893
bar,two,1.867558,-0.977278,0.950088,-0.151357
baz,one,-0.103219,0.410599,0.144044,1.454274
foo,one,1.494079,-0.205158,0.313068,-0.854096
foo,two,-2.55299,0.653619,0.864436,-0.742165
qux,two,1.532779,1.469359,0.154947,0.378163


In [82]:
df2.stack('animal')

Unnamed: 0_level_0,Unnamed: 1_level_0,exp,A,B
first,second,animal,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,cat,1.764052,0.978738
bar,one,dog,2.240893,0.400157
bar,two,cat,1.867558,0.950088
bar,two,dog,-0.151357,-0.977278
baz,one,cat,-0.103219,0.144044
baz,one,dog,1.454274,0.410599
foo,one,cat,1.494079,0.313068
foo,one,dog,-0.854096,-0.205158
foo,two,cat,-2.55299,0.864436
foo,two,dog,-0.742165,0.653619


In [83]:
df3 = df.iloc[[0, 1, 4, 7], [1, 2]]
df3

Unnamed: 0_level_0,exp,B,B
Unnamed: 0_level_1,animal,dog,cat
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2
bar,one,0.400157,0.978738
bar,two,-0.977278,0.950088
foo,one,-0.205158,0.313068
qux,two,1.469359,0.154947


In [84]:
# with no argument, unstack() moves the innermost row-index level ('second')
# into the columns; (first, second) pairs absent from df3 are left empty (NaN)
df3.unstack()

exp,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,0.400157,-0.977278,0.978738,0.950088
foo,-0.205158,,0.313068,
qux,,1.469359,,0.154947


In [85]:
df3.unstack(fill_value=-1)

exp,B,B,B,B
animal,dog,dog,cat,cat
second,one,two,one,two
first,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
bar,0.400157,-0.977278,0.978738,0.950088
foo,-0.205158,-1.0,0.313068,-1.0
qux,-1.0,1.469359,-1.0,0.154947


In [None]:
df4 = df[:3]
df4

In [None]:
df4.unstack(1)

In [None]:
df4.unstack?

## 4. Data Aggregation
### 4.1 GroupBy

In [86]:
# Frame with a two-level column index (exp, animal) and a two-level row
# index (first, second), used by the group-by examples that follow. The
# pasted IPython "....:" continuation prompts were removed so the cell is
# valid Python outside IPython as well.
columns = pd.MultiIndex.from_tuples(
    [('A', 'cat'), ('B', 'dog'), ('B', 'cat'), ('A', 'dog')],
    names=['exp', 'animal'],
)
index = pd.MultiIndex.from_product(
    [('bar', 'baz', 'foo', 'qux'), ('one', 'two')],
    names=['first', 'second'],
)
# Re-seed so the random values are reproducible from this cell onwards.
np.random.seed(0)
df = pd.DataFrame(np.random.randn(8, 4), index=index, columns=columns)
df

Unnamed: 0_level_0,exp,A,B,B,A
Unnamed: 0_level_1,animal,cat,dog,cat,dog
first,second,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
bar,one,1.764052,0.400157,0.978738,2.240893
bar,two,1.867558,-0.977278,0.950088,-0.151357
baz,one,-0.103219,0.410599,0.144044,1.454274
baz,two,0.761038,0.121675,0.443863,0.333674
foo,one,1.494079,-0.205158,0.313068,-0.854096
foo,two,-2.55299,0.653619,0.864436,-0.742165
qux,one,2.269755,-1.454366,0.045759,-0.187184
qux,two,1.532779,1.469359,0.154947,0.378163


In [87]:
# Average the columns that share the same 'animal' label (column level 1).
# groupby(axis=1) is deprecated in pandas, so group the transposed frame by
# its row level instead and transpose the result back.
df.T.groupby(level=1).mean().T

Unnamed: 0_level_0,animal,cat,dog
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,1.371395,1.320525
bar,two,1.408823,-0.564318
baz,one,0.020412,0.932436
baz,two,0.60245,0.227675
foo,one,0.903573,-0.529627
foo,two,-0.844277,-0.044273
qux,one,1.157757,-0.820775
qux,two,0.843863,0.923761


In [88]:
# Average the rows that share the same 'first' label (row-index level 0).
# Rows are grouped by default; the deprecated `axis` keyword is not needed.
df.groupby(level=0).mean()

exp,A,B,B,A
animal,cat,dog,cat,dog
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1.815805,-0.28856,0.964413,1.044768
baz,0.328909,0.266137,0.293953,0.893974
foo,-0.529455,0.22423,0.588752,-0.79813
qux,1.901267,0.007497,0.100353,0.095489


In [89]:
# Take the maximum over rows that share the same 'first' label (level 0).
# Rows are grouped by default; the deprecated `axis` keyword is not needed.
df.groupby(level=0).max()

exp,A,B,B,A
animal,cat,dog,cat,dog
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,1.867558,0.400157,0.978738,2.240893
baz,0.761038,0.410599,0.443863,1.454274
foo,1.494079,0.653619,0.864436,-0.742165
qux,2.269755,1.469359,0.154947,0.378163


### 4.2 Pivot with data aggregation

In [98]:
import datetime

# Twelve observations: three categorical keys (A, B, C), two columns of
# random draws (D, E) and a date column F holding the 1st of each month
# January-June 2013 followed by the 15th of each of those months.
df = pd.DataFrame({
    'A': 3 * ['one', 'one', 'two', 'three'],
    'B': 4 * ['A', 'B', 'C'],
    'C': 2 * (['foo'] * 3 + ['bar'] * 3),
    'D': np.random.randn(12),
    'E': np.random.randn(12),
    'F': [datetime.datetime(2013, month, day)
          for day in (1, 15)
          for month in range(1, 7)],
})
# Overwrite the keys of the last row so that the ('one', 'A', 'foo')
# combination occurs twice and pivot_table has something to aggregate.
df.iloc[11, :3] = ['one', 'A', 'foo']
df

Unnamed: 0,A,B,C,D,E,F
0,one,A,foo,0.672295,-1.491258,2013-01-01
1,one,B,foo,0.407462,0.439392,2013-02-01
2,two,C,foo,-0.769916,0.166673,2013-03-01
3,three,A,bar,0.539249,0.635031,2013-04-01
4,one,B,bar,-0.674333,2.383145,2013-05-01
5,one,C,bar,0.031831,0.944479,2013-06-01
6,two,A,foo,-0.635846,-0.912822,2013-01-15
7,three,B,foo,0.676433,1.117016,2013-02-15
8,one,C,foo,0.576591,-1.315907,2013-03-15
9,one,A,bar,-0.208299,-0.461585,2013-04-15


In [99]:
df.pivot_table(values='D', index=['A', 'B'], columns=['C'])

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.208299,-0.210383
one,B,-0.674333,0.407462
one,C,0.031831,0.576591
three,A,0.539249,
three,B,,0.676433
two,A,,-0.635846
two,B,0.396007,
two,C,,-0.769916


In [100]:
# Aggregate duplicate (A, B, C) combinations by summing D instead of taking
# the default mean. The aggregation is named by string because passing the
# builtin `sum` callable is deprecated in recent pandas.
df.pivot_table(values='D', index=['A', 'B'], columns=['C'], aggfunc='sum')

Unnamed: 0_level_0,C,bar,foo
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.208299,-0.420767
one,B,-0.674333,0.407462
one,C,0.031831,0.576591
three,A,0.539249,
three,B,,0.676433
two,A,,-0.635846
two,B,0.396007,
two,C,,-0.769916
