### Series

In [40]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from IPython.display import display

Series is a one-dimensional array-like object

In [6]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

You can parse out the values and the index of a Series

In [8]:
# Show the two halves of a Series: the underlying ndarray and its index.
# (Single-argument print(...) runs under both Python 2 and Python 3.)
print(obj.values)
print(obj.index)

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


You can specify what the index is

In [13]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

You can change values based on the index

In [14]:
obj2['b']
obj2['b'] = 6
obj2

d    4
b    6
a   -5
c    3
dtype: int64

Basic operations:

In [26]:
# Boolean filtering, scalar arithmetic and NumPy ufuncs all return a new
# Series that keeps the original index labels.
print(obj2[obj2 > 0])
print('\n')
print(obj2 * 2)
print('\n')
print(np.exp(obj2))

d    4
b    6
c    3
dtype: int64


d     8
b    12
a   -10
c     6
dtype: int64


d     54.598150
b    403.428793
a      0.006738
c     20.085537
dtype: float64


You can convert a dict into a Series

In [22]:
sdata = {'Ohio':35000, 'Texas':70000, 'Oregon':16000, 'Utah':50000}
obj3 = Series(sdata)
obj3

Ohio      35000
Oregon    16000
Texas     70000
Utah      50000
dtype: int64

You can match the Series index with a list

In [23]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         70000.0
dtype: float64

Detecting nulls

In [25]:
# Null detection is available both as top-level functions and as methods.
print(pd.isnull(obj4))
print('\n')
print(pd.notnull(obj4))
print('\n')
print(obj4.isnull())

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


Series align indexes

In [29]:
# Adding two Series aligns them on index labels; labels present in only one
# operand produce NaN in the result.
print(obj3)
print('\n')
print(obj4)
print('\n')
print(obj3 + obj4)

Ohio      35000
Oregon    16000
Texas     70000
Utah      50000
dtype: int64


California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         70000.0
dtype: float64


California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         140000.0
Utah               NaN
dtype: float64


Both the Series object and its index have a `name` attribute:

In [30]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         70000.0
Name: population, dtype: float64

A Series' index can be altered in place by assignment

In [31]:
obj.index = ['Bob','Steve','Jeff','Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

### DataFrames

In [174]:
# A dict of equal-length lists becomes a DataFrame with one column per key.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
print(data)
frame = DataFrame(data)
print('\n')
print(frame)
print('\n')
# Build one string so the call prints the same text on Python 2 and 3
# (a two-argument print(...) would print a tuple on Python 2).
print('Index: {}'.format(frame.index))

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 'pop': [1.5, 1.7, 3.6, 2.4, 2.9], 'year': [2000, 2001, 2002, 2001, 2002]}


   pop   state  year
0  1.5    Ohio  2000
1  1.7    Ohio  2001
2  3.6    Ohio  2002
3  2.4  Nevada  2001
4  2.9  Nevada  2002


Index: RangeIndex(start=0, stop=5, step=1)


You can explicitly specify the order of the columns in the output:

In [39]:
DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


Requesting a column that is not in the data creates it, filled with NaN values:

In [175]:
# 'debt' is not a key of `data`, so that column is created and filled with NaN.
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
display(frame2)
print(frame2.columns)

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


Index([u'year', u'state', u'pop', u'debt'], dtype='object')


You can retrieve columns using names or by attribute

In [48]:
display(frame2['state'])
display(frame2.state)

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

Retrieving rows by index:

In [51]:
# `.ix` was removed in pandas 1.0; `.loc` is the label-based row selector.
display(frame2.loc['three'])
display(frame2.loc[['three', 'four']])

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

Unnamed: 0,year,state,pop,debt
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,


In [177]:
# `.ix` was removed in pandas 1.0. On this string index `.ix[1:]` fell back to
# a positional slice; the label-based equivalent starts at row 'two'.
frame2.loc['two':]

Unnamed: 0,year,state,pop,debt
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [178]:
frame2.iloc[1:]

Unnamed: 0,year,state,pop,debt
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


_Note:_  
- `loc` works on labels in the index.  
- `iloc` works on the positions in the index (so it only takes integers).  
- `ix` usually tries to behave like `loc` but falls back to behaving like `iloc` if the label is not in the index. It was deprecated in pandas 0.20 and removed in pandas 1.0, so prefer `loc`/`iloc`.

Modifying columns

In [52]:
frame2['debt'] = 16.5
display(frame2)
frame2['debt'] = np.arange(5.)
display(frame2)

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


You can assign a Series to a column in a DataFrame; it is aligned on the DataFrame's index, and rows whose label is missing from the Series get NaN

In [53]:
val = Series([-1.2,-1.5,-1.7], index=['two','four','five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


Assigning a column that doesn't exist will create a new column:

In [55]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


Deleting a column:

In [56]:
del frame2['eastern']
frame2.columns

Index([u'year', u'state', u'pop', u'debt'], dtype='object')

Nested dicts: inner keys will be interpreted as indices

In [57]:
# Outer keys become columns, inner keys become the row index; combinations
# that are missing from the nested dict (Nevada/2000) become NaN.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
print(pop)
frame3 = DataFrame(pop)
frame3

{'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}, 'Nevada': {2001: 2.4, 2002: 2.9}}


Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


Transpose

In [58]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


You can always explicitly state your index:

In [61]:
display(pop)
display(DataFrame(pop, index=[2001,2002,2003]))

{'Nevada': {2001: 2.4, 2002: 2.9}, 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


Dict of Series

In [66]:
# A dict of Series is handled like a nested dict: each Series' index supplies
# the row labels for its column.
print(frame3)
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
print('\n')
print(pdata)
display(DataFrame(pdata))

      Nevada  Ohio
2000     NaN   1.5
2001     2.4   1.7
2002     2.9   3.6


{'Ohio': 2000    1.5
2001    1.7
Name: Ohio, dtype: float64, 'Nevada': 2000    NaN
2001    2.4
Name: Nevada, dtype: float64}


Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7


You can name the indices AND the columns:

In [67]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


Values are returned as a 2D ndarray

In [68]:
frame3.values

array([[ nan,  1.5],
       [ 2.4,  1.7],
       [ 2.9,  3.6]])

When the columns have different dtypes, the values array gets a dtype that can hold all of them (here `object`)

In [69]:
frame2.values

array([[2000L, 'Ohio', 1.5, nan],
       [2001L, 'Ohio', 1.7, -1.2],
       [2002L, 'Ohio', 3.6, nan],
       [2001L, 'Nevada', 2.4, -1.5],
       [2002L, 'Nevada', 2.9, -1.7]], dtype=object)

### Index Objects

In [70]:
# The axis labels of a Series live in an Index object, which can be sliced
# like a sequence.
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index)
print('\n')
print(index[1:])

Index([u'a', u'b', u'c'], dtype='object')


Index([u'b', u'c'], dtype='object')


Index objects are immutable

In [71]:
# Index objects are immutable, so item assignment raises TypeError.
# The error is caught and printed so the notebook still runs top to bottom.
try:
    index[1] = 'd'
except TypeError as err:
    print('TypeError: {}'.format(err))

TypeError: Index does not support mutable operations

This way you can share indices among data structures

In [72]:
# An Index built up front can be handed to several data structures.
index = pd.Index(np.arange(3))
print(index)
obj2 = Series([1.5, -2.5, 0], index=index)
print(obj2.index is index)

Int64Index([0, 1, 2], dtype='int64')
True


Indices are also fixed-size sets:

In [73]:
# Membership tests work directly on the column and row Index objects.
print(frame3)
print('\n')
print('Ohio' in frame3.columns)
print('\n')
print(2003 in frame3.index)

state  Nevada  Ohio
year               
2000      NaN   1.5
2001      2.4   1.7
2002      2.9   3.6


True


False


Reindexing

In [78]:
# reindex returns a new Series laid out in the requested label order; labels
# that were not present get NaN unless fill_value says otherwise.
obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)
print('\n')
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)
print('\n')
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)
print(obj2)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64


a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64


You can also fill in values while reindexing

In [79]:
# method='ffill' carries the last seen value forward into the new labels.
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
print('\n')
print(obj3.reindex(range(6), method='ffill'))

0      blue
2    purple
4    yellow
dtype: object


0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


Reindexing works with rows and with columns:

In [80]:
# Passing a single sequence to DataFrame.reindex reindexes the rows; the new
# row 'b' is filled with NaN (which turns the integer columns into floats).
frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],
                  columns=['Ohio', 'Texas', 'California'])
print(frame)
print('\n')
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8


   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0


Reindexing columns

In [81]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


Both at the same time

In [82]:
# Forward-filling only applies to the (sorted) row labels, so fill the rows
# first and select the columns in a second step: recent pandas versions
# reject method='ffill' when the requested column order is not monotonic.
frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill').reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


This can be done much more succinctly

In [83]:
# `.ix` was removed in pandas 1.0, and `.loc` raises KeyError for labels that
# do not exist ('b', 'Utah'), so reindex is used to request them explicitly.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


Dropping entries from a Series

In [84]:
# drop returns a new Series without the given label(s); `obj` is unchanged.
obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)
new_obj = obj.drop('c')
print('\n')
print(new_obj)
print('\n')
print(obj.drop(['d', 'c']))

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64


a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64


a    0.0
b    1.0
e    4.0
dtype: float64


You can drop columns or index values in the same way

In [88]:
# drop removes rows by default; axis=1 removes columns instead.
data = DataFrame(np.arange(16).reshape(4, 4),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data)
print('\n')
print(data.drop(['Colorado', 'Ohio']))
print('\n')
print(data.drop('two', axis=1))
print('\n')
print(data.drop(['two', 'four'], axis=1))

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15


          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


### Selection and filtering

Series indexing

In [90]:
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
print('\n')
print(obj['b'])            # select by label
print('\n')
# Select by position with .iloc: a bare integer key on a label-indexed Series
# relies on a positional fallback that newer pandas versions deprecate.
print(obj.iloc[1])
print('\n')
print(obj.iloc[2:4])
print('\n')
print(obj[['c', 'd']])     # select several labels at once

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64


1.0


1.0


c    2.0
d    3.0
dtype: float64


c    2.0
d    3.0
dtype: float64


Selecting by position and filtering by values (boolean mask)

In [91]:
# Positional selection goes through .iloc (integer keys on a label-indexed
# Series are deprecated); a boolean mask filters by value.
print(obj.iloc[[1, 3]])
print('\n')
print(obj[obj < 2])

b    1.0
d    3.0
dtype: float64


a    0.0
b    1.0
dtype: float64


Slicing with labels is endpoint-inclusive (unlike positional slicing)

In [92]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

Setting

In [93]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

You can retrieve columns as a single value or a sequence

In [97]:
data = DataFrame(np.arange(16).reshape(4, 4),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data)
print('\n')
print(data['two'])
print('\n')
print(data[['three', 'one']])
print('\n')
# `.ix` was removed in pandas 1.0: `.loc` selects by label, `.iloc` by position.
print(data.loc['Ohio'])
print('\n')
print(data.loc['Ohio'].iloc[1])
print('\n')
print(data.loc['Ohio'].iloc[1:2])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32


          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12


one      0
two      1
three    2
four     3
Name: Ohio, dtype: int32


1


two    1
Name: Ohio, dtype: int32


Selecting rows by slicing

In [100]:
# Slices and boolean Series select rows; a boolean DataFrame masks individual
# cells (non-selected cells become NaN). None of this modifies `data`.
print(data)
print('\n')
print(data[:2])
print('\n')
print(data[data['three'] > 5])
print('\n')
print(data['three'] > 5)
print('\n')
print(data < 5)
print('\n')
print(data[data < 5])
print('\n')
print(data[data < 5] == 0)
print('\n')
print(data)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7


          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool


            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False


          one  two  three  four
Ohio      0.0  1.0    2.0   3.0
Colorado  4.0  NaN    NaN   NaN
Utah      NaN  NaN    NaN   NaN
New York  NaN  NaN    NaN   NaN


            one    two  three   four
Ohio       True  False  False  False
Colorado  False  False  False  False
Utah      False  False  False  False
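New York  False  False  False  False


          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15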

The `ix` indexer (removed in pandas 1.0; use `loc` for labels and `iloc` for integer positions) allows you to select rows and columns using names OR positions:

In [104]:
# `.ix` was removed in pandas 1.0. Each mixed label/position lookup is spelled
# with `.loc` (labels, boolean masks) and `.iloc` (integer positions).
print(data)
print('\n')
print(data.loc[['Colorado', 'Utah']].iloc[:, [3, 0, 1]])
print('\n')
print(data.iloc[2])
print('\n')
print(data.loc[:'Utah', 'two'])
print('\n')
print(data.loc[data.three > 5].iloc[:, :3])

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


          four  one  two
Colorado     7    4    5
Utah        11    8    9


one       8
two       9
three    10
four     11
Name: Utah, dtype: int32


Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32


          one  two  three
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


_Note:_ label-based slices (e.g. `:'Utah'`) include the endpoint, while integer-position slices (e.g. `:3`) exclude it

### Arithmetic and Data Alignment

Non-matching indices will be unioned when adding two objects (and return NaNs):

In [105]:
# The result index is the union of both indexes; labels found in only one
# operand ('d', 'f', 'g') come out as NaN.
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print('\n')
print(s2)
print('\n')
print(s1 + s2)

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64


a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64


a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64


Let's see the same with a DataFrame

In [106]:
# For DataFrames the alignment happens on both the rows and the columns.
df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print('\n')
print(df2)
print('\n')
print(df1 + df2)

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0


          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


To treat a value that is missing from one of the frames as 0, use the `add` method with `fill_value` (cells missing from both frames stay NaN)

In [107]:
df1.add(df2, fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


Operations between a dataframe and a series

In [109]:
# Plain NumPy first: subtracting a row from a 2-D array broadcasts the
# subtraction over every row.
arr = np.arange(12.).reshape((3, 4))
print(arr)
print('\n')
print(arr[0])
print('\n')
print(arr - arr[0])

[[  0.   1.   2.   3.]
 [  4.   5.   6.   7.]
 [  8.   9.  10.  11.]]


[ 0.  1.  2.  3.]


[[ 0.  0.  0.  0.]
 [ 4.  4.  4.  4.]
 [ 8.  8.  8.  8.]]


Arithmetic between a DataFrame and a Series matches the Series' index against the DataFrame's columns and broadcasts down the rows

In [110]:
frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# First row by position (`.ix` was removed in pandas 1.0).
series = frame.iloc[0]
print(frame)
print('\n')
print(series)
print('\n')
# The Series index is matched against the frame's columns and the
# subtraction is repeated for every row.
print(frame - series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64


          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


If an index value is not found in either the DataFrame's columns or the Series' index, the objects are reindexed to form the union

In [111]:
# 'f' is not a column of `frame` and 'd' is not in `series2`, so both show up
# as all-NaN columns in the sum.
series2 = Series(range(3), index=['b', 'e', 'f'])
print(series2)
print('\n')
print(frame + series2)

b    0
e    1
f    2
dtype: int64


          b   d     e   f
Utah    0.0 NaN   3.0 NaN
Ohio    3.0 NaN   6.0 NaN
Texas   6.0 NaN   9.0 NaN
Oregon  9.0 NaN  12.0 NaN


Broadcasting over columns, matching on the rows:

In [112]:
# axis=0 matches the Series against the row labels, so column 'd' is
# subtracted from every column.
series3 = frame['d']
print(series3)
print('\n')
print(frame)
print('\n')
print(frame.sub(series3, axis=0))

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64


          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0


Function application and mapping

In [113]:
# NumPy element-wise functions accept a DataFrame and return a DataFrame.
# (Values are random, so the numbers differ from run to run.)
frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print('\n')
print(np.abs(frame))

               b         d         e
Utah   -1.819489 -0.322406  0.004895
Ohio    1.958106  1.393432  0.870651
Texas  -0.272886 -0.590315  0.242086
Oregon -1.917845 -0.585628  1.634763


               b         d         e
Utah    1.819489  0.322406  0.004895
Ohio    1.958106  1.393432  0.870651
Texas   0.272886  0.590315  0.242086
Oregon  1.917845  0.585628  1.634763


The 'apply' method

In [114]:
def f(x):
    """Return the spread (max minus min) of the values in ``x``."""
    return x.max() - x.min()

# apply calls f once per column by default ...
print(frame.apply(f))
print('\n')
# ... and once per row with axis=1.
print(frame.apply(f, axis=1))

b    3.875951
d    1.983746
e    1.629868
dtype: float64


Utah      1.824384
Ohio      1.087455
Texas     0.832401
Oregon    3.552608
dtype: float64


Applymap allows you to apply the function to every cell in the data frame

In [118]:
# Render a number as a string with two decimal places.
# NOTE(review): this name shadows the built-in `format`; a later cell reuses
# it, so rename both together if this is ever cleaned up.
format = lambda x: '%.2f' % x
frame.applymap(format) # applymap calls the function on every cell; DataFrame.apply passes whole columns/rows instead

Unnamed: 0,b,d,e
Utah,-1.82,-0.32,0.0
Ohio,1.96,1.39,0.87
Texas,-0.27,-0.59,0.24
Oregon,-1.92,-0.59,1.63


### Sorting and ranking

In [125]:
# sort_index returns a new Series ordered by label.
obj = Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj)
print('\n')
print(obj.sort_index())

d    0
a    1
b    2
c    3
dtype: int64


a    1
b    2
c    3
d    0
dtype: int64


Sort data frame with the index of either axis

In [126]:
# Rows are sorted by default; axis=1 sorts the column labels instead.
frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],
                  columns=['d', 'a', 'b', 'c'])
print(frame)
print('\n')
print(frame.sort_index())
print('\n')
print(frame.sort_index(axis=1))

       d  a  b  c
three  0  1  2  3
one    4  5  6  7


       d  a  b  c
one    4  5  6  7
three  0  1  2  3


       a  b  c  d
three  1  2  3  0
one    5  6  7  4


Ascending is default, but we can change that

In [127]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


Sorting a series by its values requires the 'sort_values' method:

In [129]:
# sort_values orders by value and carries the original labels along.
obj = Series([4, 7, -3, 2])
print(obj)
print('\n')
print(obj.sort_values())

0    4
1    7
2   -3
3    2
dtype: int64


2   -3
3    2
0    4
1    7
dtype: int64


Missing values are added to the end of the series

In [130]:
# NaN entries are placed after all real values when sorting.
obj = Series([4, np.nan, 7, np.nan, -3, 2])
print(obj)
print('\n')
print(obj.sort_values())

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64


4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64


Sorting a data frame by one or more columns:

In [132]:
# `by` takes one column name, or a list of names used as successive sort keys.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print('\n')
print(frame.sort_values(by='b'))
print('\n')
print(frame.sort_values(by=['a', 'b']))

   a  b
0  0  4
1  1  7
2  0 -3
3  1  2


   a  b
2  0 -3
3  1  2
0  0  4
1  1  7


   a  b
2  0 -3
0  0  4
3  1  2
1  1  7


Rank tells us where each row ranks; ties are broken using the mean ranks:

In [133]:
# Equal values share the mean of the ranks they would otherwise occupy.
obj = Series([7, -5, 7, 4, 2, 0, 4])
print(obj)
print('\n')
print(obj.rank())

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64


0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


The 'first' method breaks ties based on the index of each value:

In [134]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

Ranking in descending order

In [135]:
obj.rank(ascending=False, method='first')

0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64

Data frame ranking can be done on rows OR columns

In [137]:
# rank() ranks within each column; axis=1 ranks within each row.
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1],
                   'c': [-2, 5, 8, -2.5]})
print(frame)
print('\n')
print(frame.rank())
print('\n')
print(frame.rank(axis=1))

   a  b    c
0  0  4 -2.0
1  1  7  5.0
2  0 -3  8.0
3  1  2 -2.5


     a    b    c
0  1.5  3.0  2.0
1  3.5  4.0  3.0
2  1.5  1.0  4.0
3  3.5  2.0  1.0


     a    b    c
0  2.0  3.0  1.0
1  1.0  3.0  2.0
2  2.0  1.0  3.0
3  2.0  3.0  1.0


Axis indexing with duplicate values:

In [140]:
# With duplicate labels, selecting a repeated label returns a Series while a
# unique label still returns a scalar.
obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
print(obj)
print('\n')
print(obj.index.is_unique)
print('\n')
print(obj['a'])
print('\n')
print(obj['c'])

a    0
a    1
b    2
b    3
c    4
dtype: int64


False


a    0
a    1
dtype: int64


4


### Summarizing and Computing Descriptive Statistics

In [141]:
df = DataFrame([[1.4,None], [7.1, -4.5],
               [np.nan, np.nan], [.75, -1.4]], 
                index=['a','b','c','d'],
                columns=['one','two'])
display(df)
display(df.sum())
display(df.sum(axis=1))

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.4


one    9.25
two   -5.90
dtype: float64

a    1.40
b    2.60
c    0.00
d   -0.65
dtype: float64

NA values are excluded from the calculation, unless otherwise specified

In [142]:
df.mean(axis=1, skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.325
dtype: float64

Cumulative methods

In [143]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.9


Summary stats

In [145]:
# describe() gives numeric summary statistics for numeric data ...
print('\n')
print(df.describe())
print('\n')
print(df.describe().applymap(format))
# ... and count/unique/top/freq for non-numeric data.
obj = Series(['a', 'b', 'c', 'c'] * 4)
print('\n')
print(obj)
print('\n')
print(obj.describe())



            one       two
count  3.000000  2.000000
mean   3.083333 -2.950000
std    3.493685  2.192031
min    0.750000 -4.500000
25%         NaN       NaN
50%         NaN       NaN
75%         NaN       NaN
max    7.100000 -1.400000


        one    two
count  3.00   2.00
mean   3.08  -2.95
std    3.49   2.19
min    0.75  -4.50
25%     nan    nan
50%     nan    nan
75%     nan    nan
max    7.10  -1.40


0     a
1     b
2     c
3     c
4     a
5     b
6     c
7     c
8     a
9     b
10    c
11    c
12    a
13    b
14    c
15    c
dtype: object


count     16
unique     3
top        c
freq       8
dtype: object




Correlation and covariance

In [146]:
# NOTE(review): `pandas.io.data` was split out into the separate
# pandas-datareader package (see the warning printed below this cell) and has
# since been removed from pandas; on a current install the import becomes
# `from pandas_datareader import data as web`.
import pandas.io.data as web

# Download ten years of daily quotes per ticker (requires network access).
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')

# dict.items() exists on both Python 2 and 3 (iteritems() is Python 2 only).
# The loop variable is called `quotes` to avoid confusion with the
# notebook-level `data` variable.
price = DataFrame({tic: quotes['Adj Close']
                   for tic, quotes in all_data.items()})
volume = DataFrame({tic: quotes['Volume']
                    for tic, quotes in all_data.items()})
returns = price.pct_change()
print(returns.tail())

# correlation and covariance:
print('\n')
print(returns.MSFT.corr(returns.IBM))
print('\n')
print(returns.MSFT.cov(returns.IBM))

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


                AAPL      GOOG       IBM      MSFT
Date                                              
2009-12-24  0.034339  0.011117  0.004385  0.002587
2009-12-28  0.012294  0.007098  0.013326  0.005484
2009-12-29 -0.011861 -0.005571 -0.003477  0.007058
2009-12-30  0.012147  0.005376  0.005461 -0.013699
2009-12-31 -0.004300 -0.004416 -0.012597 -0.015504


0.495979626103


0.000215957600767


If you run this on a data frame, you will get the full corr/cov matrix:

In [147]:
# Full pairwise matrices, then correlation of every column with one Series
# (IBM) and with the same-named column of another frame (volume).
print(returns.corr())
print('\n')
print(returns.cov())
print('\n')
print(returns.corrwith(returns.IBM))
print('\n')
print(returns.corrwith(volume))

          AAPL      GOOG       IBM      MSFT
AAPL  1.000000  0.470676  0.410011  0.424305
GOOG  0.470676  1.000000  0.390689  0.443587
IBM   0.410011  0.390689  1.000000  0.495980
MSFT  0.424305  0.443587  0.495980  1.000000


          AAPL      GOOG       IBM      MSFT
AAPL  0.001027  0.000303  0.000252  0.000309
GOOG  0.000303  0.000580  0.000142  0.000205
IBM   0.000252  0.000142  0.000367  0.000216
MSFT  0.000309  0.000205  0.000216  0.000516


AAPL    0.410011
GOOG    0.390689
IBM     1.000000
MSFT    0.495980
dtype: float64


AAPL   -0.057549
GOOG    0.062647
IBM    -0.007892
MSFT   -0.014245
dtype: float64


Uniques

In [149]:
obj = Series(['c','a','d','a','a','b','b','c','c'])
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

Unique counts

In [150]:
obj.nunique()

4

Counts

In [151]:
# Absolute frequencies, then relative frequencies with normalize=True.
print(obj.value_counts())
print('\n')
print(obj.value_counts(normalize=True))

c    3
a    3
b    2
d    1
dtype: int64


c    0.333333
a    0.333333
b    0.222222
d    0.111111
dtype: float64


Check for membership using 'isin'

In [152]:
# isin returns a boolean mask marking the elements found in the given list.
print(obj)
print('\n')
print(obj.isin(['b', 'c']))

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object


0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool


### Handling missing data

pandas represents missing values as NaN

In [155]:
# np.nan marks a missing entry even in a Series of strings.
string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)
print('\n')
print(string_data.isnull())

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


0    False
1    False
2     True
3    False
dtype: bool


'None' is also treated as a NaN

In [156]:
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

Fillna

In [157]:
string_data.fillna('used to be NaN')

0    used to be NaN
1         artichoke
2    used to be NaN
3           avocado
dtype: object

Filtering out missing data:

In [160]:
# dropna() and boolean indexing with notnull() give the same result.
data = Series([1, np.nan, 3.5, np.nan, 7])
print(data.dropna())
print('\n')
print(data[data.notnull()])

0    1.0
2    3.5
4    7.0
dtype: float64


0    1.0
2    3.5
4    7.0
dtype: float64


Dropna will drop any row containing a missing value

In [162]:
# By default dropna() removes every row that has at least one NaN.
data = DataFrame([[1, 6.5, 3],
                  [1, np.nan, np.nan],
                  [np.nan, np.nan, np.nan],
                  [np.nan, 6.5, 3]])
cleaned = data.dropna()
print(data)
print('\n')
print(cleaned)

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0


     0    1    2
0  1.0  6.5  3.0


We can control the criteria of dropping nulls through the 'how' and 'thresh' parameters

In [165]:
# how='all' drops only rows that are entirely NaN; thresh=2 keeps rows that
# have at least two non-NaN values.
print(data.dropna(how='all'))
print('\n')
print(data.dropna(thresh=2))

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0


     0    1    2
0  1.0  6.5  3.0
3  NaN  6.5  3.0


We can get creative with fillna

In [170]:
data = Series([1, np.nan, 3.5, np.nan, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

### Merging

Merges/joins (`pd.merge` performs an inner join by default)

In [None]:
# 
# The two frames share only the 'key' column. The value columns need distinct
# names: with 'data1' in both frames, pd.merge(df1, df2) would join on
# 'key' AND 'data1' instead of on 'key' alone.
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
# Only the last expression of a cell is displayed, so print the rest.
print(df1)
print(df2)
# With no `on`, merge joins on every column name the frames have in common...
print(pd.merge(df1, df2))
# ...but naming the key explicitly is clearer.
pd.merge(df1, df2, on='key')

if the column names are different in each object, you specify col names:

In [None]:
# 
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b','d'], 
                 'data1': range(3)}) 
df3
df4
pd.merge(df3, df4, left_on='lkey', right_on='rkey') 

outer join:

In [None]:
# 
pd.merge(df1, df2, how='outer', on='key')

In [None]:
many-to-many merges:

In [None]:
# 
# Keys repeat on both sides, so every left row pairs with every matching
# right row.
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data1': range(5)})
# Only the last expression of a cell is displayed, so print the rest.
print(df1)
print(df2)
print(pd.merge(df1, df2, on='key', how='left'))
pd.merge(df1, df2, on='key', how='inner')

to merge with multiple keys, pass a list of column names:

In [None]:
# 
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'lval': [4, 5, 6, 7]})
# Only the last expression of a cell is displayed, so print the rest.
print(left)
print(right)
# A list of column names joins on the combination of those columns.
print(pd.merge(left, right, on=['key1', 'key2'], how='outer'))
pd.merge(left, right, left_on=['key1', 'key2'], right_on=['key1', 'key2'])

In [None]:
when merging columns with same names, you can specify suffixes to be added:

In [None]:
# 
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Merging on Index

In [None]:
# 
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                   'data1': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]},
                   index=['a', 'b'])
# Only the last expression of a cell is displayed, so print the rest.
print(left1)
print(right1)
# right_index=True uses right1's index as its join key.
print(pd.merge(left1, right1, left_on='key', right_index=True))
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

hierarchically indexed data (merging index on columns):

In [None]:
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'], 
                   'key2': [2000, 2001, 2002, 2001, 2002], 
                    'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)), 
                   index=[['Nevada', 'Nevada', 'Ohio','Ohio', 'Ohio', 'Ohio'], 
                   [2001, 2000, 2000, 2000, 2001, 2002]], 
                    columns=['event1', 'event2'])
lefth
righth
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True, 
         how='outer')

In [None]:
# or both indices:
left2 = DataFrame([[1, 2], [3, 4], [5, 6]], index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
right2 = DataFrame([[7, 8], [9, 10], [11, 12], [13, 14]],
                   index=['b', 'c', 'd', 'e'], 
                    columns=['Missouri', 'Alabama'])
left2
right2
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

DataFrame has a more convenient `join` method for merging by index

In [None]:
# 
left2
right2
left2.join(right2, how='outer')

you can also merge the index on a key:

In [None]:
# 
left1
right1
left1.join(right1, on='key')

you can even pass multiple DataFrames into the Join function:

In [None]:
# 
another = DataFrame([[7, 8], [9, 10], [11, 12], [16, 17]], 
                    index=['a', 'c', 'e', 'f'], 
                        columns=['New York', 'Oregon'])
left2.join([right2, another])
left2.join(right2.join(another))
left2.join([right2, another], how='outer')

Concatenating Along an Axis:

In [None]:
# 
arr = np.arange(12).reshape((3, 4))
# Only the last expression of a cell is displayed, so print the rest.
print(arr)
print(np.concatenate([arr, arr], axis=1))   # side by side -> shape (3, 8)
np.concatenate([arr, arr])                  # stacked (axis=0) -> shape (6, 4)

let's try it in pandas:

In [None]:
# 
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
# Only the last expression of a cell is displayed, so print the inputs.
print(s1)
print(s2)
print(s3)
# By default concat glues the values and index labels together along axis 0.
pd.concat([s1, s2, s3])

if you do it along the columns, it's an outer join:

In [None]:
# 
pd.concat([s1, s2, s3], axis=1)

Use `join='inner'` to keep only the intersection of the indexes:

In [None]:
# 
s4 = pd.concat([s1*5, s3])
s4
s3
s1
pd.concat([s1, s4], axis=1)
pd.concat([s1, s4], axis=1, join = 'inner')

you can specify which axes to be joined on:

In [None]:
# 
# Only the last expression of a cell is displayed, so print the inputs.
print(s1)
print(s4)
# `join_axes` was removed from pd.concat in pandas 1.0; reindexing the
# concatenated result selects the same rows in the same order.
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])

to identify the concatenated pieces you need specify the keys argument:

In [None]:
# 
s1
s2
s3
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result
result.unstack()

when you combine Series along Axis=1, then the keys become the col headers:

In [None]:
# 
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

same logic applies to DataFrame objects:

In [None]:
# 
df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                columns=['three', 'four'])
# Only the last expression of a cell is displayed, so print the rest.
print(df1)
print(df2)
# With axis=1 the keys become the outer level of hierarchical columns.
print(pd.concat([df1, df2], axis=1, keys=['level1', 'level2']))
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], join='inner')

if you pass a dict of objects instead of a list, the dict keys will be used:

In [None]:
# 
pd.concat({'level1': df1, 'level2': df2}, axis=1)

when it comes to hierarchical indexing:

In [None]:
# 
# `names` labels the two levels of the resulting hierarchical columns; the
# keys match the 'level1' / 'level2' used in the neighbouring cells.
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])

if the index is meaningless in the context of the analysis, ignore_index=True

In [None]:
# 
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# Only the last expression of a cell is displayed, so print the rest.
print(df1)
print(df2)
# The original row labels are kept, so 0, 1 and 2 repeat ...
print(pd.concat([df1, df2]))
# ... while ignore_index=True renumbers the rows 0..n-1.
pd.concat([df1, df2], ignore_index=True)

Combining data with overlap

In [None]:
# 
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
# Blank out the last element by position; .iloc makes that explicit on this
# string-labelled index (a bare b[-1] relies on a deprecated fallback).
b.iloc[-1] = np.nan
# Only the last expression of a cell is displayed, so print the inputs.
print(a)
print(b)
# Take b wherever a is missing, otherwise keep a.
np.where(pd.isnull(a), b, a)

or you can use the combine_first Series function:

In [None]:
# 
# Show the two inputs first; only the last expression is displayed, so the
# combined result goes last.
print(b[:-2])
print(a[2:])
# Values come from b[:-2] where present, otherwise from a[2:]; the result is
# indexed by the union of both indexes.
b[:-2].combine_first(a[2:])


in DataFrames, think of combine_first as patching missing data:

In [None]:
# 
df1 = DataFrame({'a': [1, np.nan, 5, np.nan],
                 'b': [np.nan, 2, np.nan, 6],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5, 4, np.nan, 3, 7],
                 'b': [np.nan, 3, 4, 6, 8]})
# Only the last expression of a cell is displayed, so print the inputs.
print(df1)
print(df2)
# Holes in df1 are patched with the value at the same row/column of df2.
df1.combine_first(df2)

### Reshaping and pivoting

`stack` rotates the columns into the rows (producing the Series below)

In [None]:
#
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
# Only the last expression of a cell is displayed, so print the input.
print(data)
# stack moves the column labels into an inner index level, giving a Series.
result = data.stack()
result

using unstack, you rearrange a hierarchical index back to a DataFrame:

In [None]:
# 
result.unstack()
type(result)
type(result.unstack())

you can choose which level to unstack:

In [None]:
# 
result
result.unstack(0)
result.unstack('state')

unstacking might introduce missing data if all the values aren't there:

In [None]:
# 
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
# keys adds an outer index level recording which input each value came from
data2 = pd.concat([s1, s2], keys=['one', 'two'])
print(s1)  # call form prints the same text under Python 2 and Python 3
s2
data2
# 'one' has no 'e' and 'two' has no 'a'/'b', so unstacking leaves NaN there
data2.unstack()

stacking filters out missing data by default:

In [None]:
# 
data2.unstack().stack()
data2.unstack().stack(dropna=False)

in a DF, the unstacked & stacked levels become the lowest levels:

In [None]:
# 
df = DataFrame({'left': result, 'right': result + 5}, 
               columns=pd.Index(['left', 'right'], name='side'))
df
df.unstack('state')
df.unstack('state').stack('side')

removing duplicates

In [None]:
# 
data = DataFrame({'k1': ['one']*3 + ['two']*4,
                  'k2': [1,1,2,3,3,4,4]})
data
data.duplicated()
data.drop_duplicates()
data

specifying which column to dedupe on:

In [None]:
# 
data['v1'] = range(7)
data
data.drop_duplicates(['k1'])

by default, the first observed dupe is kept, but we can change all that:

In [None]:
# 
# keep='last' retains the final occurrence of each (k1, k2) pair instead of
# the first; it is the replacement for the deprecated take_last=True flag
data.drop_duplicates(['k1', 'k2'], keep='last')

transforming data using a function or mapping:

In [None]:
# 
data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 
                           'corned beef', 'Bacon', 'pastrami', 'honey ham',
                           'nova lox'], 
                           'Ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data
meat_to_animal = {'bacon': 'pig', 'pulled pork': 'pig', 'pastrami': 'cow',
                  'corned beef': 'cow', 'honey ham': 'pig',
                  'nova lox': 'salmon'}
meat_to_animal

first, we need to convert to lower case:

In [None]:
# 
data['food'].map(str.lower)

...and then map to the other object:

In [None]:
#
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

or we could get fancy:

In [None]:

data['food'].map(lambda x: meat_to_animal[x.lower()])

Replacing values

In [None]:
# 
data = Series([1, -999, 2, -999, -1000, 3])
data
data.replace(-999, np.nan)
data

replace multiple at once:

In [None]:
# 
data.replace([-999, -1000], np.nan)
data.replace([-999, -1000], [np.nan, 0])

you can also pass a dict instead:

In [None]:
# 
data.replace({-999: np.nan, -1000: 0})

Renaming Axis Indexes

In [None]:
# 
data = DataFrame(np.arange(12).reshape((3, 4)), 
                 index=['Ohio', 'Colorado', 'New York'], 
                    columns=['one', 'two', 'three', 'four'])
data.index.map(str.upper)
data.index = data.index.map(str.upper)
data

or you can use rename:

In [None]:
# 
data
data.rename(index=str.title, columns=str.upper)
data.rename(index=str.lower)
data

rename can be used with a dict-like object:

In [None]:
# 
data.rename(index={'OHIO': 'INDIANA'}, 
            columns={'three': 'Z MONEY!!'})

_ = data.rename(index={'OHIO': 'INDIANA'})
data 

Discretization and Binning

In [179]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

internally, this stores a 'levels' array indicating category names:

In [None]:
# 
# .codes / .categories are the current names of the old .labels / .levels
cats.codes       # integer position of the bin assigned to each age
cats.categories  # the bins those integer codes refer to

In [None]:
pd.value_counts(cats) # open ( means exclusive, while ] means inclusive

we can change which side is closed by passing right=False:

In [None]:
# 
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

we can pass our own bin names:

In [None]:
# 
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
pd.value_counts(cats)

you can cut the data set into equal distances based on the actual values:

In [180]:
# 
data = np.random.rand(20)
data
# bin the random sample itself (the original cell binned `ages` here) into
# 4 equal-width intervals computed from the sample's own min and max
pd.cut(data, 4, precision=2)
# count the members of each bin, most populated bin first
Series(pd.cut(data, 4, precision=2)).value_counts()

(0.065, 0.3]    8
(0.76, 0.99]    7
(0.53, 0.76]    3
(0.3, 0.53]     2
dtype: int64

or you can cut the data based on quantiles:

In [None]:
# 
data = np.random.randn(1000) # normally distributed
cats = pd.qcut(data, 4) # cuts into quartiles
cats
pd.value_counts(cats)

you can also pass your own quantiles

In [None]:
# 
pd.qcut(data, [0, .1, .45, .85, 1])
pd.value_counts(pd.qcut(data, [0, .1, .45, .85, 1]))

Detecting and Filtering Outliers

In [None]:
# 
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()
data

to find all the values in the last column exceeding 3 (absolute value)

In [None]:
# 
data[3]
col = data[3]
col[np.abs(col) > 3]

to select every row that has a value exceeding 3 or -3 in any column, use the any method:

In [None]:
# 
# boolean frame -> one flag per row (True if any column has |value| > 3);
# the axis is passed by keyword rather than positionally
data[(np.abs(data) > 3).any(axis=1)]

here is how we cap off the data at 3:

In [None]:
# 
data[np.abs(data)>3]
data[np.abs(data)>3] = np.sign(data)*3
data.describe()

Permutation and Random Sampling:

In [None]:
# 
df = DataFrame(np.arange(5*4).reshape(5, 4))
df

# to randomly reorder the rows, use np.random.permutation with 'take':
sampler = np.random.permutation(5)
sampler
df
df.take(sampler)
sampler = np.random.permutation(4)
df.take(sampler, axis=1)

to select a random subset without replacement, we can do this janky method:

In [None]:
# 
df.take(np.random.permutation(len(df))[:3]) # takes first 3 random rows

to sample WITH replacement, use np.random.randint:

In [None]:
# 
bag = np.array([5, 7, -1, 6, 4])
sampler = np.random.randint(0, len(bag), size=10)
sampler

In [None]:
draws= bag.take(sampler) 
draws #question: NOT a question...comment: this super janky!

Computing Indicator/Dummy Variables:

In [None]:
# 
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 
                'data1': range(6)})
df
pd.get_dummies(df['key'])

or add a prefix to the column names:

In [None]:
# 
dummies = pd.get_dummies(df['key'], prefix = 'key')
dummies
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

if there are multiple categories, here's what we do:

In [None]:
# 
mnames = ['movie_id', 'title', 'genres']

### String manipulation

split:

In [None]:
# 
val = 'a,b, guido'
val.split(',')

strip (trim):

In [None]:
# 
pieces = [x.strip() for x in val.split(',')]
pieces

concatenation:

In [None]:
# 
first, second, third = pieces
pieces
first + '::' + second + '::' + third

faster concatenation is with join:

In [None]:
# 
'::'.join(pieces)

to find substrings, use 'in' or 'index' or 'find':

In [None]:
# 
'guido' in val
val.index(',')
val
val.find(':')
val.find('b')

index raises an exception if the string isn't found:

In [None]:
# 
# str.index signals a missing substring by raising ValueError; catch it and
# show the message so the demonstration does not halt a full notebook run
try:
    val.index(':')
except ValueError as err:
    print(err)

count string values:

In [None]:
# 
val.count(',')

replace

In [None]:
# 
val.replace(',', '::')
val.replace(',','')

### Grouping and aggregation

In [None]:
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df

# getting the average of data1 using key1
grouped = df['data1'].groupby(df['key1'])
grouped
grouped.mean()

if we passed multiple arrays as a list, we get something different:

In [None]:
# 
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means
means.unstack()

we can also change the keys to be arrays:

In [None]:
# 
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
df['data1'].groupby([states, years]).mean()

we can pass column names as group keys:

In [None]:
# 
df.groupby('key1').mean()
df.groupby(['key1', 'key2']).mean()

size:

In [None]:
# 
df.groupby(['key1', 'key2']).size()

iterating over groups:

In [None]:
# 
# iterating a groupby yields (group key, sub-frame of rows with that key)
for name, group in df.groupby('key1'):
    print(name)   # call form prints the same text under Python 2 and 3
    print(group)

in case of multiple keys, the first element of each yielded pair is a tuple of key values:

In [None]:
# 
# with two grouping columns the key is a 2-tuple, unpacked here as (k1, k2)
for (k1, k2), group in df.groupby(['key1', 'key2']):
    # formatted string reproduces the "k1 k2" line of the Python 2 print
    # statement while also running under Python 3
    print('%s %s' % (k1, k2))
    print(group)

computing a dict of the data pieces as a one-liner:

In [None]:
# 
pieces = dict(list(df.groupby('key1')))
pieces['b']
pieces['a']

we can group by any of the axes, tho default is axis=0:

In [None]:
# 
df.dtypes
grouped = df.groupby(df.dtypes, axis=1)
grouped
dict(list(grouped))
test = dict(list(grouped))
test.keys()
for key in test.keys():
  print(key)

Select a Column or Subset of Columns:
- indexing a groupby object with a column name (or list of names) selects just those columns for aggregation:

In [None]:
# 
df.groupby('key1')['data1'] # is the same as:
df['data1'].groupby(df['key1'])

to compute means for just data2 column and get a DF as output:

In [None]:
# 
df.groupby(['key1', 'key2'])[['data2']].mean()
df.groupby(['key1', 'key2'])[['data1']].mean() # note: same as:
df['data1'].groupby([df['key1'],df['key2']]).mean()

the object returned by this indexing operation is a grouped DF or Series:

In [None]:
# 
s_grouped = df.groupby(['key1', 'key2'])['data2']
s_grouped
df_grouped = df.groupby(['key1', 'key2'])
df_grouped

Grouping with Dicts and Series:

In [None]:
# 
# create a dataframe
people = DataFrame(np.random.randn(5, 5), 
                   columns=['a', 'b', 'c', 'd', 'e'], 
                    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people
# create a group correspondence
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

# we could construct an array from this dict & pass to groupby, or:
by_column = people.groupby(mapping, axis=1)
by_column.sum()
people.groupby(mapping, axis=1).sum()
by_column = people.groupby(mapping)
by_column.sum() #this no worky b/c the corresponding group exists on axis 1

the same thing holds for Series:

In [None]:
# 
map_series = Series(mapping)
map_series
people.groupby(map_series, axis=1).count()

Grouping by Functions

In [None]:
# 
people.groupby(len).sum()


you can mix functions with arrays, dicts, or Series:

In [None]:
# 
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()

grouping by index levels:

In [None]:
# 
columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],
                                     [1, 3, 5, 1, 3]], names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
hier_df.groupby(level='cty', axis=1).count()

### Data aggregation

In [None]:
df
grouped = df.groupby('key1')
grouped
grouped['data1'].quantile(.9)

use the agg or aggregate method to pass my own aggregation functions:

In [None]:
# 
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

grouped.agg(peak_to_peak)

some methods like describe also work, even though they're not aggregations:

In [None]:
# 
grouped.describe()

more comprehensive example:

In [None]:
# 
# read tips.csv into a DataFrame (pd._csv is not a pandas function; the
# reader is pd.read_csv). `path` is a helper not defined in this cell.
tips = pd.read_csv(path('tips.csv'))
# tip expressed as a fraction of the total bill
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips.head()
tips.describe()

Column-wise and Multiple Function Application

In [None]:
# 
grouped = tips.groupby(['sex', 'smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
tips.groupby(['sex','smoker'])[['tip_pct']].mean() #note that it's the same

you can pass multiple functions

In [None]:
# 
grouped_pct.agg(['mean', 'std', peak_to_peak])

naming the aggregate functions:

In [None]:
# 
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])

with a dataframe, we have more options (you can specify a list):

In [None]:
# 
functions = ['count', 'mean', 'max']
# select several columns with a list (double brackets); the bare-tuple form
# grouped['tip_pct', 'total_bill'] is deprecated and rejected by newer pandas
result = grouped[['tip_pct', 'total_bill']].agg(functions)
result
result['tip_pct']
grouped_pct.agg(functions) # uhm, you can do the same with a series...

if you want to pass different functions to different columns, use a dict:

In [None]:
# 
# a dict maps each column to its own aggregation
# question: how to label 'max' & 'sum'? -- see the list form in the next call
grouped.agg({'tip': 'max', 'size': 'sum'})
# giving a column a list of functions produces one labelled result per function
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'], 'size': 'sum'})

Returning Aggregated Data in 'unindexed' Form

In [None]:
# 
tips.groupby(['sex', 'smoker'], as_index=False).mean()
# note, you can always get results in this format using 'reset_index'

Group-wise Operations and Transformations
- say we want to add a column that contains group means for each index:

In [None]:
# 
df
k1_means = df.groupby('key1').mean().add_prefix('mean_')
k1_means
pd.merge(df, k1_means, left_on='key1', right_index=True)

# that was the long way. let's look at this example:
people = DataFrame(np.random.randn(5, 5), 
                   columns=['a', 'b', 'c', 'd', 'e'], 
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people
key = ['one', 'two', 'one', 'two', 'one']
people.groupby(key).mean()
people.groupby(key).transform(np.mean)

suppose you want to subtract the mean from each group:

In [None]:
# 
def demean(arr):
    """Center ``arr`` by subtracting its own mean from every element."""
    center = arr.mean()
    return arr - center
demeaned = people.groupby(key).transform(demean)
demeaned

check that demeaned has zero group means:

In [None]:
# 
demeaned.groupby(key).mean()

Apply: general split-apply-combine
- selecting top 5 tip_pct values by group - 

In [None]:
# 
def top(df, n=5, column='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``column``.

    Parameters
    ----------
    df : DataFrame
        Frame to pick rows from; it is not modified.
    n : int, default 5
        Number of rows to keep.
    column : str, default 'tip_pct'
        Column whose values decide the ranking.

    Returns
    -------
    DataFrame
        The selected rows in ascending order of ``column`` (largest last).
    """
    # sort_values is the by-column sort; sort_index(by=...) was the old
    # spelling of the same operation
    return df.sort_values(by=column)[-n:]
top(tips, n=6)

if we group by smoker and call apply, we get:

In [None]:
# 
tips.groupby('smoker').apply(top)

you can pass other parameters into the function:

In [None]:
# 
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

result = tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')

invoking a method inside groupby, it's a shortcut for:

In [None]:
# 
f = lambda x: x.describe()
grouped.apply(f)

you can disable the hierarchical index by:

In [None]:
# 
tips.groupby('smoker', group_keys=False).apply(top)

# Quantile and Bucket Analysis
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
frame
factor = pd.cut(frame.data1, 4)
factor[:10]

the factor object returned by cut can be passed directly to groupby, so
we could compute a set of statistics for the data2 column like so:

In [None]:

def get_stats(group):
    """Summarise ``group`` as a dict with keys 'min', 'max', 'count', 'mean'.

    Each value comes from the method of the same name on ``group``.
    """
    summary = {}
    summary['min'] = group.min()
    summary['max'] = group.max()
    summary['count'] = group.count()
    summary['mean'] = group.mean()
    return summary
grouped = frame.data2.groupby(factor)
grouped = frame['data2'].groupby(factor) # note: same as previous row!
grouped.apply(get_stats).unstack()

or we could compute equal-size buckets based on sample quantiles:

In [None]:
# 
grouping = pd.qcut(frame.data1, 10, labels=False)
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

example: filling missing values with group-specific values

In [None]:
# 
s = Series(np.random.randn(6))
s
s[:2]
s[::2]
s[::2] = np.nan
s
s.fillna(s.mean())

filling in NAs by group values:

In [None]:
# 
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada',
          'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
group_key
data.groupby(group_key).mean()
fill_mean = lambda g: g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)

In [None]:
# or, if you have pre-defined values that vary by group:

In [None]:
# 
fill_values = {'East': 0.5, 'West': -1}
fill_func = lambda g: g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

Example: Random Sampling and Permutation
- Hearts, Spades, Clubs, Diamonds - first let's make the deck:

In [None]:
# 
suits = ['H', 'S', 'C', 'D']
# per suit: ten cards worth 1..10, then three more cards worth 10 each.
# list(range(...)) is needed because a Python 3 range cannot be added to a list
card_val = (list(range(1, 11)) + [10] * 3) * 4
card_val
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
# reuse `suits` so the index labels stay in step with the list defined above
for suit in suits:
    cards.extend(str(num) + suit for num in base_names)
# card label (e.g. 'AH', '10S') -> card value
deck = Series(card_val, index=cards)
deck[:13]

to draw a hand of 5 cards from the deck:

In [None]:
# 
def draw(deck, n=5):
    """Pick ``n`` entries of ``deck`` at random, without replacement.

    The positions 0..len(deck)-1 are shuffled with numpy's global random
    state and the first ``n`` of them are taken from ``deck``.
    """
    shuffled_positions = np.random.permutation(len(deck))
    chosen = shuffled_positions[:n]
    return deck.take(chosen)
draw(deck)

to draw two random cards from each suit:

In [None]:
# 
get_suit = lambda card: card[-1] # last leter is suit
deck.groupby(get_suit).apply(draw, n=2)

alternatively:

In [None]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

Example: Group Weighted Average and Correlation

In [None]:
# 
"""operations between columns in a DataFrame or two Series, such as group
weighted average, become a routine affair"""
# weights are drawn uniformly from [0, 1) so they are never negative; with
# normally distributed weights a group's weights could nearly cancel and make
# the weighted average meaningless
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
df

get the group weighted average by category: 

In [None]:
# 
grouped = df.groupby('category')
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])
grouped.apply(get_wavg)

or, a 'less trivial' example:

In [None]:
# 
close_px = pd.read_csv(path('stock_px.csv'), parse_dates=True, index_col=0)
close_px


to compute yearly correlations of daily returns with SPX:

In [None]:
# 
rets = close_px.pct_change().dropna()
spx_corr = lambda x: x.corrwith(x['SPX'])
by_year = rets.groupby(lambda x: x.year) # how does it know to group by index?
by_year.apply(spx_corr)

of course, we can do inter-column correlations:

In [None]:
# 
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))

Example: Group-wise Linear Regression

In [None]:
# 
import statsmodels.api as sm
def regress(data, yvar, xvars):
    """Fit an OLS regression of ``data[yvar]`` on ``data[xvars]`` plus a constant.

    Parameters
    ----------
    data : DataFrame
        Frame holding both the dependent and explanatory columns.
    yvar : str
        Name of the dependent-variable column.
    xvars : list of str
        Names of the explanatory columns. It is passed as a list so that
        ``data[xvars]`` is a DataFrame to which the intercept column can be
        added, even when there is only one explanatory variable.

    Returns
    -------
    The ``params`` attribute of the fitted ``sm.OLS`` result.
    """
    Y = data[yvar]
    # work on an explicit copy: adding the intercept column must not write
    # into (or warn about writing into) a selection of the caller's frame
    X = data[xvars].copy()
    X['intercept'] = 1.
    result = sm.OLS(Y, X).fit()
    return result.params
by_year.apply(regress, 'AAPL', ['SPX']) #  question: why is SPX in brackets??

Pivot Tables and Cross-Tabulation (note: it calculates means by default)

In [None]:
# 
tips.pivot_table(index=['sex', 'smoker']) # note, you can do this w/ groupby

let's limit the metrics, and put smoker in the table columns:

In [None]:
# 
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                 columns='smoker')

we can get totals by doing margins=True

In [None]:
# 
tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],
                 columns='smoker', margins=True)

we can change the measure using aggfunc (note that len is the same as count):

In [None]:

tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',
                 aggfunc=len, margins=True)
tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',
                 aggfunc='count', margins=True)

if some combos are empty or NA, you can pass fill_value:

In [None]:
tips.pivot_table('size', index=['time', 'sex', 'smoker'],
                 columns='day', aggfunc='sum', fill_value=0)
tips.pivot_table('size', index=['time', 'sex', 'smoker'],
                 columns='day', aggfunc='sum')

Cross-Tabulations: Crosstab
- cross tabs are special cases of pivot tables that compute group frequencies

In [None]:
pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)
tips.pivot_table('size', index=['time', 'day'], columns='smoker', aggfunc=len,
                 margins=True)