In [1]:
from pandas import *
import pandas
import numpy as np

def side_by_side(*objs, **kwds):
    from pandas.core.common import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)

plt.rc('figure', figsize=(10, 6))
pandas.set_option("notebook_repr_html",False)

NameError: name 'plt' is not defined

Series
======

In [2]:
labels = ['a', 'b', 'c', 'd', 'e']
s = Series(randn(5), index=labels)
s

NameError: name 'randn' is not defined

In [3]:
'b' in s

NameError: name 's' is not defined

In [4]:
s['b']

In [5]:
s

In [6]:
mapping = s.to_dict()
mapping

In [7]:
s = Series(mapping)
s

In [8]:
s[:3]

In [9]:
s.index

DataFrame: 2D collection of Series
==================================

In [10]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)})
df

In [11]:
df.index

In [12]:
df.columns

In [13]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               index=DateRange('1/1/2000', periods=6))
df

In [14]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               columns=['a', 'b', 'c', 'd'])
df

Creation from nested dicts
--------------------------

These arise naturally in Python code

In [15]:
data = {}
for col in ['foo', 'bar', 'baz']:
    for row in ['a', 'b', 'c', 'd']:
        data.setdefault(col, {})[row] = randn()
data

In [16]:
DataFrame(data)

Data alignment
==============

In [17]:
close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)

In [18]:
close_px

In [19]:
s1 = close_px['AAPL'][-20:]
s2 = close_px['AAPL'][-25:-10]
side_by_side(s1, s2)

In [20]:
s1 + s2

In [21]:
df = close_px.ix[-10:, :3]
df

In [22]:
side_by_side(s1.reindex(s2.index), s2)

In [23]:
b, c  = s1.align(s2, join='inner')
side_by_side(b, c)

In [24]:
b, c  = s1.align(s2, join='outer')
side_by_side(b, c)

In [25]:
b, c  = s1.align(s2, join='right')
side_by_side(b, c)

In [26]:
df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]
df

In [27]:
df2 = df.ix[::2, ['IBM', 'MSFT']]
side_by_side(df, df2)

In [28]:
df + df2

In [29]:
b, c = df.align(df2, join='inner')
side_by_side(b, c)

Transposing: no copy if all columns are same type
-------------------------------------------------

In [30]:
df[:5].T

Columns can be any type
-----------------------

In [31]:
n = 10
foo = DataFrame(index=range(n))
foo['floats'] = np.random.randn(n)
foo['ints'] = np.arange(n)
foo['strings'] = ['foo', 'bar'] * (n / 2)
foo['bools'] = foo['floats'] > 0
foo['objects'] = DateRange('1/1/2000', periods=n)
foo

In [32]:
foo.dtypes

N.B. transposing is not roundtrippable in this case (column-oriented data structure)

In [33]:
foo.T.T

In [34]:
foo.T.T.dtypes

Function application
====================

You can apply arbitrary functions to the rows or columns of a DataFrame

In [35]:
df.apply(np.mean)

In [36]:
df.apply(np.mean, axis=1)

You can get as fancy as you want

In [37]:
close_px

In [38]:
def peak_date(series):
    return series.index[series.argmax()]
close_px.apply(peak_date)

In [39]:
df.apply(lambda x: x.max() - x.min()) # np.ptp

In [40]:
np.log(close_px)

Plotting
========

Some basic plotting integration with matplotlib in Series / DataFrame

In [41]:
close_px[['AAPL', 'IBM', 'MSFT', 'XOM']].plot()

In [42]:
rets.ix[-1]

In [43]:
close_px.ix[-1].plot(kind='bar')
title('Prices on %s' % close_px.index[-1])
axhline(0)

Hierarchical indexing
---------------------

In [44]:
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
hdf = DataFrame(np.random.randn(10, 3), index=index,
                columns=['A', 'B', 'C'])
hdf

In [45]:
hdf.ix['foo']

In [46]:
hdf.ix['foo'] = 0
hdf

In [47]:
hdf.ix['foo', 'three']

Stacking and unstacking
-----------------------

In [48]:
tuples = zip(*[['bar', 'bar', 'baz', 'baz',
                'foo', 'foo', 'qux', 'qux'],
               ['one', 'two', 'one', 'two',
                'one', 'two', 'one', 'two']])
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                  ('B', 'cat'), ('A', 'dog')])
df = DataFrame(randn(8, 4), index=index, columns=columns)
df

In [49]:
df2 = df.ix[[0, 1, 2, 4, 5, 7]]
df2

In [50]:
df.unstack()['B']

GroupBy
=======

In [51]:
df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                       'foo', 'bar', 'foo', 'foo'],
                'B' : ['one', 'one', 'two', 'three',
                       'two', 'two', 'one', 'three'],
                'C' : np.random.randn(8),
                'D' : np.random.randn(8)})
df

In [52]:
for key, group in df.groupby('A'):
    print key
    print group

In [53]:
df.groupby('A')['C'].describe().T

In [54]:
df.groupby('A').mean()

In [55]:
for key, group in df.groupby('A'):
    print key
    print group

In [56]:
df.groupby(['A', 'B']).mean()

In [57]:
df.groupby(['A', 'B'], as_index=False).mean()

GroupBy example: linear regression by group
-------------------------------------------

In [58]:
import scikits.statsmodels.api as sm
rets = close_px / close_px.shift(1) - 1

def get_beta(rets):
    rets = rets.dropna()
    rets['intercept'] = 1.
    model = sm.OLS(rets['MSFT'], rets.ix[:, ['AAPL', 'intercept']]).fit()
    return model.params

get_beta(rets)

In [59]:
grouped = rets.groupby([lambda x: x.year, lambda x: x.month])
beta_by_ym = grouped.apply(get_beta)
beta_by_ym

In [60]:
beta_by_ym.unstack(0)['AAPL']

GroupBy with hierarchical indexing
----------------------------------

In [61]:
tuples = zip(*[['bar', 'bar', 'baz', 'baz',
                'foo', 'foo', 'qux', 'qux'],
               ['one', 'two', 'one', 'two',
                'one', 'two', 'one', 'two']])
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                  ('B', 'cat'), ('A', 'dog')])
df = DataFrame(randn(8, 4), index=index, columns=columns)
df

In [62]:
df.groupby(level=0, axis=0).mean()

In [63]:
df.stack()

In [64]:
df.stack().mean(1).unstack()

In [65]:
# could also have done
df.groupby(level=1, axis=1).mean()