In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Create a Series from a plain list. No index is given, so pandas assigns
# the default integer labels 0..5; np.nan marks a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
# First build the row labels for the DataFrame created below:
# six consecutive calendar days starting on 2013-01-01.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# check what kind of object date_range returned (a DatetimeIndex)
type(dates)

pandas.tseries.index.DatetimeIndex

In [10]:
# Create a 6x4 DataFrame of random numbers:
#   rows    -> labelled by the `dates` index built earlier
#   columns -> labelled 'A'..'D'
# NOTE: the values are unseeded random draws, so they differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,2.223518,0.12672,-0.042564,1.371016
2013-01-02,-0.954965,0.778433,0.585936,-0.199295
2013-01-03,0.879113,-1.229092,-0.396789,0.389317
2013-01-04,-0.498247,0.053058,0.604372,-0.126667
2013-01-05,1.889758,-0.323823,0.616003,0.976393
2013-01-06,0.209037,-0.989248,-1.053365,3.409465


In [11]:
# for comparison: the raw input to the DataFrame constructor is just a plain
# 6x4 NumPy array (fresh random values, no row or column labels)
np.random.randn(6,4)

array([[-1.06684929, -1.08361583, -0.86719255,  0.97309253],
       [-0.43902742, -0.87854254,  1.27100541, -0.65707254],
       [ 1.14219715, -0.99449262,  0.10859349,  0.39280408],
       [ 0.45070273,  1.0824234 , -1.19223917,  0.22144221],
       [ 0.95664226, -0.12201101,  2.21713628, -0.66439005],
       [ 0.21052363,  0.57573633,  1.25399627, -2.29045491]])

The difference between a Series and a DataFrame: a Series really only has one column. The index is like the row number in Excel.
However, we can label the index (aka row number) with something more meaningful in a DataFrame by using the "index" keyword.

In [19]:
# Build a DataFrame from a dict: every key becomes a column.
# The scalar entries ('A', 'B', 'F') are repeated on every row; the
# row labels 0..3 come from the index of the Series passed as 'C'.
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
))

In [20]:
# display the result: the scalar entries ('A', 'B', 'F') appear on all four rows
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [21]:
# specific dtypes: every column of the DataFrame has its own dtype
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [23]:
# attribute-style access: returns column 'A' as a Series
df2.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [27]:
# viewing data: head() shows the first rows (the first 5 of our 6 here)
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,2.223518,0.12672,-0.042564,1.371016
2013-01-02,-0.954965,0.778433,0.585936,-0.199295
2013-01-03,0.879113,-1.229092,-0.396789,0.389317
2013-01-04,-0.498247,0.053058,0.604372,-0.126667
2013-01-05,1.889758,-0.323823,0.616003,0.976393


In [25]:
# tail() shows the last rows (the last 5 of our 6 here)
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,-0.954965,0.778433,0.585936,-0.199295
2013-01-03,0.879113,-1.229092,-0.396789,0.389317
2013-01-04,-0.498247,0.053058,0.604372,-0.126667
2013-01-05,1.889758,-0.323823,0.616003,0.976393
2013-01-06,0.209037,-0.989248,-1.053365,3.409465


In [28]:
# the row labels: the DatetimeIndex we passed as index= when building df
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [29]:
# quick summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.624702,-0.263992,0.052265,0.970038
std,1.276749,0.748441,0.684176,1.343054
min,-0.954965,-1.229092,-1.053365,-0.199295
25%,-0.321426,-0.822892,-0.308233,0.002329
50%,0.544075,-0.135383,0.271686,0.682855
75%,1.637097,0.108305,0.599763,1.27236
max,2.223518,0.778433,0.616003,3.409465


In [30]:
# transpose: the columns A-D become rows and the dates become column headers
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,2.223518,-0.954965,0.879113,-0.498247,1.889758,0.209037
B,0.12672,0.778433,-1.229092,0.053058,-0.323823,-0.989248
C,-0.042564,0.585936,-0.396789,0.604372,0.616003,-1.053365
D,1.371016,-0.199295,0.389317,-0.126667,0.976393,3.409465


In [33]:
# sort along axis=1, i.e. by column label, in descending order (D, C, B, A)
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.371016,-0.042564,0.12672,2.223518
2013-01-02,-0.199295,0.585936,0.778433,-0.954965
2013-01-03,0.389317,-0.396789,-1.229092,0.879113
2013-01-04,-0.126667,0.604372,0.053058,-0.498247
2013-01-05,0.976393,0.616003,-0.323823,1.889758
2013-01-06,3.409465,-1.053365,-0.989248,0.209037


In [35]:
# sort along axis=0, i.e. by row label (the dates), latest date first
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.209037,-0.989248,-1.053365,3.409465
2013-01-05,1.889758,-0.323823,0.616003,0.976393
2013-01-04,-0.498247,0.053058,0.604372,-0.126667
2013-01-03,0.879113,-1.229092,-0.396789,0.389317
2013-01-02,-0.954965,0.778433,0.585936,-0.199295
2013-01-01,2.223518,0.12672,-0.042564,1.371016


In [38]:
# sort_index() has no `descending` keyword -- the direction is controlled by
# `ascending` (ascending=True gives earliest date first). The bad call is kept
# as a demonstration, but the TypeError is caught and printed so that the
# notebook still survives Restart & Run All.
try:
    df.sort_index(axis=0, descending=False)
except TypeError as exc:
    print('TypeError: {}'.format(exc))

TypeError: sort_index() got an unexpected keyword argument 'descending'

In [39]:
# sorting by values: order the rows by the numbers in column 'B', smallest first
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,0.879113,-1.229092,-0.396789,0.389317
2013-01-06,0.209037,-0.989248,-1.053365,3.409465
2013-01-05,1.889758,-0.323823,0.616003,0.976393
2013-01-04,-0.498247,0.053058,0.604372,-0.126667
2013-01-01,2.223518,0.12672,-0.042564,1.371016
2013-01-02,-0.954965,0.778433,0.585936,-0.199295


In [44]:
# getting
# selecting a single column gives a Series object
# it includes the index though!
df['A']

2013-01-01    2.223518
2013-01-02   -0.954965
2013-01-03    0.879113
2013-01-04   -0.498247
2013-01-05    1.889758
2013-01-06    0.209037
Freq: D, Name: A, dtype: float64

In [43]:
# confirm that a single selected column is a Series, not a DataFrame
type(df['A'])

pandas.core.series.Series

In [45]:
# slicing with [] selects rows by position: 0:3 gives the first three rows
# (the end position 3 is excluded)
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,2.223518,0.12672,-0.042564,1.371016
2013-01-02,-0.954965,0.778433,0.585936,-0.199295
2013-01-03,0.879113,-1.229092,-0.396789,0.389317


In [47]:
# selection by label
# (the cross-section example follows below; first, these are the column labels we can select on)
df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [50]:
# the row labels available for label-based selection
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

# this is poor notation!
### When we defined df, we used a pandas DatetimeIndex called "dates" as its index
There's no actual column called dates in df. Instead, dates[0] resolves to

    Timestamp('2013-01-01 00:00:00', offset='D')

which I guess is a timestamp object in pandas. We then slice df on that date.
#### The slice is a row, since that date is one of our index labels (like row 1, row 10, etc.). Thus we get the row with that index label.

#### The confusing part is that it's printed vertically, even though it's a row from our dataframe. This is because a single row comes back as a Series, with the column names (A-D) as its index.

In [57]:
# cross-section: select the single row whose index label equals dates[0]
df.loc[dates[0]]

A    2.223518
B    0.126720
C   -0.042564
D    1.371016
Name: 2013-01-01 00:00:00, dtype: float64

In [53]:
# dates[0] is the first label of the index: a pandas Timestamp
dates[0]

Timestamp('2013-01-01 00:00:00', offset='D')

#### Selecting on a multi-axis by label

In [58]:
# all rows (:), but only the columns labelled 'A' and 'B'
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,2.223518,0.12672
2013-01-02,-0.954965,0.778433
2013-01-03,0.879113,-1.229092
2013-01-04,-0.498247,0.053058
2013-01-05,1.889758,-0.323823
2013-01-06,0.209037,-0.989248


I read the above as "DataFrame df. This location. Rows, everything (from start to finish). Columns, A and B only"

In [59]:
# showing label slicing, both endpoints are included
# (the rows for 2013-01-02 AND 2013-01-04 are both returned)
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.954965,0.778433
2013-01-03,0.879113,-1.229092
2013-01-04,-0.498247,0.053058


In [61]:
# reduction in the dimensions of the returned object:
# a single row label (instead of a slice) gives back a Series rather than a DataFrame
df.loc['20130102',['A','B']]

A   -0.954965
B    0.778433
Name: 2013-01-02 00:00:00, dtype: float64

In [62]:
# for getting a scalar value,
# aka the value of a particular "cell" if this were Excel
df.loc[dates[0],'A']

2.2235181391210967

#### the above says: dataframe at location [   row `dates[0]` (the first row), column 'A'   ]

In [63]:
# for fast access to a scalar (equivalent to above)
df.at[dates[0],'A']

2.2235181391210967

#### Selection by Position

In [64]:
# select via the position of passed integers: iloc[3] is the 4th row (positions start at 0)
df.iloc[3]

A   -0.498247
B    0.053058
C    0.604372
D   -0.126667
Name: 2013-01-04 00:00:00, dtype: float64

In [66]:
# for reference, we can see it took the 4th row, i.e. integer position 3 (0 1 2 3)
df

Unnamed: 0,A,B,C,D
2013-01-01,2.223518,0.12672,-0.042564,1.371016
2013-01-02,-0.954965,0.778433,0.585936,-0.199295
2013-01-03,0.879113,-1.229092,-0.396789,0.389317
2013-01-04,-0.498247,0.053058,0.604372,-0.126667
2013-01-05,1.889758,-0.323823,0.616003,0.976393
2013-01-06,0.209037,-0.989248,-1.053365,3.409465


In [68]:
# by integer slices: rows at positions 3-4 and columns at positions 0-1
# (unlike label slices, the end positions 5 and 2 are excluded)
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,-0.498247,0.053058
2013-01-05,1.889758,-0.323823
