#Pandas

##What is Pandas?
A Python library providing data structures and analysis tools for labeled, relational, or tabular data.

You could think of it as a more powerful Excel spreadsheet or R dataframe in Python.

This lecture is a very brief introduction intended to prepare you for the sprint.

In [1]:
import pandas as pd

In [2]:
import numpy as np

#Series
One of the essential Pandas data types is the Series.

A Pandas Series as a _labeled_ one-dimensional vector whose entries can contain any type of Python object.

In [20]:
# int
int_series = pd.Series(range(101,110))
print int_series.head()

0    101
1    102
2    103
3    104
4    105
dtype: int64


In [17]:
# float
float_series = pd.Series(np.linspace(0, 10, 100))
print float_series.head()

0    0.00000
1    0.10101
2    0.20202
3    0.30303
4    0.40404
dtype: float64


In [18]:
# string
str_series = pd.Series([x for x in 'abcde'*2], dtype='string')
print str_series.head()

0    a
1    b
2    c
3    d
4    e
dtype: object


In [19]:
# datetime
date_series = pd.Series(pd.date_range('2015-1-1', 
                                      '2015-11-1',
                                      freq='m'))

print date_series.head()


0   2015-01-31
1   2015-02-28
2   2015-03-31
3   2015-04-30
4   2015-05-31
dtype: datetime64[ns]


#DataFrames
Data frames extend the concept of Series to tabular data.

In [68]:
df = pd.DataFrame(np.random.randn(10, 5), 
                  columns=[x for x in 'abcde'])
df

Unnamed: 0,a,b,c,d,e
0,0.879186,-0.48388,-0.193364,0.538595,-0.479076
1,-0.504337,-0.815071,-0.255533,-1.637823,0.962461
2,0.462413,-0.097132,-0.943511,1.720747,-1.114607
3,-0.23239,-2.556239,1.997716,-0.626188,-0.106642
4,0.47979,-1.744325,0.768285,-1.557014,-0.573672
5,-0.599831,-0.21052,-0.449362,0.869437,0.394829
6,0.764018,-0.153589,1.326025,2.292867,0.378869
7,2.852531,0.565726,-0.040722,-0.905428,-1.298709
8,0.553279,0.380876,-0.34989,0.232721,-0.279359
9,1.672047,-2.654748,0.251786,-0.102103,0.265498


In [69]:
# rows and columns in dataframes are both series
column = df.a
print type(column)
row = df.ix[0]
print type(row)

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


## Working with DataFrames

In [71]:
# add a column
df['new'] = 2*df['a'] + df['b']
df

Unnamed: 0,a,b,c,d,e,new
0,0.879186,-0.48388,-0.193364,0.538595,-0.479076,1.274492
1,-0.504337,-0.815071,-0.255533,-1.637823,0.962461,-1.823744
2,0.462413,-0.097132,-0.943511,1.720747,-1.114607,0.827694
3,-0.23239,-2.556239,1.997716,-0.626188,-0.106642,-3.021019
4,0.47979,-1.744325,0.768285,-1.557014,-0.573672,-0.784746
5,-0.599831,-0.21052,-0.449362,0.869437,0.394829,-1.410181
6,0.764018,-0.153589,1.326025,2.292867,0.378869,1.374447
7,2.852531,0.565726,-0.040722,-0.905428,-1.298709,6.270788
8,0.553279,0.380876,-0.34989,0.232721,-0.279359,1.487435
9,1.672047,-2.654748,0.251786,-0.102103,0.265498,0.689346


In [73]:
# delete a column
df.drop('new', axis=1)

Unnamed: 0,a,b,c,d,e
0,0.879186,-0.48388,-0.193364,0.538595,-0.479076
1,-0.504337,-0.815071,-0.255533,-1.637823,0.962461
2,0.462413,-0.097132,-0.943511,1.720747,-1.114607
3,-0.23239,-2.556239,1.997716,-0.626188,-0.106642
4,0.47979,-1.744325,0.768285,-1.557014,-0.573672
5,-0.599831,-0.21052,-0.449362,0.869437,0.394829
6,0.764018,-0.153589,1.326025,2.292867,0.378869
7,2.852531,0.565726,-0.040722,-0.905428,-1.298709
8,0.553279,0.380876,-0.34989,0.232721,-0.279359
9,1.672047,-2.654748,0.251786,-0.102103,0.265498


In [77]:
# select a subset
df[['a', 'b']]

Unnamed: 0,a,b
0,0.879186,-0.48388
1,-0.504337,-0.815071
2,0.462413,-0.097132
3,-0.23239,-2.556239
4,0.47979,-1.744325
5,-0.599831,-0.21052
6,0.764018,-0.153589
7,2.852531,0.565726
8,0.553279,0.380876
9,1.672047,-2.654748


In [78]:
df[df.a > 2*df.b]

Unnamed: 0,a,b,c,d,e,new
0,0.879186,-0.48388,-0.193364,0.538595,-0.479076,1.274492
1,-0.504337,-0.815071,-0.255533,-1.637823,0.962461,-1.823744
2,0.462413,-0.097132,-0.943511,1.720747,-1.114607,0.827694
3,-0.23239,-2.556239,1.997716,-0.626188,-0.106642,-3.021019
4,0.47979,-1.744325,0.768285,-1.557014,-0.573672,-0.784746
6,0.764018,-0.153589,1.326025,2.292867,0.378869,1.374447
7,2.852531,0.565726,-0.040722,-0.905428,-1.298709,6.270788
9,1.672047,-2.654748,0.251786,-0.102103,0.265498,0.689346


In [79]:
df[(df.a > 0) & (df.c < 0)]

Unnamed: 0,a,b,c,d,e,new
0,0.879186,-0.48388,-0.193364,0.538595,-0.479076,1.274492
2,0.462413,-0.097132,-0.943511,1.720747,-1.114607,0.827694
7,2.852531,0.565726,-0.040722,-0.905428,-1.298709,6.270788
8,0.553279,0.380876,-0.34989,0.232721,-0.279359,1.487435


In [80]:
df.describe()

Unnamed: 0,a,b,c,d,e,new
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.632671,-0.77689,0.211143,0.082581,-0.185041,0.488451
std,1.038425,1.156146,0.899133,1.315686,0.709528,2.55778
min,-0.599831,-2.654748,-0.943511,-1.637823,-1.298709,-3.021019
25%,-0.058689,-1.512012,-0.326301,-0.835618,-0.550023,-1.253822
50%,0.516535,-0.3472,-0.117043,0.065309,-0.193,0.75852
75%,0.850394,-0.111247,0.63916,0.786727,0.350526,1.349458
max,2.852531,0.565726,1.997716,2.292867,0.962461,6.270788


#Indexes
Pandas Series and Dataframes have a default integer index. Indexes work like a dictionary key, allowing fast lookups based on the index. Indexes can also be exploited for fast group-bys, merges, time-series operations, etc. You can create custom indexes too.

In [87]:
indexed_series = pd.Series(range(5), 
                           index = ['California', 'Alabama', 
                                    'Indiana', 'Montana', 
                                    'Kentucky'])
alt_indexed_series = pd.Series(range(5, 10),
                               index = ['Washington', 'Alabama', 
                                        'Montana', 'Indiana', 
                                        'New York'])
print indexed_series
print
print alt_indexed_series

California    0
Alabama       1
Indiana       2
Montana       3
Kentucky      4
dtype: int64

Washington    5
Alabama       6
Montana       7
Indiana       8
New York      9
dtype: int64


In [88]:
#Pandas uses the index by default to align series for arithmetic
indexed_series = pd.Series(range(5), 
                           index = ['California', 'Alabama', 
                                    'Indiana', 'Montana', 
                                    'Kentucky'])
alt_indexed_series = pd.Series(range(5, 10),
                               index = ['Washington', 'Alabama', 
                                        'Montana', 'Indiana', 
                                        'New York'])

indexed_series + alt_indexed_series

Alabama        7
California   NaN
Indiana       10
Kentucky     NaN
Montana       10
New York     NaN
Washington   NaN
dtype: float64

In [83]:
#Datetime index
dt_index = pd.date_range('2015-1-1', 
                        '2015-11-1', 
                        freq='m')
df.index = dt_index
df

Unnamed: 0,a,b,c,d,e,new
2015-01-31,0.879186,-0.48388,-0.193364,0.538595,-0.479076,1.274492
2015-02-28,-0.504337,-0.815071,-0.255533,-1.637823,0.962461,-1.823744
2015-03-31,0.462413,-0.097132,-0.943511,1.720747,-1.114607,0.827694
2015-04-30,-0.23239,-2.556239,1.997716,-0.626188,-0.106642,-3.021019
2015-05-31,0.47979,-1.744325,0.768285,-1.557014,-0.573672,-0.784746
2015-06-30,-0.599831,-0.21052,-0.449362,0.869437,0.394829,-1.410181
2015-07-31,0.764018,-0.153589,1.326025,2.292867,0.378869,1.374447
2015-08-31,2.852531,0.565726,-0.040722,-0.905428,-1.298709,6.270788
2015-09-30,0.553279,0.380876,-0.34989,0.232721,-0.279359,1.487435
2015-10-31,1.672047,-2.654748,0.251786,-0.102103,0.265498,0.689346


# SQL-like operations

Pandas dataframes can be combined relationally like SQL tables.

Examples: http://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html