# Getting Started with pandas
- pandas is designed to work with tabular/heterogeneous data, while numpy is designed to work with homogeneous numerical data

In [1]:
import pandas as pd
import numpy as np

### Series

In [2]:
# A Series is a 1-D array-like object that pairs every element with a label (its index).
obj = pd.Series(data=[2, 4, 6, 8])
obj

0    2
1    4
2    6
3    8
dtype: int64

In [3]:
obj.values # values attribute provides the data itself

array([2, 4, 6, 8])

In [4]:
obj.index # use index to access labels/indices

RangeIndex(start=0, stop=4, step=1)

In [5]:
# Pass explicit labels through the index argument instead of the default 0..n-1.
obj2 = pd.Series(
    data=[1, 2, 3, 4],
    index=['one', 'two', 'three', 'four'],
)
obj2

one      1
two      2
three    3
four     4
dtype: int64

In [6]:
obj2['two'] # now we can access data elements by their label

2

In [7]:
obj2[obj2 > 2] # we can use boolean arrays as indices like we can in numpy

three    3
four     4
dtype: int64

In [8]:
obj2 ** 2 # and we can operate on them the same way as well

one       1
two       4
three     9
four     16
dtype: int64

In [9]:
'two' in obj2 # it is similar to a dict and we can use it the same way

True

In [10]:
adict = {'one':1,'two':2,'three':3,'four':4}
nowaseries = pd.Series(adict)
nowaseries # in this run the keys came out sorted; NOTE: pandas >= 0.23 on Python >= 3.6 keeps the dict's insertion order instead

four     4
one      1
three    3
two      2
dtype: int64

In [11]:
obj2['five'] = 5 # assigning to a label that does not exist yet appends a new element to obj2
obj3 = obj2 - nowaseries # arithmetic aligns the two series on their labels; here the result index comes out sorted
obj3 # 'five' has no counterpart in nowaseries, so its result is NaN (and the dtype becomes float)

five     NaN
four     0.0
one      0.0
three    0.0
two      0.0
dtype: float64

In [12]:
obj3.isnull()# can use .isnull and .notnull to check for missing data values

five      True
four     False
one      False
three    False
two      False
dtype: bool

In [13]:
obj2.name = 'int' # a Series and its index can each be given a name
obj2.index.name = 'str'
obj2

str
one      1
two      2
three    3
four     4
five     5
Name: int, dtype: int64

### DataFrame
- basically a rectangular table of data with an ordered collection of columns (which can have different types)
- think of it as a dict of Series
- internally, data is stored as 2D blocks
- it is only 2D, but we can represent higher dimensional data using hierarchical indexing

In [14]:
dict_of_eq_len_lists = {'state':['Ohio', 'Ohio','Ohio','Nevada','Nevada','Nevada'],
                       'year':[2000,2001,2002,2001,2002,2003],
                       'pop': [1.5,1.7,3.6,2.4,2.9,3.2]}
frame = pd.DataFrame(dict_of_eq_len_lists) # we can also create it from numpy arrays
frame # results displayed in a nice HTML table (thanks Jupyter, very cool)

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [15]:
frame.head() # use head to see only the first 5 rows

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [16]:
pd.DataFrame(dict_of_eq_len_lists, columns=['year','pop','state'])
# change the order of the columns like so

Unnamed: 0,year,pop,state
0,2000,1.5,Ohio
1,2001,1.7,Ohio
2,2002,3.6,Ohio
3,2001,2.4,Nevada
4,2002,2.9,Nevada
5,2003,3.2,Nevada


In [17]:
frame['year'] # Series can be retrieved either as a dict or as an attribute

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [18]:
frame.loc[1] # use loc to get rows by their index

pop       1.7
state    Ohio
year     2001
Name: 1, dtype: object

In [19]:
frame['year'] = np.arange(6) # an existing column can be overwritten by assignment as needed
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


In [20]:
debts = pd.Series([-1.2,-1.5,-1.7],index=[1,3,5])
frame['debt'] = debts # we can assign a series to a column, and the indices are matched
frame # nonrepresented indices appear NaN

Unnamed: 0,pop,state,year,debt
0,1.5,Ohio,0,
1,1.7,Ohio,1,-1.2
2,3.6,Ohio,2,
3,2.4,Nevada,3,-1.5
4,2.9,Nevada,4,
5,3.2,Nevada,5,-1.7


In [21]:
del frame['debt'] # delete columns the same as in python dicts
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


- ops on the columns via indexing return views of the data, not copies

In [22]:
dict_of_dicts = {'Nevada':{2001:2.4,2002:2.9}, 
                'Ohio':{2000:1.5,2001:1.7,2002:3.6}}
frame2 = pd.DataFrame(dict_of_dicts) # can create dataframe from dict of dicts as well
frame2 # in this case, the outer keys are the columns and the inner keys are the row indices

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


- the same method works for dicts of Series

In [23]:
frame2.T # we can transpose dataframes the same way we do numpy arrays

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


In [24]:
frame2.values # use values attribute to turn table data into a 2D array

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

### Index Objects
- dataframe indices are actually their own object and are immutable

In [25]:
indices = pd.Index(['a','b','c'])
indices

Index([u'a', u'b', u'c'], dtype='object')

In [26]:
'a' in indices # Index objects support set-like membership tests; unlike sets they are fixed-size and may hold duplicate labels

True

In [27]:
indices.is_monotonic_increasing # Index also exposes properties such as is_unique and is_monotonic_increasing; the old is_monotonic alias was removed in pandas 2.0

True

### Reindexing
- reindex - to create a new object with the data conformed to a new index

In [28]:
frame # we can reindex the rows and columns of this data

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
3,2.4,Nevada,3
4,2.9,Nevada,4
5,3.2,Nevada,5


In [29]:
frame.reindex(reversed(frame.index)) # here reindex rearranges the rows as specified

Unnamed: 0,pop,state,year
5,3.2,Nevada,5
4,2.9,Nevada,4
3,2.4,Nevada,3
2,3.6,Ohio,2
1,1.7,Ohio,1
0,1.5,Ohio,0


In [30]:
frame.reindex(columns=reversed(frame.columns)) # and we can use the columns option to rearrange columns

Unnamed: 0,year,state,pop
0,0,Ohio,1.5
1,1,Ohio,1.7
2,2,Ohio,3.6
3,3,Nevada,2.4
4,4,Nevada,2.9
5,5,Nevada,3.2


### Drop Entries from an Axis
- there are builtins for deleting data elements from an axis

In [31]:
frame.drop(3) # it is straightforward to use; we can pass it a list of entries to drop as well

Unnamed: 0,pop,state,year
0,1.5,Ohio,0
1,1.7,Ohio,1
2,3.6,Ohio,2
4,2.9,Nevada,4
5,3.2,Nevada,5


In [32]:
# axis=1 targets columns; inplace=True mutates frame and returns None, which is why this cell shows no output.
# NOTE: re-running this cell raises KeyError because 'year' has already been removed.
frame.drop('year', axis=1, inplace=True) # this is how we specify a drop from any axis in-place

In [33]:
frame

Unnamed: 0,pop,state
0,1.5,Ohio
1,1.7,Ohio
2,3.6,Ohio
3,2.4,Nevada
4,2.9,Nevada
5,3.2,Nevada


### Indexing, Selection, and Filtering
- we can index into series similar to how we do with numpy arrays

In [34]:
frame['pop'][[1,3]] # here we get a column and provide a list of indices to collect

1    1.7
3    2.4
Name: pop, dtype: float64

In [35]:
# A small 4x4 integer frame with string row and column labels, reused by the examples that follow.
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [36]:
data[data['three'] > 5] # we can select rows using boolean arrays and slicing as shown below

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [37]:
data[:2] # selecting rows with slicing

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [38]:
data > 6 # create convenient boolean indices by using a scalar comparison

Unnamed: 0,one,two,three,four
Ohio,False,False,False,False
Colorado,False,False,False,True
Utah,True,True,True,True
New York,True,True,True,True


In [39]:
data.loc['Utah',['two','four']]# use loc to select DataFrame rows and columns using labels

two      9
four    11
Name: Utah, dtype: int64

In [40]:
data.iloc[3,3] # use iloc to select rows and columns using ints

15

In [41]:
data.loc['Ohio':'Utah','two':] # iloc and loc work with slices too

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11


In [42]:
ser = pd.Series(np.arange(3.), index=['a','b','c'])
# Plain [] on a Series is label-based. Older pandas fell back to positions when the
# index held no integers (so ser[-1] worked here), but that fallback is deprecated
# since pandas 2.1. iloc is the explicit, unambiguous way to index by position.
ser.iloc[-1]

2.0

### Arithmetic and Data Alignment
- if adding objects and some indices mismatch, then the matches are operated on and any mismatches result in NaN

In [43]:
ser2 = pd.Series(np.arange(4), index=['d','c','b','a'])
ser + ser2 # result is equivalent to automatic outer join on index labels
# note that when operating on series or dataframes, rows and columns are auto aligned

a    3.0
b    3.0
c    3.0
d    NaN
dtype: float64

In [44]:
ser.add(ser2, fill_value=0) # specify a default fill when there could be NaNs

a    3.0
b    3.0
c    3.0
d    0.0
dtype: float64

In [45]:
data - data.iloc[0] # subtracting a row from a dataframe subtracts the row from every row

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,4,4,4,4
Utah,8,8,8,8
New York,12,12,12,12


- this row subtraction effect is called 'broadcasting' and is quite useful
- a series' indices are auto matched with a dataframe's columns and the op is broadcast down the rows

In [46]:
data.sub(data['one'], axis=0)# to broadcast across columns, we use an arithmetic method like so

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,0,1,2,3
Utah,0,1,2,3
New York,0,1,2,3


### Function Application and Mapping
- NumPy ufuncs (element-wise array methods) also work in Pandas

In [47]:
np.max(data.loc['Ohio']) # NumPy functions accept pandas objects directly; np.max is a reduction (not element-wise) and returns the largest value in the Ohio row

3

In [48]:
def func(x):
    """Return the square of the smallest value in x."""
    return x.min() ** 2

data.apply(func) # apply calls func once per column by default; pass axis=1 to call it once per row

one      0
two      1
three    4
four     9
dtype: int64

In [49]:
def form(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x

data.applymap(form) # applymap calls form on every individual element (NOTE: pandas 2.1+ deprecates applymap in favour of DataFrame.map)

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,5.0,6.0,7.0
Utah,8.0,9.0,10.0,11.0
New York,12.0,13.0,14.0,15.0


### Sorting and Ranking

In [50]:
data.sort_index() # sort by row index like so in ascending order

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
New York,12,13,14,15
Ohio,0,1,2,3
Utah,8,9,10,11


In [51]:
data.sort_index(axis=1, ascending=False) # sort by column index in descending order

Unnamed: 0,two,three,one,four
Ohio,1,2,0,3
Colorado,5,6,4,7
Utah,9,10,8,11
New York,13,14,12,15


In [52]:
data['two'].sort_values(ascending=False) # can sort the values in Series

New York    13
Utah         9
Colorado     5
Ohio         1
Name: two, dtype: int64

In [53]:
data.sort_values(by='three', ascending=False) # can sort by the values in a column

Unnamed: 0,one,two,three,four
New York,12,13,14,15
Utah,8,9,10,11
Colorado,4,5,6,7
Ohio,0,1,2,3


- ranking assigns each element its rank, i.e. the position (starting from 1) it would occupy if the data were sorted

In [54]:
nums = pd.Series(['4','3','2','1','0'],index=['four','three','two','one','zero'])
nums.rank() # each data element is given a value corresponding to its rank

four     5.0
three    4.0
two      3.0
one      2.0
zero     1.0
dtype: float64

- there are several tie-breaking methods: average, min, max, first, dense

### Axis Indexes with Duplicate Labels
- it is not necessary for objects to have unique indices; we can check this with the index's is_unique property
- if there are dups, a query returns all duplicates instead of a single one

In [55]:
data.loc['Florida'] = pd.Series([16,17,18,19], index=data.columns) # add a new row like so
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15
Florida,16,17,18,19


### Summarizing and Computing Descriptive Statistics
- there are some good builtins for this (mostly reductions)

In [56]:
data.sum() # returns the column sum

one      40
two      45
three    50
four     55
dtype: int64

In [57]:
data.sum(axis=1) # returns the row sum; skipna (True by default) excludes NaN values from each sum rather than skipping whole rows

Ohio         6
Colorado    22
Utah        38
New York    54
Florida     70
dtype: int64

In [58]:
data.idxmax(axis=1) # idxmax and idxmin return the column label with the max/min value for each row

Ohio        four
Colorado    four
Utah        four
New York    four
Florida     four
dtype: object

In [59]:
data.describe() #produces some common stats for each column label

Unnamed: 0,one,two,three,four
count,5.0,5.0,5.0,5.0
mean,8.0,9.0,10.0,11.0
std,6.324555,6.324555,6.324555,6.324555
min,0.0,1.0,2.0,3.0
25%,4.0,5.0,6.0,7.0
50%,8.0,9.0,10.0,11.0
75%,12.0,13.0,14.0,15.0
max,16.0,17.0,18.0,19.0


- other stat methods include: count, quantile, mad, var, skew, etc

### Correlation and Covariance: Finance Example
- calculated on pairs of objects

In [61]:
import pandas_datareader.data as web  # third-party reader; presumably needs network access to fetch quotes
# One frame of price history per ticker symbol, keyed by symbol.
all_data = {
    symbol: web.get_data_yahoo(symbol)
    for symbol in ['AAPL','IBM','MSFT','GOOG']
}
# Pull a single column out of every ticker's frame; the dict keys become the column labels.
price = pd.DataFrame({symbol: history['Adj Close'] for symbol, history in all_data.items()})
volume = pd.DataFrame({symbol: history['Volume'] for symbol, history in all_data.items()})

In [65]:
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-25,0.033137,0.015914,0.010865,0.009134
2019-01-28,-0.009255,-0.019166,0.002239,-0.019502
2019-01-29,-0.010365,-0.00884,0.000447,-0.020365
2019-01-30,0.068335,0.026815,0.000372,0.033417
2019-01-31,0.007201,0.025077,0.000298,-0.01833


In [69]:
returns['GOOG'].corr(returns['AAPL']) # get the correlation between two Series of data

0.45658613501890905

In [71]:
returns.MSFT.cov(returns['IBM']) # get the covariance between two Series of data (col as obj attribute)

8.769549742878532e-05

In [76]:
returns.corr() # or this gives all correlation data

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.456586,0.371469,0.44954
GOOG,0.456586,1.0,0.408169,0.537488
IBM,0.371469,0.408169,1.0,0.48639
MSFT,0.44954,0.537488,0.48639,1.0


In [78]:
returns.corrwith(returns.IBM) # but we can use more concise corrwith to compare 1 column to all the others

AAPL    0.371469
GOOG    0.408169
IBM     1.000000
MSFT    0.486390
dtype: float64

### Unique Values, Value Counts, and Membership

In [81]:
returns.IBM.value_counts().head() # we can get the frequencies of a series like so

 0.000000    7
-0.002673    1
-0.000161    1
 0.000275    1
 0.006264    1
Name: IBM, dtype: int64

In [83]:
pd.value_counts(returns.AAPL).head() # the top-level pd.value_counts function computes the same frequencies; NOTE: it is deprecated since pandas 2.1 in favour of the Series method

 0.000000    3
-0.037569    1
 0.001630    1
 0.029434    1
 0.010530    1
Name: AAPL, dtype: int64

In [88]:
pd.value_counts(returns.AAPL).head().isin([3]) # isin flags which elements of a Series (here, the counts) appear in the given list

 0.000000     True
-0.037569    False
 0.001630    False
 0.029434    False
 0.010530    False
Name: AAPL, dtype: bool