# Getting Started with pandas

pandas provides high-level data structures and manipulation tools designed to make data analysis fast and easy in Python. It is built on top of NumPy, which makes it easy to use in NumPy-centric applications.

In [1]:
import pandas as pd
import numpy as np

## Introduction to pandas Data Structures

### Series

In [2]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [3]:
obj.values

array([ 4,  7, -5,  3])

Index

In [4]:
# Build a Series whose rows carry explicit string labels instead of the
# default 0..n-1 integer index.
values = [4, 7, -5, 3]
labels = ['d', 'b', 'a', 'c']
obj2 = pd.Series(data=values, index=labels)
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [5]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [7]:
obj2['a']

-5

In [8]:
obj2[['a', 'b']]

a   -5
b    7
dtype: int64

In [9]:
obj2['d'] = 6

In [10]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [11]:
obj2[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [12]:
obj2 > 0

d     True
b     True
a    False
c     True
dtype: bool

In [14]:
obj2[obj2 > 0]

d    6
b    7
c    3
dtype: int64

In [17]:
obj3 = pd.Series(
    ["APPL1", "GGLE", "AMZN", "MCRS"],
    index = ['d', 'b', 'a', 'c']
                )
obj3

d    APPL1
b     GGLE
a     AMZN
c     MCRS
dtype: object

In [None]:
ticker = 'APPL'

In [22]:
obj3[obj3.str.contains('APPL.*|GG', regex=True)]

d    APPL1
b     GGLE
dtype: object

In [25]:
obj2 * 2

d    12
b    14
a   -10
c     6
dtype: int64

In [26]:
obj2

d    6
b    7
a   -5
c    3
dtype: int64

In [27]:
np.exp(obj2)

d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [28]:
np.log(obj2)

  """Entry point for launching an IPython kernel.


d    1.791759
b    1.945910
a         NaN
c    1.098612
dtype: float64

In [None]:
# Double quotes let the string contain an apostrophe without escaping it;
# writing it with single quotes ends the literal early and is a SyntaxError.
"it's"

"it's"

From dictionary to series

In [36]:
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000
}

obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [37]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states) 
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [39]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [40]:
pd.isnull(obj4).sum()

1

In [45]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [46]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [47]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [49]:
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [51]:
obj4[obj4.notnull()]

Ohio      35000.0
Oregon    16000.0
Texas     71000.0
dtype: float64

In [52]:
obj4[~obj4.notnull()]

California   NaN
dtype: float64

### DataFrame

A DataFrame represents a tabular, spreadsheet-like data structure containing an ordered collection of columns, each of which can be a different value type (numeric, string, boolean, etc.). The DataFrame has both a row and column index; it can be thought of as a dict of Series, all sharing the same index. Compared with other DataFrame-like structures you may have used before (like R’s data.frame), row-oriented and column-oriented operations in DataFrame are treated roughly symmetrically. Under the hood, the data is stored as one or more two-dimensional blocks rather than a list, dict, or some other collection of one-dimensional arrays.

In [55]:
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

frame = pd.DataFrame(data)

In [56]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [61]:
frame.head(5)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [63]:
frame.tail(2)

Unnamed: 0,state,year,pop
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [66]:
pd.DataFrame(data,
             columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [67]:
frame2 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [73]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [74]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [76]:
frame2.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [78]:
frame2[['state', 'pop']]

Unnamed: 0,state,pop
one,Ohio,1.5
two,Ohio,1.7
three,Ohio,3.6
four,Nevada,2.4
five,Nevada,2.9
six,Nevada,3.2


In [70]:
list(frame2)

['year', 'state', 'pop', 'debt']

In [72]:
# Walk the column labels in order and print each column as a Series.
for column_name in frame2.columns:
    print(frame2[column_name])

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: pop, dtype: float64
one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
six      NaN
Name: debt, dtype: object


In [87]:
frame2.loc['three']

year         2002
state        Ohio
pop           3.6
debt            2
emp      0.633745
Name: three, dtype: object

In [89]:
frame2['A'] = 5
frame2

Unnamed: 0,year,state,pop,debt,emp,A
one,2000,Ohio,1.5,0.0,0.328463,5
two,2001,Ohio,1.7,1.0,1.565648,5
three,2002,Ohio,3.6,2.0,0.633745,5
four,2001,Nevada,2.4,3.0,-0.153059,5
five,2002,Nevada,2.9,4.0,-0.657072,5
six,2003,Nevada,3.2,5.0,-0.958692,5


In [84]:
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [86]:
frame2['emp'] = np.random.normal(0, 1, 6)
frame2

Unnamed: 0,year,state,pop,debt,emp
one,2000,Ohio,1.5,0.0,0.328463
two,2001,Ohio,1.7,1.0,1.565648
three,2002,Ohio,3.6,2.0,0.633745
four,2001,Nevada,2.4,3.0,-0.153059
five,2002,Nevada,2.9,4.0,-0.657072
six,2003,Nevada,3.2,5.0,-0.958692


### Index Objects

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/02_pandas.png?raw=true)

In [92]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj

a    0
b    1
c    2
dtype: int64

In [94]:
obj[1:]

b    1
c    2
dtype: int64

In [96]:
obj[-2:]

b    1
c    2
dtype: int64

obj.index[1] = 'd'  # raises TypeError: Index objects are immutable

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/01_pandas.png?raw=true)

## Essential Functionality



### Reindexing

In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

In [None]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

In [None]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

- `ffill` or `pad`: fill (or carry) values forward
- `bfill` or `backfill`: fill (or carry) values backward

In [None]:
obj3.reindex(range(6), method='ffill')

In [None]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame

In [None]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

In [None]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/03_pandas.png?raw=true)

### Dropping Entries from an Axis

In [97]:
obj = pd.Series(np.arange(5.),
                index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [98]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [99]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [100]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [103]:
data.drop(['Colorado', 'Ohio'], axis = 0)

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [104]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [105]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [106]:
data.drop(columns = ['two', 'four'])

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


### Indexing, Selection, and Filtering

In [None]:
obj = pd.Series(np.arange(4.),
                index=['a', 'b', 'c', 'd']
               )
obj


In [None]:
obj['b']


In [None]:
# Select the second element by position. .iloc makes the positional intent
# explicit; an integer key inside [] on a string-labelled Series is deprecated
# in recent pandas releases.
obj.iloc[1]


In [None]:
obj[2:4]


In [None]:
obj[['b', 'a', 'd']]


In [None]:
# Select the second and fourth elements by position. .iloc is used because
# integer keys inside [] on a string-labelled Series are deprecated in recent
# pandas releases.
obj.iloc[[1, 3]]


In [None]:
obj[obj < 2]

In [None]:
obj['b':'c']

In [107]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [None]:
data['two']

In [None]:
data[['three', 'one']]

In [None]:
data[:2]


In [None]:
data[data['three'] > 5]

In [None]:
data < 5


In [None]:
data[data < 5] = 0
data

#### Selection with loc and iloc

In [109]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [108]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [110]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [111]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [112]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,4,5
Utah,11,8,9


In [113]:
data.loc[:'Utah', 'two']


Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int64

In [114]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,4,5,6
Utah,8,9,10
New York,12,13,14


### Integer Indexes

In [None]:
ser = pd.Series(np.arange(3.))
ser

In [None]:
# With a non-integer index, "last element" can only mean a position, so ask
# for it explicitly with .iloc; an integer key inside [] on a string-labelled
# Series is deprecated in recent pandas releases.
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2.iloc[-1]

In [None]:
ser[:1]


In [None]:
ser.loc[:1]


In [None]:
ser.iloc[:1]

### Arithmetic and Data Alignment

In [115]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
               index=['a', 'c', 'e', 'f', 'g'])
s1


a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [116]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [117]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [119]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [120]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


Adding these together returns a DataFrame whose index and columns are the unions of the ones in each DataFrame:

In [121]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [122]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1


Unnamed: 0,A
0,1
1,2


In [123]:
df2

Unnamed: 0,B
0,3
1,4


In [124]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


#### Arithmetic methods with fill values

In [125]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [126]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [127]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


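With `fill_value=0`, a value that is missing from one of the two DataFrames is treated as 0 before the addition, so the result is NaN only where the value is missing from both.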

In [128]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [129]:
df1.add(df2, fill_value=10)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,14.0
1,9.0,11.0,13.0,15.0,19.0
2,18.0,20.0,22.0,24.0,24.0
3,25.0,26.0,27.0,28.0,29.0


![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/04_pandas.png?raw=true)

#### Operations between DataFrame and Series

In [None]:
arr = np.arange(12.).reshape((3, 4))
arr


In [None]:
arr[0]

In [None]:
arr - arr[0]

In [None]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame


In [None]:
series

In [None]:
frame - series

In [None]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

In [None]:
series3 = frame['d']
frame


In [None]:
series3


In [None]:
frame.sub(series3, axis='index')

### Function Application and Mapping

In [130]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,-0.13397,-0.675093,0.394376
Ohio,-1.484653,-1.489163,0.458257
Texas,0.417975,2.754439,1.982142
Oregon,1.531262,0.644138,-0.282038


In [133]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.13397,0.675093,0.394376
Ohio,1.484653,1.489163,0.458257
Texas,0.417975,2.754439,1.982142
Oregon,1.531262,0.644138,0.282038


Lambda functions

In [141]:
for i in list(frame):
    print(frame[i].max() - frame[i].min())

3.0159145963849214
4.243602337329856
2.2641804153770053


In [135]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
frame.apply(f)

b    3.015915
d    4.243602
e    2.264180
dtype: float64

In [139]:
frame.apply(lambda x: x.max() - x.min(), axis =0)

b    3.015915
d    4.243602
e    2.264180
dtype: float64

In [140]:
frame.apply(lambda x: x.max() - x.min(), axis =1)

Utah      1.069469
Ohio      1.947420
Texas     2.336464
Oregon    1.813300
dtype: float64

In [None]:
frame.apply(f, axis='columns')

In [None]:
def f(x):
    """Return the smallest and largest value of ``x`` as a two-element Series.

    The result is labelled 'min' and 'max', in that order.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
frame.apply(f)

Element-wise Python functions can be used, too. Suppose you wanted to compute a formatted string from each floating point value in frame. You can do this with applymap: 

In [None]:
# Format a single number as a string with two decimal places.
# NOTE(review): this rebinds the name `format`, hiding Python's built-in
# format() for the rest of the session; the next cell relies on this binding.
format = lambda x: '%.2f' % x
# applymap calls `format` on every element of the frame.
# NOTE(review): pandas 2.1 deprecates applymap in favor of DataFrame.map —
# confirm against the pandas version in use before switching.
frame.applymap(format)

In [None]:
frame['e'].map(format)

### Sorting and Ranking

In [142]:
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [144]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [145]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [146]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [147]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [149]:
obj = pd.Series([4, 7, -3, 2])
obj

0    4
1    7
2   -3
3    2
dtype: int64

In [150]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [151]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [152]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [153]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [154]:
frame.sort_values(by=['a', 'b'],
                  ascending = [True, False])

Unnamed: 0,b,a
0,4,0
2,-3,0
1,7,1
3,2,1


In [156]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [157]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

In [158]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [None]:
# Assign tie values the maximum rank in the group
obj.rank(ascending=False, method='max')

In [166]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [None]:
frame.rank(axis='columns')

### Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj

In [None]:
obj.index.is_unique

In [None]:
obj['a']

In [None]:
obj['c']

In [None]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df

In [None]:
df.loc['b']

## Summarizing and Computing Descriptive Statistics

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/05_pandas.png?raw=true)

In [159]:
df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/06_pandas.png?raw=true)

In [160]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [161]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [162]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [163]:
df.idxmax()

one    b
two    d
dtype: object

In [164]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [165]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [172]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,-2.5


In [176]:
# Row-over-row relative change, spelled out by hand: (current - previous) / previous.
previous_row = frame.shift(1)
(frame - previous_row) / previous_row

Unnamed: 0,b,a,c
0,,,
1,0.627907,inf,-3.5
2,-1.428571,-1.0,0.6
3,-1.666667,inf,-1.3125


In [178]:
frame.shift(-1).fillna(frame.loc[0])

Unnamed: 0,b,a,c
0,7.0,1.0,5.0
1,-3.0,0.0,8.0
2,2.0,1.0,-2.5
3,4.3,0.0,-2.0


In [169]:
frame.pct_change()

Unnamed: 0,b,a,c
0,,,
1,0.627907,inf,-3.5
2,-1.428571,-1.0,0.6
3,-1.666667,inf,-1.3125


In [None]:
obj = pd.Series(['a', 'a', 'b', 'c'] * 4)
obj.describe()


![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/07_pandas.png?raw=true)

### Correlation and Covariance

Download dataset

```
svn export https://github.com/wesm/pydata-book/trunk/datasets
```

conda install pandas-datareader

In [179]:
import os
os.getcwd()

'/Users/thomas/Google Drive/Projects/Data_science/GitHub/Repositories/PythonTeaching/Lectures/05_Pandas'

In [181]:
# Load the cached price and volume tables from pickle files; the bare file
# names are resolved relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files you trust.
price = pd.read_pickle('yahoo_price.pkl')
volume = pd.read_pickle('yahoo_volume.pkl')

In [182]:
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [183]:
volume.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400


import pandas_datareader.data as web
all_data = {ticker: web.get_data_yahoo(ticker)
            for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

price = pd.DataFrame({ticker: data['Adj Close']
                     for ticker, data in all_data.items()})
volume = pd.DataFrame({ticker: data['Volume']
                      for ticker, data in all_data.items()})

In [185]:
returns = price.pct_change()
returns.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,,,,
2010-01-05,0.001729,-0.004404,-0.01208,0.000323
2010-01-06,-0.015906,-0.025209,-0.006496,-0.006137
2010-01-07,-0.001849,-0.02328,-0.003462,-0.0104
2010-01-08,0.006648,0.013331,0.010035,0.006897


In [190]:
# Day-over-day absolute change per column: each row minus the row before it.
price.diff().head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,,,,
2010-01-05,0.048392,-1.378624,-1.368714,0.008362
2010-01-06,-0.445992,-7.857159,-0.727139,-0.1589
2010-01-07,-0.051007,-7.072936,-0.384951,-0.267622
2010-01-08,0.183106,3.956059,1.11209,0.175627


In [198]:
returns[['MSFT', 'IBM']].corr()

Unnamed: 0,MSFT,IBM
MSFT,1.0,0.499764
IBM,0.499764,1.0


In [199]:
returns['MSFT'].corr(returns['IBM'])

0.4997636114415114

In [200]:
returns['MSFT'].cov(returns['IBM'])

8.870655479703546e-05

In [None]:
returns.MSFT.corr(returns.IBM)

In [209]:
# NOTE: despite its name, `cov` holds the correlation matrix returned by
# returns.corr(), not a covariance matrix. Later cells refer to it by this name.
cov = returns.corr()

Using DataFrame’s corrwith method, you can compute pairwise correlations between a DataFrame’s columns or rows with another Series or DataFrame

In [None]:
returns.corrwith(returns.IBM)

In [None]:
returns.corrwith(volume)

### Unique Values, Value Counts, and Membership

In [201]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])

In [202]:
uniques = obj.unique()
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [204]:
obj.value_counts()

a    3
c    3
b    2
d    1
dtype: int64

In [205]:
 obj.nunique()

4

In [None]:
# Count occurrences in a plain array by wrapping it in a Series first; the
# top-level pd.value_counts function is deprecated in recent pandas releases.
pd.Series(obj.values).value_counts(sort=False)

In [206]:
mask = obj.isin(['b', 'c'])
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [207]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [211]:
cov

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [214]:
cov[cov.index.isin(["MSFT", "AAPL"])]

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
MSFT,0.389695,0.465919,0.499764,1.0


In [215]:
cov[~cov.index.isin(["MSFT", "AAPL"])]

Unnamed: 0,AAPL,GOOG,IBM,MSFT
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764


In [None]:
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])
unique_vals = pd.Series(['c', 'b', 'a'])
pd.Index(unique_vals).get_indexer(to_match)

In [None]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

In [None]:
# Count how often each value occurs in every column. Values absent from a
# column come back as NaN and are replaced with 0. The Series method is used
# because the top-level pd.value_counts function is deprecated in recent pandas.
result = data.apply(pd.Series.value_counts).fillna(0)
result

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/08_pandas.png?raw=true)

## Handling Missing Data 

Missing data is common in most data analysis applications. One of the goals in designing pandas was to make working with missing data as painless as possible.

In [None]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

In [None]:
string_data.isnull() 

![](https://github.com/thomaspernet/PythonTeaching/blob/master/images/09_pandas.png?raw=true)

## Filtering Out Missing Data 

In [None]:
data = pd.Series([1, np.nan, 3.5, np.nan, 7]) 
data

In [None]:
data.dropna() 

In [None]:
data[data.notnull()] 

In [216]:
data = pd.DataFrame([[1., 6.5, 3.], [1., np.nan, np.nan],
                  [np.nan, np.nan, np.nan], [np.nan, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [218]:
cleaned = data.dropna() 
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [220]:
data.dropna( subset = [0]) 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,


Passing how='all' will only drop rows that are all NA: 

In [221]:
data.dropna(how='all') 

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


## Filling in Missing Data 

In [None]:
df.fillna(0)

In [None]:
# The dict maps column labels to per-column fill values. Keys must match the
# columns of df ('one' and 'two'); keys that match no column are ignored, so
# the previous keys 1 and 3 filled nothing.
df.fillna({'one': 0.5, 'two': -1})

fillna returns a new object, but you can modify the existing object in place: 

In [None]:
df.fillna(0, inplace=True) 

## Hierarchical Indexing 

Hierarchical indexing is an important feature of pandas enabling you to have multiple (two or more) index levels on an axis. Somewhat abstractly, it provides a way for you to work with higher dimensional data in a lower dimensional form. 

In [222]:
data = pd.Series(np.random.randn(10),
              index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],
                     [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]]) 
data

a  1    0.519353
   2    0.057696
   3   -1.119982
b  1   -1.206614
   2    1.048662
   3    1.775189
c  1    1.744777
   2    1.005374
d  2    0.665538
   3   -0.494339
dtype: float64

What you’re seeing is a prettified view of a Series with a MultiIndex as its index. The “gaps” in the index display mean “use the label directly above”: 

In [None]:
data.index 

With a hierarchically-indexed object, so-called partial indexing is possible, enabling you to concisely select subsets of the data: 

In [None]:
data['b'] 

In [224]:
# Partial indexing: a single outer-level label selects every row under it.
data.loc['a']

1    0.519353
2    0.057696
3   -1.119982
dtype: float64

In [225]:
# Give one label per index level (outer, then inner) to reach a single value.
data.loc['a', 1]

0.5193533502180727

In [226]:
# The same lookup with the full key written as one explicit (outer, inner) tuple.
data.loc[('a', 1)]

0.5193533502180727

Hierarchical indexing plays a critical role in reshaping data and group-based operations like forming a pivot table. For example, this data could be rearranged into a DataFrame using its unstack method: 

PS: we will see more about unstack in the next lecture

In [227]:
data.unstack() 

Unnamed: 0,1,2,3
a,0.519353,0.057696,-1.119982
b,-1.206614,1.048662,1.775189
c,1.744777,1.005374,
d,,0.665538,-0.494339


With a DataFrame, either axis can have a hierarchical index: 

In [228]:
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
                  index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                  columns=[['Ohio', 'Ohio', 'Colorado'],
                            ['Green', 'Red', 'Green']]) 
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [229]:
frame.index.names = ['key1', 'key2'] 
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [230]:
frame.columns.names = ['state', 'color'] 
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [231]:
frame['Ohio'] 

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


## Summary Statistics by Level 

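Many descriptive and summary statistics on DataFrame and Series can be computed per index level on a particular axis. Older pandas releases exposed this through a `level` option on the reduction itself; that option was removed in pandas 2.0, where the equivalent is to group by the level first (for example `frame.groupby(level='key2').sum()`). Consider the above DataFrame; we can sum by level on either the rows or columns like so: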

In [234]:
# Sum the rows that share the same 'key2' label. Grouping by the index level
# replaces the `level=` keyword of sum(), which was removed in pandas 2.0.
frame.groupby(level='key2').sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [235]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [233]:
# Sum the columns that share the same 'color' label. Transpose so the column
# levels become row levels, group and sum, then transpose back; this avoids
# both the removed `level=` keyword of sum() and column-wise groupby.
frame.T.groupby(level='color').sum().T

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


## Using a DataFrame’s Columns 

In [236]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                   'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                   'd': [0, 1, 2, 0, 1, 2, 3]}) 
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [237]:
frame2 = frame.set_index(['c', 'd']) 
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [238]:
frame.set_index(['c', 'd'], drop=False) 

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [239]:
frame2.reset_index() 

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1
