# Getting Started with pandas

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## Introduction to pandas data structures

### Series

In [3]:
# A Series built from a bare list gets a default integer index.
obj = pd.Series([4, 5, 6])
obj
obj.values  # the data as a NumPy array
obj.index   # the default RangeIndex covering positions 0..2

0    4
1    5
2    6
dtype: int64

array([4, 5, 6], dtype=int64)

RangeIndex(start=0, stop=3, step=1)

In [5]:
# Same data as before, but every value is labelled with an explicit string key.
obj2 = pd.Series([4, 5, 6], index=list('abc'))
obj2
obj2.values  # the values are unaffected by the custom labels
obj2.index   # the labels supplied above

a    4
b    5
c    6
dtype: int64

array([4, 5, 6], dtype=int64)

Index(['a', 'b', 'c'], dtype='object')

In [9]:
obj2['b']      # label lookup returns the scalar stored under 'b'
obj2[obj2>5]   # a boolean mask keeps only the entries greater than 5

5

c    6
dtype: int64

In [12]:
obj2 * 2       # element-wise arithmetic keeps the index labels
np.exp(obj2)   # NumPy ufuncs also return a labelled Series (float64 here)

a     8
b    10
c    12
dtype: int64

a     54.598150
b    148.413159
c    403.428793
dtype: float64

In [13]:
'b' in obj2   # True: 'b' is one of the index labels of obj2

True

In [17]:
# NOTE(review): 'Dalls' looks like a typo for 'Dallas'; it is left unchanged because
# the recorded outputs and the following cells use the same spelling.
sdata = {'Houston': 200, 'Dalls': 300, 'Austin': 400}
sdata
# The dict keys become the index. The recorded output shows them sorted; pandas >= 0.23
# on Python >= 3.6 is documented to keep the dict's insertion order instead.
obj3 = pd.Series(sdata)
obj3

{'Austin': 400, 'Dalls': 300, 'Houston': 200}

Austin     400
Dalls      300
Houston    200
dtype: int64

In [19]:
city = ['Houston', 'Dalls', 'Austin', 'Beijing']
# An explicit index picks and orders the dict entries; 'Beijing' has no entry in
# sdata, so it becomes NaN and the result is float64.
obj4 = pd.Series(sdata, index=city)
obj4

Houston    200.0
Dalls      300.0
Austin     400.0
Beijing      NaN
dtype: float64

In [26]:
pd.isnull(obj4)   # True only for the missing 'Beijing' entry
obj4.notnull()    # the element-wise complement of the mask above

Houston    False
Dalls      False
Austin     False
Beijing     True
dtype: bool

Houston     True
Dalls       True
Austin      True
Beijing    False
dtype: bool

In [27]:
# Arithmetic aligns on index labels, much like a join: 'Beijing' appears only in
# obj4 (as NaN), so its sum is NaN.
obj3 + obj4

Austin     800.0
Beijing      NaN
Dalls      600.0
Houston    400.0
dtype: float64

In [29]:
# Assigning to .index relabels the Series in place; the values keep their positions.
# NOTE(review): "Shengzhen" is presumably meant to be "Shenzhen" - confirm before changing.
obj3.index = ["Beijing", "Shanghai", "Shengzhen"]
obj3

Beijing      400
Shanghai     300
Shengzhen    200
dtype: int64

### DataFrame

In [33]:
# Column-oriented input: each key becomes a column, each list holds that column's rows.
data = dict(
    state=['Texas', 'Ohio', 'Maine', 'Massachusetts'],
    year=[2000, 2001, 2002, 2002],
    pop=[1.5, 1.3, 1.8, 2.4],
)
frame = pd.DataFrame(data)
frame
frame.head()   # first rows only (all of them here, the frame has just four)
# An explicit `columns` list fixes the column order; 'gdp' is not in data, so it is all-NaN.
pd.DataFrame(data, columns=['year', 'state', 'pop', 'gdp'])

Unnamed: 0,pop,state,year
0,1.5,Texas,2000
1,1.3,Ohio,2001
2,1.8,Maine,2002
3,2.4,Massachusetts,2002


Unnamed: 0,pop,state,year
0,1.5,Texas,2000
1,1.3,Ohio,2001
2,1.8,Maine,2002
3,2.4,Massachusetts,2002


Unnamed: 0,year,state,pop,gdp
0,2000,Texas,1.5,
1,2001,Ohio,1.3,
2,2002,Maine,1.8,
3,2002,Massachusetts,2.4,


In [36]:
frame['state']   # column by key -> Series named 'state'
frame.year       # attribute-style access to the 'year' column
frame.loc[1]     # row with index label 1 -> Series named 1

0            Texas
1             Ohio
2            Maine
3    Massachusetts
Name: state, dtype: object

0    2000
1    2001
2    2002
3    2002
Name: year, dtype: int64

pop       1.3
state    Ohio
year     2001
Name: 1, dtype: object

In [43]:
frame['gdp'] = 100                        # a scalar is broadcast to every row
frame
frame['year'] = np.arange(2014, 2018)     # overwrite the existing column, one value per row
frame['south'] = frame.state == 'Texas'   # assigning to an unknown key creates a new boolean column
frame

Unnamed: 0,pop,state,year,gdp,south
0,1.5,Texas,2014,100,True
1,1.3,Ohio,2015,100,False
2,1.8,Maine,2016,100,False
3,2.4,Massachusetts,2017,100,False


Unnamed: 0,pop,state,year,gdp,south
0,1.5,Texas,2014,100,True
1,1.3,Ohio,2015,100,False
2,1.8,Maine,2016,100,False
3,2.4,Massachusetts,2017,100,False


In [45]:
del frame['gdp']   # removes the 'gdp' column from frame in place
frame

Unnamed: 0,pop,state,year,south
0,1.5,Texas,2014,True
1,1.3,Ohio,2015,False
2,1.8,Maine,2016,False
3,2.4,Massachusetts,2017,False


In [49]:
# Nested dicts: the outer keys become columns and the inner keys become the row index.
# A year that is missing for a state shows up as NaN.
pop = {'Nevada': {2001: 1.4, 2002: 2.6},
       'Georgia':{2000: 1.6, 2001: 2.4, 2003: 2.5}}
frame3 = pd.DataFrame(pop)
frame3
frame3.T   # transpose: states as rows, years as columns

Unnamed: 0,Georgia,Nevada
2000,1.6,
2001,2.4,1.4
2002,,2.6
2003,2.5,


Unnamed: 0,2000,2001,2002,2003
Georgia,1.6,2.4,,2.5
Nevada,,1.4,2.6,


In [52]:
# Dict of Series: the resulting frame's index is the union of the truncated Series indexes.
pdata = {
    'Nevada': frame3['Nevada'].iloc[:-1],    # every row except the last one
    'Georgia': frame3['Georgia'].iloc[:1],   # the first row only
}
pd.DataFrame(pdata)

Unnamed: 0,Georgia,Nevada
2000,1.6,
2001,,1.4
2002,,2.6


In [59]:
frame3.index     # row labels (the years)
frame3.values    # the data as a 2-D NumPy array, NaN where a value is missing
frame3.columns   # column labels (the states)

Int64Index([2000, 2001, 2002, 2003], dtype='int64')

array([[ 1.6,  nan],
       [ 2.4,  1.4],
       [ nan,  2.6],
       [ 2.5,  nan]])

Index(['Georgia', 'Nevada'], dtype='object')

### index objects

In [60]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
obj
index = obj.index   # the Index object that holds the axis labels
index
index[1:]           # slicing an Index yields another Index

a    0
b    1
c    2
dtype: int32

Index(['a', 'b', 'c'], dtype='object')

Index(['b', 'c'], dtype='object')

In [62]:
# The index may be any sequence of labels; here an integer range is passed explicitly.
obj = pd.Series(['a', 'b', 'c'], index=range(3))
obj

0    a
1    b
2    c
dtype: object

## Essential functionality

### reindexing

In [64]:
obj = pd.Series([3.2, 4.2, 5.3, 6.1], index=['d', 'c', 'a', 'b'])
obj
# reindex returns a new Series arranged by the given labels; labels that obj does
# not have ('e' and 'f') are filled with NaN.
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e', 'f'])
obj2

d    3.2
c    4.2
a    5.3
b    6.1
dtype: float64

a    5.3
b    6.1
c    4.2
d    3.2
e    NaN
f    NaN
dtype: float64

In [69]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 3, 5])
obj3
obj3.reindex(range(8), method='ffill')   # gaps take the last preceding value
obj3.reindex(range(8), method='bfill')   # gaps take the next following value; nothing follows 5, so 6 and 7 are NaN
obj3                                     # reindex returned new objects; obj3 itself is unchanged

0      blue
3    purple
5    yellow
dtype: object

0      blue
1      blue
2      blue
3    purple
4    purple
5    yellow
6    yellow
7    yellow
dtype: object

0      blue
1    purple
2    purple
3    purple
4    yellow
5    yellow
6       NaN
7       NaN
dtype: object

0      blue
3    purple
5    yellow
dtype: object

In [75]:
frame = pd.DataFrame(np.arange(9).reshape(3, 3), index=[0, 3, 6],
                     columns=['Ohio', 'Texas', 'Maine'])
frame
# Reindexing the rows adds labels 1, 2, 4, 5 as all-NaN rows (values become float).
frame2 = frame.reindex(np.arange(7))
frame2

states = ['Texas', 'Ohio', 'Michigan']
# Reindexing the columns reorders them; unknown 'Michigan' is NaN and 'Maine' is left out.
frame3 = frame2.reindex(columns=states)
frame3

Unnamed: 0,Ohio,Texas,Maine
0,0,1,2
3,3,4,5
6,6,7,8


Unnamed: 0,Ohio,Texas,Maine
0,0.0,1.0,2.0
1,,,
2,,,
3,3.0,4.0,5.0
4,,,
5,,,
6,6.0,7.0,8.0


Unnamed: 0,Texas,Ohio,Michigan
0,1.0,0.0,
1,,,
2,,,
3,4.0,3.0,
4,,,
5,,,
6,7.0,6.0,


In [77]:
# Reindex rows and columns in a single call. The earlier spelling,
# frame.loc[np.arange(7), states], raises KeyError on pandas >= 1.0 because .loc
# no longer accepts list-likes that contain missing labels (rows 1, 2, 4, 5 and
# the 'Michigan' column do not exist in frame). reindex fills those with NaN.
frame.reindex(index=np.arange(7), columns=states)

Unnamed: 0,Texas,Ohio,Michigan
0,1.0,0.0,
1,,,
2,,,
3,4.0,3.0,
4,,,
5,,,
6,7.0,6.0,


### dropping entries from an axis

In [82]:
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj2 = obj.drop('c')          # new Series without label 'c'
obj2
obj3 = obj.drop(['b', 'c'])   # a list drops several labels at once
obj3
obj                           # the original Series still has all five entries

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

a    0.0
d    3.0
e    4.0
dtype: float64

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [98]:
frame = pd.DataFrame(np.arange(16).reshape((4, 4)),
                     index=['Ohio', 'Texas', 'Utah', 'Washington'],
                     columns=['one', 'two', 'three', 'four'])
frame
frame.drop(['Ohio','Utah'])            # drop rows by label; returns a new frame
frame.drop('Texas')                    # a single label works too
# `axis` has to be passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
frame.drop(['two', 'three'], axis=1)   # drop columns instead of rows
frame                                  # none of the calls above modified frame
frame.drop(['Texas', 'Washington'], inplace=True)    # change the original data frame
frame

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Texas,4,5,6,7
Utah,8,9,10,11
Washington,12,13,14,15


Unnamed: 0,one,two,three,four
Texas,4,5,6,7
Washington,12,13,14,15


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Utah,8,9,10,11
Washington,12,13,14,15


Unnamed: 0,one,four
Ohio,0,3
Texas,4,7
Utah,8,11
Washington,12,15


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Texas,4,5,6,7
Utah,8,9,10,11
Washington,12,13,14,15


Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Utah,8,9,10,11


### indexing, selection, and filtering

In [3]:
# Four integers labelled 'a'..'d'; used by the selection examples that follow.
obj = pd.Series(np.arange(4), index=list('abcd'))
obj

a    0
b    1
c    2
d    3
dtype: int32

In [11]:
obj['a']              # label-based scalar lookup
# Positional access goes through .iloc: integer keys in obj[...] on a
# string-labelled Series rely on a positional fallback that pandas has deprecated.
obj.iloc[1]           # second element
obj.iloc[1:4]         # positional slice, stop position excluded
obj.iloc[[1, 2, 3]]   # list of positions
obj[['a', 'b']]       # list of labels
obj[obj < 3]          # boolean mask
obj['a':'c']          # label slices include the end label 'c', unlike positional slices

0

1

b    1
c    2
d    3
dtype: int32

b    1
c    2
d    3
dtype: int32

a    0
b    1
dtype: int32

a    0
b    1
c    2
dtype: int32

a    0
b    1
c    2
dtype: int32

In [13]:
obj['b':] = 5   # assigning through a label slice overwrites 'b' to the end, in place
obj

a    0
b    5
c    5
d    5
dtype: int32

In [17]:
# A 4x4 integer frame with state row labels; reused by the selection cells below.
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=['Texas', 'Ohio', 'Utah', 'Maine'],
                    columns=['one', 'two', 'three', 'four'])
data
data['two']              # a single key selects one column as a Series
data[['three', 'one']]   # a list of keys selects columns in the order given

Unnamed: 0,one,two,three,four
Texas,0,1,2,3
Ohio,4,5,6,7
Utah,8,9,10,11
Maine,12,13,14,15


Texas     1
Ohio      5
Utah      9
Maine    13
Name: two, dtype: int32

Unnamed: 0,three,one
Texas,2,0
Ohio,6,4
Utah,10,8
Maine,14,12


In [32]:
data[2:3]                 # an integer slice selects rows by position (only 'Utah' here)
data['three'] > 7         # boolean Series with one flag per row
data[data['three'] > 7]   # a boolean Series filters rows
data < 5                  # element-wise comparison -> boolean DataFrame

Unnamed: 0,one,two,three,four
Utah,8,9,10,11


Texas    False
Ohio     False
Utah      True
Maine     True
Name: three, dtype: bool

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
Maine,12,13,14,15


Unnamed: 0,one,two,three,four
Texas,True,True,True,True
Ohio,True,False,False,False
Utah,False,False,False,False
Maine,False,False,False,False


In [38]:
data.loc['Ohio', ['three', 'one']]         # one row label, two column labels -> Series named 'Ohio'
data.loc['Ohio', ['three', 'one']].index   # the selected column labels become that Series' index

three    6
one      4
Name: Ohio, dtype: int32

Index(['three', 'one'], dtype='object')

In [41]:
data.iloc[[1], [2]]   # list selectors keep both dimensions -> 1x1 DataFrame
data.iloc[1, 2]       # scalar positions return the bare value (6)

Unnamed: 0,three
Ohio,6


6

In [45]:
data.loc[:'Utah', 'two']            # a label slice on the rows includes the end label 'Utah'
data.iloc[:, :3][data['three']>5]   # first three columns, then only rows where 'three' > 5
data.iloc[:, :3][data.two>5]        # same selection, filtered on 'two' instead

Texas    1
Ohio     5
Utah     9
Name: two, dtype: int32

Unnamed: 0,one,two,three
Ohio,4,5,6
Utah,8,9,10
Maine,12,13,14


Unnamed: 0,one,two,three
Utah,8,9,10
Maine,12,13,14


### arithmetic and data alignment

In [47]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
# The result covers the union of both indexes; labels found in only one operand
# ('d', 'f', 'g') come out as NaN.
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [48]:
# Two frames whose row and column labels overlap only partly.
df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=list('bcd'),
)
df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
df1
df2

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [52]:
df1 + df2                    # union of labels; any cell not present in both frames is NaN
df1.add(df2, fill_value=0)   # a value missing from one frame counts as 0; cells missing from both stay NaN

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,3.0,1.0,6.0,5.0
Oregon,9.0,,10.0,11.0
Texas,9.0,4.0,12.0,8.0
Utah,0.0,,1.0,2.0


In [53]:
# Plain NumPy broadcasting: subtract the first row from every row.
arr = np.arange(12).reshape(3, 4)
arr
arr[0]         # the first row, a 1-D array of four values
arr - arr[0]   # that row is broadcast across all three rows

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

array([0, 1, 2, 3])

array([[0, 0, 0, 0],
       [4, 4, 4, 4],
       [8, 8, 8, 8]])

In [54]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]   # first row ('Utah') as a Series indexed by the column labels
frame
series
frame - series         # match the Series index against the columns and broadcast down the rows

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


### function application and mapping

In [56]:
# NOTE(review): the random generator is not seeded in this cell, so a re-run
# produces different numbers from the recorded outputs here and in the cells below.
frame = pd.DataFrame(np.random.randn(4, 3), index=['Utah', 'Ohio', 'Texas', 'Oregon'],
                     columns=list('bde'))
frame
frame.abs()   # element-wise absolute value; frame itself is not modified

Unnamed: 0,b,d,e
Utah,-0.088741,-0.19155,-1.044666
Ohio,-0.666769,0.044981,0.197224
Texas,-1.071395,0.38867,2.278752
Oregon,0.224987,1.310152,-1.883058


Unnamed: 0,b,d,e
Utah,0.088741,0.19155,1.044666
Ohio,0.666769,0.044981,0.197224
Texas,1.071395,0.38867,2.278752
Oregon,0.224987,1.310152,1.883058


Unnamed: 0,b,d,e
Utah,-0.088741,-0.19155,-1.044666
Ohio,-0.666769,0.044981,0.197224
Texas,-1.071395,0.38867,2.278752
Oregon,0.224987,1.310152,-1.883058


In [58]:
 f = lambda x: x.max() - x.min()
frame.apply(f)
frame.apply(f, axis=1)

b    1.296382
d    1.501702
e    4.161810
dtype: float64

Utah      0.955924
Ohio      0.863993
Texas     3.350147
Oregon    3.193209
dtype: float64

In [60]:
# Rebinds the name f that the previous cell used for the max-minus-min helper.
def f(x):
    """Return the minimum and maximum of x as a Series labelled 'min' and 'max'."""
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

# When the applied function returns a Series, apply assembles a DataFrame.
frame.apply(f)           # one column per original column, rows 'min' and 'max'
frame.apply(f, axis=1)   # one row per original row, columns 'min' and 'max'

Unnamed: 0,b,d,e
min,-1.071395,-0.19155,-1.883058
max,0.224987,1.310152,2.278752


Unnamed: 0,min,max
Utah,-1.044666,-0.088741
Ohio,-0.666769,0.197224
Texas,-1.071395,2.278752
Oregon,-1.883058,1.310152


In [68]:
# Named to_2dp rather than `format`, so the built-in format() is not shadowed
# for every cell that runs afterwards.
def to_2dp(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x

# NOTE: pandas 2.1 renamed DataFrame.applymap to DataFrame.map and deprecated the
# old name; applymap is kept here so the cell still runs on older pandas.
frame.applymap(to_2dp)   # element-wise over the whole frame
frame['e'].map(to_2dp)   # element-wise over a single column

Unnamed: 0,b,d,e
Utah,-0.09,-0.19,-1.04
Ohio,-0.67,0.04,0.2
Texas,-1.07,0.39,2.28
Oregon,0.22,1.31,-1.88


Utah      -1.04
Ohio       0.20
Texas      2.28
Oregon    -1.88
Name: e, dtype: object

In [66]:
# Sanity check on the formatting above; holds only for the random values recorded in this run.
'%.2f' % frame.iloc[0, 0] == '-0.09'

True

### sorting and ranking

In [72]:
obj = pd.Series(range(4), index=list('dabc'))
obj
obj.sort_index()   # new Series ordered by label
obj                # the original order is untouched

d    0
a    1
b    2
c    3
dtype: int32

a    1
b    2
c    3
d    0
dtype: int32

d    0
a    1
b    2
c    3
dtype: int32

In [78]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
frame
frame.sort_index()                          # sort the rows by their labels
frame.sort_index(axis=1)                    # sort the columns by their labels
frame.sort_index(axis=1, ascending=False)   # columns in descending label order

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [82]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj
obj.sort_values()                  # ascending by value; NaN entries are placed last
obj.sort_values(ascending=False)   # descending by value; NaN entries are still placed last

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    2.0
dtype: float64

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

2    7.0
0    4.0
5    2.0
4   -3.0
1    NaN
3    NaN
dtype: float64

In [88]:
frame = pd.DataFrame({'b': [4, 7, -3, 2],
                      'a': [0, 1, 0, 1]})
frame
frame.sort_values(by='b')                                   # sort rows by a single column
frame.sort_values(by=['a', 'b'])                            # sort by 'a', ties broken by 'b'
frame.sort_values(by=['a', 'b'], ascending=False)           # both keys descending
frame.sort_values(by=['a', 'b'], ascending=[False, True])   # 'a' descending, 'b' ascending within each 'a'

Unnamed: 0,a,b
0,0,4
1,1,7
2,0,-3
3,1,2


Unnamed: 0,a,b
2,0,-3
3,1,2
0,0,4
1,1,7


Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


Unnamed: 0,a,b
1,1,7
3,1,2
0,0,4
2,0,-3


Unnamed: 0,a,b
3,1,2
1,1,7
2,0,-3
0,0,4


In [91]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
obj.rank()                  # tied values share the mean of their ranks (e.g. 6.5 for the two 7s)
obj.rank(method='first')    # ties are ranked in order of appearance instead
obj.rank(ascending=False)   # rank 1 goes to the largest value

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

0    1.5
1    7.0
2    1.5
3    3.5
4    5.0
5    6.0
6    3.5
dtype: float64

In [92]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})
frame
frame.rank()         # rank the values within each column
frame.rank(axis=1)   # rank the values within each row, across the columns

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


Unnamed: 0,a,b,c
0,1.5,3.0,2.0
1,3.5,4.0,3.0
2,1.5,1.0,4.0
3,3.5,2.0,1.0


Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


### axis indexes with duplicate values

In [93]:
# Index labels do not have to be unique: 'a' and 'b' each appear twice.
obj = pd.Series(range(5), index=list('aabbc'))
obj

a    0
a    1
b    2
b    3
c    4
dtype: int32

In [94]:
obj.index.is_unique   # False, because the index contains repeated labels

False

In [95]:
obj['a']   # a repeated label returns a Series with every matching entry, not a scalar

a    0
a    1
dtype: int32

In [100]:
# NOTE(review): unseeded random data, so the recorded numbers will not reproduce on a re-run.
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])
df
df.loc['a']   # a repeated row label returns all matching rows as a DataFrame

Unnamed: 0,0,1,2
a,-1.2948,-1.233635,-0.138019
a,1.496162,-0.999489,0.759452
b,-0.680735,-0.90243,0.714947
b,-0.507529,0.08766,0.152221


Unnamed: 0,0,1,2
a,-1.2948,-1.233635,-0.138019
a,1.496162,-0.999489,0.759452


## Summarizing and Computing Descriptive Statistics

In [101]:
# Small frame with missing values, used by the summary-statistics cells below.
df = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [125]:
df.sum()   # column totals; NaN entries are skipped

# The same totals as a one-row frame labelled 'sum'.
a = df.sum().to_frame().T
a.index = ['sum']
a

df.sum(axis=1)                        # row totals; the all-NaN row 'c' sums to 0
df.sum(axis=1).to_frame(name='sum')   # the same totals as a one-column frame

df.mean(axis=1, skipna=False)         # with skipna=False, any NaN in a row makes its mean NaN

one    9.25
two   -5.80
dtype: float64

Unnamed: 0,one,two
sum,9.25,-5.8


a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

Unnamed: 0,sum
a,1.4
b,2.6
c,0.0
d,-0.55


a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [128]:
df.describe()                                 # summary statistics for each column
df.describe().loc[['count', 'mean', 'std']]   # keep only the selected statistic rows

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742


In [142]:
df.idxmin()                  # index label of the smallest value in each column
df.quantile([0.05, 0.1])     # one row per requested quantile, one column per input column

one    d
two    b
dtype: object

Unnamed: 0,one,two
0.05,0.815,-4.34
0.1,0.88,-4.18


### correlation and covariance

### unique values, counts, and membership

In [155]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
a = obj.unique()     # the distinct values of obj
a.sort()             # sorts the array of distinct values in place (nothing is displayed)
obj.value_counts()   # occurrences per value, most frequent first
# The top-level pd.value_counts() function is deprecated (pandas 2.1); the Series
# method gives the same counts. sort=False leaves them unsorted by frequency.
obj.value_counts(sort=False)

c    3
a    3
b    2
d    1
dtype: int64

b    2
a    3
d    1
c    3
dtype: int64

In [156]:
mask = obj.isin(['b', 'c'])   # True where the value is 'b' or 'c'
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

## Moving ahead