# Python Data Science Handbook

## 3. Data Manipulation with Pandas

In [1]:
import pandas
pandas.__version__

'1.4.4'

In [3]:
import pandas as pd

In [4]:
pd?

Type:        module
String form: <module 'pandas' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\pandas\\__init__.py'>
File:        c:\programdata\anaconda3\lib\site-packages\pandas\__init__.py
Docstring:
pandas - a powerful data analysis and manipulation library for Python

**pandas** is a Python package providing fast, flexible, and expressive data
structures designed to make working with "relational" or "labeled" data both
easy and intuitive. It aims to be the fundamental high-level building block for
doing practical, **real world** data analysis in Python. Additionally, it has
the broader goal of becoming **the most powerful and flexible open source data
analysis / manipulation tool available in any language**. It is already well on
its way toward this goal.

Main Features
-------------
Here are just a few of the things that pandas does well:

  - Easy handling of missing data in floating point as well as non-floating
    point data

### Introducing Pandas Objects

In [5]:
import numpy as np
import pandas as pd

### The Pandas Series Object

In [6]:
# A Series built from a plain list; no index is given, so pandas supplies one.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [7]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [8]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [9]:
data[1]

0.5

In [10]:
data[1:3]

1    0.50
2    0.75
dtype: float64

### Series as Generalized NumPy Array

In [11]:
# Same values as before, but labelled with an explicit string index.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [12]:
data['b']

0.5

In [13]:
# Index labels may be integers that are neither sorted nor contiguous.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=[2, 5, 3, 7],
)
data

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [14]:
data[5]

0.5

### Series as Specialized Dictionary

In [15]:
# State name -> population; the dict keys become the Series index.
population_dict = {
    'California': 39538223,
    'Texas': 29145505,
    'Florida': 21538187,
    'New York': 20201249,
    'Pennsylvania': 13002700,
}
population = pd.Series(data=population_dict)

population

California      39538223
Texas           29145505
Florida         21538187
New York        20201249
Pennsylvania    13002700
dtype: int64

In [16]:
population['California']

39538223

In [18]:
population['California':'Florida']

California    39538223
Texas         29145505
Florida       21538187
dtype: int64

### Constructing Series Objects

In [20]:
pd.Series([2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [22]:
pd.Series(5, index = [100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [23]:
pd.Series({2:'a', 1:'b', 3:'c'})

2    a
1    b
3    c
dtype: object

In [24]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[1, 2])

1    b
2    a
dtype: object

### DataFrame as Generalized NumPy Array

In [25]:
# State name -> area; built the same way as the population Series.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'Florida': 170312,
    'New York': 141297,
    'Pennsylvania': 119280,
}

area = pd.Series(data=area_dict)

area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
dtype: int64

In [27]:
# Each Series becomes one column; the column names are the dict keys.
states = pd.DataFrame(data=dict(population=population, area=area))
states

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [28]:
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [29]:
states.columns

Index(['population', 'area'], dtype='object')

### DataFrame as Specialized Dictionary

In [30]:
states['area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

### Constructing DataFrame Objects

In [31]:
pd.DataFrame(population, columns = ['population'])

Unnamed: 0,population
California,39538223
Texas,29145505
Florida,21538187
New York,20201249
Pennsylvania,13002700


In [32]:
# A list of records (one dict per row) is turned into a DataFrame.
data = [dict(a=n, b=2 * n) for n in range(3)]

pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [33]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [34]:
pd.DataFrame({'population': population,
              'area': area})

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [35]:
# Build a 3x2 table of random floats in [0, 1) with named columns and rows.
# A dedicated, seeded generator is used so the table is identical on every
# run; the unseeded np.random.rand call produced different values each time.
# Re-run this cell to refresh the displayed output.
pd.DataFrame(np.random.default_rng(42).random((3, 2)),
             columns = ['foo', 'bar'],
             index = ['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.385912,0.185063
b,0.620482,0.206212
c,0.550815,0.447333


In [37]:
# Structured array: three zero-filled records with an int64 field 'A' and a float64 field 'B'.
A = np.zeros(3, dtype=np.dtype([('A', 'i8'), ('B', 'f8')]))
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [38]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


### The Pandas Index Object

In [39]:
# An Index can be constructed directly from a list of integers.
ind = pd.Index(data=[2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

### Index as Immutable Array

In [41]:
ind[1]

3

In [42]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [43]:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


### Index as Ordered Set

In [45]:
# Two overlapping indices used for the set operations that follow.
indA, indB = pd.Index([1, 3, 5, 7, 9]), pd.Index([2, 3, 5, 7, 11])

In [46]:
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [47]:
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [48]:
indA.symmetric_difference(indB)

Int64Index([1, 2, 9, 11], dtype='int64')

### Series as Dictionary

In [1]:
import pandas as pd

In [4]:
# Label -> value mapping used to show the dictionary-like side of a Series.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)

data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [5]:
data['b']

0.5

In [6]:
'a' in data

True

In [7]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [9]:
# Assigning to a label that is not yet in the index appends a new entry,
# the same way assigning to a new key extends a dict.
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### Series as One-Dimensional Array

In [11]:
# Slicing by explicit index: with labels, the stop label 'c' is included in the result
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [12]:
# Slicing by implicit integer index: positions 0 and 1 are returned,
# the stop position 2 is excluded
data[0:2]

a    0.25
b    0.50
dtype: float64

In [13]:
# Boolean mask: keep only the entries strictly between 0.3 and 0.8
data[(0.3 < data) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [14]:
# Fancy indexing: a list of labels selects exactly those entries
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

### Indexers: <code>loc</code> and <code>iloc</code>

In [15]:
# The index here holds string labels ('a'..'e'), so the integer key is treated
# as a position and this returns the second element (0.5).
# NOTE(review): newer pandas releases deprecate this positional fallback for
# integer keys on a non-integer index; `data.iloc[1]` is the explicit spelling.
# Confirm against the pandas version this notebook targets.
data[1]

0.5

In [16]:
data[1:3]

b    0.50
c    0.75
dtype: float64

In [17]:
# Integer labels that differ from the positions 0, 1, 2, which makes the
# label-vs-position ambiguity visible in the cells that follow.
data = pd.Series(data=list('abc'), index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [18]:
# Indexing with a single integer uses the explicit index: 1 is looked up as a
# label, which returns 'a' (not the element at position 1)
data[1]

'a'

In [21]:
data[3]

'b'

In [19]:
# Slicing with integers uses the implicit positional index: 1:3 selects
# positions 1 and 2, i.e. the entries labelled 3 and 5
data[1:3]

3    b
5    c
dtype: object

In [22]:
data.loc[1]

'a'

In [23]:
data.loc[1:3]

1    a
3    b
dtype: object

In [24]:
data.iloc[1]

'b'

In [25]:
data.iloc[1:3]

3    b
5    c
dtype: object

### DataFrame as Dictionary

In [26]:
# Per-state area and population, combined column-wise into one DataFrame.
area = pd.Series({
    'California': 423967,
    'Texas': 695662,
    'Florida': 170312,
    'New York': 141297,
    'Pennsylvania': 119280,
})

pop = pd.Series({
    'California': 39538223,
    'Texas': 29145505,
    'Florida': 21538187,
    'New York': 20201249,
    'Pennsylvania': 13002700,
})

data = pd.DataFrame(data=dict(area=area, pop=pop))

data

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187
New York,141297,20201249
Pennsylvania,119280,13002700


In [27]:
data['area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [28]:
data.area

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [29]:
data.pop is data['pop']

False

In [30]:
# Add a derived column: population divided by area, computed element-wise.
data['density'] = data['pop'].div(data['area'])
data

Unnamed: 0,area,pop,density
California,423967,39538223,93.257784
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


### DataFrame as Two-Dimensional Array

In [31]:
data.values

array([[4.23967000e+05, 3.95382230e+07, 9.32577842e+01],
       [6.95662000e+05, 2.91455050e+07, 4.18960717e+01],
       [1.70312000e+05, 2.15381870e+07, 1.26463121e+02],
       [1.41297000e+05, 2.02012490e+07, 1.42970120e+02],
       [1.19280000e+05, 1.30027000e+07, 1.09009893e+02]])

In [32]:
data.T

Unnamed: 0,California,Texas,Florida,New York,Pennsylvania
area,423967.0,695662.0,170312.0,141297.0,119280.0
pop,39538220.0,29145500.0,21538190.0,20201250.0,13002700.0
density,93.25778,41.89607,126.4631,142.9701,109.0099


In [33]:
data.values[0]

array([4.23967000e+05, 3.95382230e+07, 9.32577842e+01])

In [34]:
data['area']

California      423967
Texas           695662
Florida         170312
New York        141297
Pennsylvania    119280
Name: area, dtype: int64

In [35]:
data.iloc[:3, :2]

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [36]:
data.loc[:'Florida', :'pop']

Unnamed: 0,area,pop
California,423967,39538223
Texas,695662,29145505
Florida,170312,21538187


In [37]:
data.loc[data.density > 120, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,21538187,126.463121
New York,20201249,142.97012


In [40]:
# Positional assignment: row 0, column 2 is California's density in this frame.
# The value is written into `data` itself, replacing the computed density.
data.iloc[0, 2] = 90
data

Unnamed: 0,area,pop,density
California,423967,39538223,90.0
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012
Pennsylvania,119280,13002700,109.009893


### Additional Indexing Conventions

In [41]:
data['Florida':'New York']

Unnamed: 0,area,pop,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


In [42]:
data[1:3]

Unnamed: 0,area,pop,density
Texas,695662,29145505,41.896072
Florida,170312,21538187,126.463121


In [43]:
data[data.density > 120]

Unnamed: 0,area,pop,density
Florida,170312,21538187,126.463121
New York,141297,20201249,142.97012


### Ufuncs: Index Preservation

In [44]:
import pandas as pd
import numpy as np

In [45]:
# Seeded generator so the random examples below are repeatable.
rng = np.random.default_rng(seed=42)
# Four random integers drawn from 0..9.
ser = pd.Series(rng.integers(low=0, high=10, size=4))
ser

0    0
1    7
2    6
3    4
dtype: int64

In [47]:
# 3x4 table of random integers from 0..9 with lettered columns.
df = pd.DataFrame(
    data=rng.integers(0, 10, size=(3, 4)),
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
0,4,8,0,6
1,2,0,5,9
2,7,7,7,7


In [48]:
np.exp(ser)

0       1.000000
1    1096.633158
2     403.428793
3      54.598150
dtype: float64

In [49]:
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,1.224647e-16,-2.449294e-16,0.0,-1.0
1,1.0,0.0,-0.707107,0.707107
2,-0.7071068,-0.7071068,-0.707107,-0.707107


### Index Alignment in Series

In [50]:
# Two named Series whose indices only partly overlap (Alaska is only in
# `area`, Florida only in `population`).
area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)

population = pd.Series(
    {'California': 39538223, 'Texas': 29145505, 'Florida': 21538187},
    name='population',
)

In [51]:
population / area

Alaska              NaN
California    93.257784
Florida             NaN
Texas         41.896072
dtype: float64

In [52]:
area.index.union(population.index)

Index(['Alaska', 'California', 'Florida', 'Texas'], dtype='object')

In [53]:
# Indices overlap only on labels 1 and 2; the sum is aligned on the union of both.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [54]:
A.add(B, fill_value = 0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

### Index Alignment in DataFrames

In [56]:
# 2x2 frame of random integers from 0..19 with columns 'a' and 'b'.
A = pd.DataFrame(
    data=rng.integers(0, 20, size=(2, 2)),
    columns=list('ab'),
)
A

Unnamed: 0,a,b
0,10,2
1,16,9


In [57]:
# 3x3 frame of random integers from 0..9; column order is deliberately 'b', 'a', 'c'.
B = pd.DataFrame(
    data=rng.integers(0, 10, size=(3, 3)),
    columns=list('bac'),
)
B

Unnamed: 0,b,a,c
0,5,3,1
1,9,7,6
2,4,8,5


In [58]:
A + B

Unnamed: 0,a,b,c
0,13.0,7.0,
1,23.0,18.0,
2,,,


In [60]:
A.add(B, fill_value = A.values.mean())

Unnamed: 0,a,b,c
0,13.0,7.0,10.25
1,23.0,18.0,15.25
2,17.25,13.25,14.25


### Ufuncs: Operations Between DataFrames and Series

In [61]:
# Plain 3x4 NumPy array of random integers from 0..9.
A = rng.integers(low=0, high=10, size=(3, 4))
A

array([[4, 4, 2, 0],
       [5, 8, 0, 8],
       [8, 2, 6, 1]], dtype=int64)

In [62]:
A - A[0]

array([[ 0,  0,  0,  0],
       [ 1,  4, -2,  8],
       [ 4, -2,  4,  1]], dtype=int64)

In [63]:
# Wrap the array in a DataFrame, then subtract the first row from every row.
df = pd.DataFrame(data=A, columns=list('QRST'))
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,1,4,-2,8
2,4,-2,4,1


In [64]:
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,0,0,-2,-4
1,-3,0,-8,0
2,6,0,4,-1


In [65]:
# First row, every second column.
halfrow = df.iloc[0, slice(None, None, 2)]
halfrow

Q    4
S    2
Name: 0, dtype: int64

In [66]:
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,1.0,,-2.0,
2,4.0,,4.0,


### Missing Data in Pandas

In [1]:
import numpy as np
import pandas as pd

### <code>None</code> as a Sentinel Value

In [2]:
# Mixing None with integers yields an array of generic Python objects.
vals1 = np.asarray([1, None, 2, 3])
vals1

array([1, None, 2, 3], dtype=object)

In [3]:
%timeit np.arange(1E6, dtype = int).sum()

1.94 ms ± 30.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [4]:
%timeit np.arange(1E6, dtype = object).sum()

53.2 ms ± 538 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Summing an object array that contains None fails. The exception is caught
# and printed so the failure is shown without stopping a Restart & Run All.
try:
    vals1.sum()
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

### <code>NaN</code>: Missing Numerical Data

In [7]:
# With NaN as the missing-value marker the array keeps a native float dtype.
vals2 = np.asarray([1, np.nan, 3, 4])
vals2

array([ 1., nan,  3.,  4.])

In [8]:
1 + np.nan

nan

In [9]:
0 * np.nan

nan

In [10]:
10 * np.nan

nan

In [11]:
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [12]:
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

### <code>NaN</code> and <code>None</code> in Pandas

In [13]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [14]:
pd.Series([1, 2, None])

0    1.0
1    2.0
2    NaN
dtype: float64

In [15]:
pd.Series([1, 2, np.nan])

0    1.0
1    2.0
2    NaN
dtype: float64

In [16]:
# Integer-typed Series. NOTE(review): the recorded output shows int32, so the
# width behind `int` presumably depends on the platform. Confirm if it matters.
x = pd.Series(data=range(2), dtype=int)
x

0    0
1    1
dtype: int32

In [17]:
# Assigning None into the integer Series: as the output shows, the Series is
# upcast to float64 and the None is stored as NaN.
x[0] = None
x

0    NaN
1    1.0
dtype: float64

In [22]:
# Object-typed Series of strings.
x2 = pd.Series(data=list('ab'), dtype='object')
x2

0    a
1    b
dtype: object

In [24]:
# Assigning None into the object Series: the dtype stays object and the
# output shows the None stored as-is.
x2[0] = None
x2

0    None
1       b
dtype: object

In [25]:
# Float-typed Series.
x3 = pd.Series(data=[1.1, 1.2], dtype='float64')
x3

0    1.1
1    1.2
dtype: float64

In [26]:
# Assigning None into the float Series: the dtype stays float64 and the
# None is stored as NaN, as the output shows.
x3[0] = None
x3

0    NaN
1    1.2
dtype: float64

In [27]:
# Boolean Series built from 0/1 values.
x4 = pd.Series(data=[0, 0, 1, 1], dtype='bool')
x4

0    False
1    False
2     True
3     True
dtype: bool

In [28]:
# Assigning None into the boolean Series: the output shows the Series upcast
# to object dtype with the missing entry displayed as NaN.
x4[0] = None
x4

0      NaN
1    False
2     True
3     True
dtype: object

### Pandas Nullable Dtypes

In [30]:
pd.Series([1, np.nan, 2, None, pd.NA], dtype = 'Int32')

0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [35]:
pd.Series([1, np.nan, 2, None, pd.NA])

0       1
1     NaN
2       2
3    None
4    <NA>
dtype: object