In [1]:
# Third-party stack used throughout the notebook, imported in one group.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import sklearn as sl
import statsmodels as sm

# Report the installed version of every library, one line per package, so the
# recorded outputs below can be tied to a concrete environment.
print(
    *(
        template.format(module.__version__)
        for template, module in (
            ("numpy version: {}", np),
            ("pandas version: {}", pd),
            ("matplotlib version: {}", matplotlib),
            ("scipy version: {}", sp),
            ("scikit-learn: {}", sl),
            ("seaborn: {}", sns),
            ("statsmodels: {}", sm),
        )
    ),
    sep="\n",
)

numpy version: 1.17.4
pandas version: 0.25.3
matplotlib version: 3.1.2
scipy version: 1.3.3
scikit-learn: 0.21.3
seaborn: 0.9.0
statsmodels: 0.10.2


## Handling Missing Data

The way that missing data is represented in pandas objects is somewhat imperfect,
but it is functional for a lot of users. For numeric data, pandas uses the floating-point
value NaN (Not a Number) to represent missing data. We call this a sentinel value that
can be easily detected:

In [2]:
# Series of words with one gap; np.nan marks the missing entry.
string_data = pd.Series(
    data=["aardvark", "artichoke", np.nan, "avocado"],
)

In [3]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [4]:
# Element-wise missing-value check; the result is a boolean Series.
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Overwrite the first entry with Python's None to see how isnull() treats it.
string_data[0] = None

In [6]:
# Repeat the check after the assignment: index 0 now holds None.
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

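The main methods for handling missing data are dropna, fillna, isnull, and notnull. You can filter out missing values with dropna, or equivalently with boolean indexing on notnull: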

In [7]:
from numpy import nan as NA

In [8]:
# Float Series with gaps at positions 1 and 3 (NA is numpy's nan).
data = pd.Series([1.0, NA, 3.5, NA, 7.0])

In [9]:
# Keep only the entries that are not missing; their index labels are preserved.
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# The same filtering expressed as boolean indexing with the notnull() mask.
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

With DataFrame objects, things are a bit more complex. You may want to drop rows
or columns that are all NA or only those containing any NAs. dropna by default drops
any row containing a missing value:

In [11]:
# 4x3 frame mixing a complete row, partially missing rows and an all-NA row.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [12]:
# With no arguments, dropna() discards every row that contains at least one NA.
cleaned = data.dropna()

In [13]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing how='all' will only drop rows that are all NA:

In [15]:
# how='all' removes only the rows whose values are all NA (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


To drop columns in the same way, pass axis=1:

In [16]:
# Add a new column labelled 4 that is NA in every row (mutates data).
data[4] = NA

In [17]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [18]:
# axis=1 applies the rule to columns: drop only the columns that are entirely NA.
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


A related way to filter out DataFrame rows tends to concern time series data. Suppose
you want to keep only rows containing a certain number of observations. You can
indicate this with the thresh argument:

In [19]:
# Seed NumPy's global RNG at the notebook's first random draw so that a
# Restart & Run All produces the same numbers here and in later cells that
# draw from np.random.
np.random.seed(12345)
# 7x3 frame of standard-normal samples used for the dropna / fillna demos.
df = pd.DataFrame(np.random.randn(7, 3))

In [20]:
# Set rows 0-3 of the column at position 1 to NA.
df.iloc[:4, 1] = NA

In [21]:
# Set rows 0-1 of the column at position 2 to NA.
df.iloc[:2, 2] = NA

In [22]:
df

Unnamed: 0,0,1,2
0,-0.717128,,
1,0.96399,,
2,-0.577861,,-0.269272
3,0.113432,,-0.160936
4,0.037198,0.519453,0.630641
5,-1.863318,0.393318,0.481553
6,0.397928,0.602605,2.465243


In [23]:
# Default dropna(): only rows with no missing values survive.
df.dropna()

Unnamed: 0,0,1,2
4,0.037198,0.519453,0.630641
5,-1.863318,0.393318,0.481553
6,0.397928,0.602605,2.465243


In [24]:
# thresh=2 keeps the rows that have at least two non-NA values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-0.577861,,-0.269272
3,0.113432,,-0.160936
4,0.037198,0.519453,0.630641
5,-1.863318,0.393318,0.481553
6,0.397928,0.602605,2.465243


### Filling In Missing Data

Rather than filtering out missing data (and potentially discarding other data along
with it), you may want to fill in the “holes” in any number of ways. For most purposes,
the fillna method is the workhorse function to use. Calling fillna with a
constant replaces missing values with that value:

In [25]:
# Fill every missing value with the constant 0; a new object is returned.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.717128,0.0,0.0
1,0.96399,0.0,0.0
2,-0.577861,0.0,-0.269272
3,0.113432,0.0,-0.160936
4,0.037198,0.519453,0.630641
5,-1.863318,0.393318,0.481553
6,0.397928,0.602605,2.465243


Calling fillna with a dict, you can use a different fill value for each column:

In [26]:
# Per-column fill values: NAs in column 1 become 0.5, NAs in column 2 become 0.
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.717128,0.5,0.0
1,0.96399,0.5,0.0
2,-0.577861,0.5,-0.269272
3,0.113432,0.5,-0.160936
4,0.037198,0.519453,0.630641
5,-1.863318,0.393318,0.481553
6,0.397928,0.602605,2.465243


fillna returns a new object, but you can modify the existing object in-place:

In [27]:
# inplace=True makes fillna modify df itself; the call's result is bound to
# the throwaway name `_`.
# NOTE(review): in IPython `_` normally holds the previous cell's output, so
# this assignment shadows it — consider dropping the `_ =`.
_ = df.fillna(0, inplace=True)

In [28]:
df

Unnamed: 0,0,1,2
0,-0.717128,0.0,0.0
1,0.96399,0.0,0.0
2,-0.577861,0.0,-0.269272
3,0.113432,0.0,-0.160936
4,0.037198,0.519453,0.630641
5,-1.863318,0.393318,0.481553
6,0.397928,0.602605,2.465243


The same interpolation methods available for reindexing can be used with fillna:

In [29]:
# Fresh 6x3 frame of standard-normal draws for the forward-fill examples.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [30]:
# From row 2 onward, blank out the column at position 1.
df.iloc[2:, 1] = NA

In [31]:
# From row 4 onward, blank out the column at position 2.
df.iloc[4:, 2] = NA

In [32]:
df

Unnamed: 0,0,1,2
0,0.116656,0.75757,0.208982
1,-0.07544,-0.354168,0.758356
2,-0.035961,,0.642285
3,0.177026,,-2.19672
4,-0.365953,,
5,-0.772393,,


In [33]:
# Forward-fill: carry the last valid observation down each column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in current pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,0.116656,0.75757,0.208982
1,-0.07544,-0.354168,0.758356
2,-0.035961,-0.354168,0.642285
3,0.177026,-0.354168,-2.19672
4,-0.365953,-0.354168,-2.19672
5,-0.772393,-0.354168,-2.19672


In [34]:
# Forward-fill, but fill at most two consecutive NAs in each gap.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...), whose
# `method` argument is deprecated in current pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.116656,0.75757,0.208982
1,-0.07544,-0.354168,0.758356
2,-0.035961,-0.354168,0.642285
3,0.177026,-0.354168,-2.19672
4,-0.365953,,-2.19672
5,-0.772393,,-2.19672


In [35]:
# Float Series with two gaps, used to demonstrate filling with the mean.
data = pd.Series([1.0, NA, 3.5, NA, 7.0])

In [36]:
# Impute the gaps with the mean of the observed (non-NA) values.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## Data Transformation

### Removing Duplicates

Duplicate rows may be found in a DataFrame for any number of reasons.

In [38]:
# Seven rows; the last one repeats the ('two', 4) pair of the row before it.
data = pd.DataFrame(
    {
        "k1": ["one", "two", "one", "two", "one", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [39]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [40]:
# Flag each row that repeats an earlier row in every column.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [41]:
# Row count before and after removing exact duplicate rows.
data_droped = data.drop_duplicates()
print(len(data), len(data_droped), sep="\n")

7
6


In [42]:
# Keep only the rows for which duplicated() is False.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [43]:
# Add a running counter column (0..6) so each row carries a distinct value.
data['v1'] = range(7)

In [44]:
# De-duplicate on k1 only: the first row seen for each k1 value is kept.
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


duplicated and drop_duplicates by default keep the first observed value combination.
Passing keep='last' will return the last one:

In [45]:
# De-duplicate on the (k1, k2) pair, keeping the last occurrence
# (row 6 is kept instead of row 5).
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


In [46]:
# Meat purchases; note the inconsistent capitalisation of the food names.
data = pd.DataFrame(
    {
        "food": [
            "bacon", "pulled pork", "bacon",
            "Pastrami", "corned beef", "Bacon",
            "pastrami", "honey ham", "nova lox",
        ],
        "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
    }
)

In [47]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [49]:
# Lookup table from lowercase meat name to the animal it comes from.
meat_to_animal = dict(
    [
        ("bacon", "pig"),
        ("pulled pork", "pig"),
        ("pastrami", "cow"),
        ("corned beef", "cow"),
        ("honey ham", "pig"),
        ("nova lox", "salmon"),
    ]
)

In [50]:
# Normalise case first so that 'Bacon' and 'Pastrami' match the lowercase dict keys.
lowercased = data['food'].str.lower()

In [51]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [52]:
# Translate each lowercase food name through the lookup dict and store the
# result as a new column.
data['animal'] = lowercased.map(meat_to_animal)

In [53]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [54]:
# The same mapping in one step, passing a function instead of a dict.
# The direct dict lookup raises KeyError for a food name absent from meat_to_animal.
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [55]:
# Measurements in which -999 and -1000 appear among otherwise small values.
data = pd.Series([1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])

In [56]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

The -999 values might be sentinel values for missing data. To replace these with NA
values that pandas understands, we can use replace, producing a new Series (unless
you pass inplace=True):

In [57]:
# Swap the value -999 for NaN; a new Series is returned.
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [58]:
# Several values at once: both -999 and -1000 become NaN.
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [59]:
# Pairwise replacement lists: -999 -> NaN, -1000 -> 0.
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [60]:
# The same pairwise replacement written as a dict of old -> new values.
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [61]:
# 3x4 integer frame (values 0..11) indexed by state name.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=["Ohio", "Colorado", "New York"],
    columns=["one", "two", "three", "four"],
)

In [62]:
def transform(x):
    """Return the first four characters of ``x`` in upper case.

    Strings shorter than four characters are upper-cased whole.
    """
    return x[:4].upper()

In [63]:
# Apply transform to every index label; the result is a new Index.
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [64]:
# Assign the transformed labels back, replacing data's index.
data.index = data.index.map(transform)

In [65]:
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a dataset without modifying the original,
a useful method is rename:

In [66]:
# rename returns a relabelled version of data; each dict maps only the listed
# labels (here one index label and one column name).
data.rename(index={'OHIO': 'INDIANA'},
           columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [67]:
# inplace=True applies the index rename to data itself.
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

In [68]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

In [69]:
# Ages of twelve people, to be grouped into discrete age bins.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]

Let’s divide these into bins of 18 to 25, 26 to 35, 36 to 60, and finally 61 and older. To
do so, you have to use cut, a function in pandas:

In [70]:
# Bin edges: five edges define the four age ranges.
bins = [
    18,
    25,
    35,
    60,
    100,
]

In [71]:
# Assign each age to its bin; the intervals are open on the left and closed
# on the right.
cats = pd.cut(ages, bins)

In [72]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [73]:
# Integer bin index of each age (0 is the first bin).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [74]:
# The interval objects that serve as category labels, in order.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

Calling value_counts on the result of pandas.cut gives the number of values that fall in each bin:

In [75]:
# Number of ages per bin, most populated bin first.
# The top-level pd.value_counts() is deprecated in current pandas; wrapping
# the Categorical in a Series and calling .value_counts() is the supported
# spelling and keeps the count-sorted result.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [76]:
# right=False flips the closed side: bins become [left, right). The edges are
# shifted by one so these integer ages land in the same groups as before.
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [77]:
# Human-readable names for the four age bins, youngest to oldest.
group_names = [
    "Youth",
    "YoungAdult",
    "MiddleAged",
    "Senior",
]

In [78]:
# Use the given names instead of interval notation as the category labels.
pd.cut(ages, bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [79]:
# Twenty draws from the uniform distribution on [0, 1).
data = np.random.random_sample(20)

If you pass an integer number of bins to cut instead of explicit bin edges, it will compute
equal-length bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:

In [80]:
# An integer asks cut for that many equal-width bins spanning the data's
# min..max; precision=2 limits the decimal precision of the bin labels.
pd.cut(data, 4, precision=2)

[(0.26, 0.49], (0.019, 0.26], (0.73, 0.97], (0.73, 0.97], (0.49, 0.73], ..., (0.73, 0.97], (0.26, 0.49], (0.73, 0.97], (0.019, 0.26], (0.26, 0.49]]
Length: 20
Categories (4, interval[float64]): [(0.019, 0.26] < (0.26, 0.49] < (0.49, 0.73] < (0.73, 0.97]]

A closely related function, qcut, bins the data based on sample quantiles. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equal-size bins:

In [81]:
# 1000 samples from the standard normal distribution.
data = np.random.standard_normal(1000)

In [82]:
# Cut into quartiles: four bins whose edges are taken from sample quantiles.
cats = pd.qcut(data, 4)

In [83]:
cats

[(0.679, 2.681], (-2.941, -0.632], (-2.941, -0.632], (0.679, 2.681], (-2.941, -0.632], ..., (-0.632, 0.0118], (-2.941, -0.632], (0.679, 2.681], (-0.632, 0.0118], (-2.941, -0.632]]
Length: 1000
Categories (4, interval[float64]): [(-2.941, -0.632] < (-0.632, 0.0118] < (0.0118, 0.679] < (0.679, 2.681]]

In [84]:
# Number of samples per quantile bin.
# The top-level pd.value_counts() is deprecated in current pandas; wrapping
# the Categorical in a Series and calling .value_counts() is the supported
# spelling and keeps the count-sorted result.
pd.Series(cats).value_counts()

(0.679, 2.681]      250
(0.0118, 0.679]     250
(-0.632, 0.0118]    250
(-2.941, -0.632]    250
dtype: int64

In [85]:
# Custom quantile edges between 0 and 1: bottom 10%, 10-50%, 50-90%, top 10%.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])

[(1.262, 2.681], (-1.256, 0.0118], (-1.256, 0.0118], (0.0118, 1.262], (-1.256, 0.0118], ..., (-1.256, 0.0118], (-1.256, 0.0118], (0.0118, 1.262], (-1.256, 0.0118], (-2.941, -1.256]]
Length: 1000
Categories (4, interval[float64]): [(-2.941, -1.256] < (-1.256, 0.0118] < (0.0118, 1.262] < (1.262, 2.681]]

In [86]:
# 1000x4 frame of standard-normal samples for the outlier examples.
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))

In [87]:
# Per-column summary statistics (count, mean, std, quartiles, min and max).
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.007172,0.023018,-0.037487,0.02338
std,0.980958,1.015843,1.011829,1.010441
min,-2.977898,-2.979676,-3.203186,-2.998162
25%,-0.681876,-0.678612,-0.709662,-0.677422
50%,0.014419,-0.004998,-0.07046,0.005868
75%,0.619752,0.709789,0.605668,0.689957
max,4.001775,2.859657,3.57322,3.405977


In [88]:
# Select the column labelled 2 as a Series.
col = data[2]

In [89]:
# Values in that column whose magnitude exceeds 3.
col[np.abs(col) > 3]

205   -3.203186
229    3.573220
511    3.191844
936    3.152750
Name: 2, dtype: float64

In [90]:
# Rows in which at least one column has a magnitude above 3.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally
# for DataFrame.any, so the former `.any(1)` fails there.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
205,2.01681,-0.142657,-3.203186,-1.369253
229,0.119754,-0.279929,3.57322,0.399163
511,-0.742608,-0.103785,3.191844,1.091967
584,-0.336692,-1.449178,-0.402841,3.02196
637,4.001775,0.692367,0.50538,0.283948
664,-0.942337,-0.485765,-1.344024,3.405977
734,3.468797,-0.490506,0.975213,0.191443
936,-0.150224,1.591857,3.15275,-0.233494


In [91]:
# Cap outliers in place: wherever |value| > 3, overwrite it with +3 or -3
# according to its sign.
data[np.abs(data) > 3] = np.sign(data) * 3

In [92]:
# Re-summarise after capping: min and max now lie within [-3, 3].
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.005701,0.023018,-0.038202,0.022952
std,0.975829,1.015843,1.008256,1.009096
min,-2.977898,-2.979676,-3.0,-2.998162
25%,-0.681876,-0.678612,-0.709662,-0.677422
50%,0.014419,-0.004998,-0.07046,0.005868
75%,0.619752,0.709789,0.605668,0.689957
max,3.0,2.859657,3.0,3.0


In [93]:
# np.sign maps each value to its sign (-1.0 or 1.0 here); show the first five rows.
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,1.0,-1.0,1.0
1,1.0,1.0,-1.0,-1.0
2,-1.0,1.0,-1.0,1.0
3,1.0,1.0,1.0,-1.0
4,1.0,1.0,1.0,-1.0


In [95]:
# 5x4 frame holding the integers 0..19 in row-major order.
df = pd.DataFrame(np.arange(5 * 4).reshape(5, 4))

In [97]:
# Random ordering of the integers 0..4, used below as row positions.
sampler = np.random.permutation(5)

In [98]:
sampler

array([1, 4, 3, 2, 0])

In [99]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


That array can then be used in iloc-based indexing or the equivalent take function:

In [100]:
# Select the rows by position, in the permuted order.
df.take(sampler)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15
2,8,9,10,11
0,0,1,2,3


In [101]:
# Draw three rows at random (no replace=True here, so no row repeats).
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
2,8,9,10,11


To generate a sample with replacement (to allow repeat choices), pass replace=True
to sample:

In [102]:
# Five candidate values to draw from.
choices = pd.Series(
    data=[5, 7, -1, 6, 4],
)

In [103]:
# Ten draws from a five-element Series, so replace=True is passed to allow repeats.
draws = choices.sample(n=10, replace=True)

In [104]:
draws

3    6
4    4
2   -1
4    4
1    7
3    6
2   -1
1    7
4    4
0    5
dtype: int64