* 创建类别型变量
    - pd.Series([], `dtype = 'category'`)
    - pd.DataFrame(, `dtype = 'category'`)
    - `pd.Series.astype('category')`
    - `pd.Series.astype(pd.api.types.CategoricalDtype(categories, ordered))`
        - `pd.Series.astype(str)`
        - `np.asarray(Series)`
    - `pd.DataFrame.astype('category')`
    - pd.cut(x, bins, right, `labels`)
* 类别型对象
    - object
        - `pd.Categorical([], categories, ordered)`
        - `pd.Categorical.from_codes([], categories, ordered)`
* 类CategoricalDtype
    - instance
        - `pd.api.types.CategoricalDtype(categories, ordered)`
        - attr
            - `pd.Series.cat.categories`
            - `pd.Series.cat.ordered`
* 描述统计
    - pd.DataFrame.describe()
* 属性
    - pd.Series.cat.categories
    - pd.Series.cat.ordered
* 类别重命名
    - pd.Series.cat.categories = []
    - pd.Series.cat.rename_categories([])
    - pd.Series.cat.rename_categories({'':, '':, ...})
* 增加新类别
    - pd.Series.cat.add_categories([])
* 删除类别
    - pd.Series.cat.remove_categories([])
* 删除无用类别
    - pd.Series.cat.remove_unused_categories()
* 设置类别
    - pd.Series.cat.set_categories([])
* 有序类别型变量
    - pd.Categorical([], ordered = True)
    - pd.Series.astype(pd.api.types.CategoricalDtype(ordered = True))
    - pd.Series.cat.as_ordered()
    - pd.Series.cat.as_unordered()
    - pd.Series.cat.set_categories(categories = [], ordered = True)
    - pd.Series.cat.reorder_categories(categories = [], ordered = True)
* 比较
* 操作
    - pd.Series.value_counts()
    - df.groupby().mean()
* pd.Series.cat.codes

In [1]:
import numpy as np
import pandas as pd

# 1. 创建类别型Series 或 DataFrame列

### `pd.Series(, dtype = 'category')`

In [194]:
# pd.Series([], dtype = 'category')
s = pd.Series(['a', 'b', 'c', 'a'], dtype = 'category')
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

Index(['a', 'b', 'c'], dtype='object')

False


### `Series.astype('category')`

In [198]:
# .astype('category')
df = pd.DataFrame({
    'A': ['a', 'b', 'c', 'a']
})
df['B'] = df['A'].astype('category')
print(df)
print()
print(df['B'])
print()
print(df['B'].cat.categories)
print()
print(df['B'].cat.ordered)

   A  B
0  a  a
1  b  b
2  c  c
3  a  a

0    a
1    b
2    c
3    a
Name: B, dtype: category
Categories (3, object): [a, b, c]

Index(['a', 'b', 'c'], dtype='object')

False


### `pd.cut()`

In [202]:
# pd.cut()
df = pd.DataFrame({
    'value': np.random.randint(0, 100, 20)
})
labels = ['{0} - {1}'.format(i, i + 9) for i in range(0, 100, 10)]
df['group'] = pd.cut(x = df.value, 
                     bins = range(0, 105, 10), right = False, 
                     labels = labels)
print(df.head())
print()
print(df.dtypes)
print()
print(df['group'])
print()
print(df['group'].cat.categories)
print()
print(df['group'].cat.ordered)

   value    group
0     90  90 - 99
1     25  20 - 29
2     70  70 - 79
3     60  60 - 69
4     79  70 - 79

value       int32
group    category
dtype: object

0     90 - 99
1     20 - 29
2     70 - 79
3     60 - 69
4     70 - 79
5     90 - 99
6     40 - 49
7     40 - 49
8     30 - 39
9     40 - 49
10    30 - 39
11    80 - 89
12    20 - 29
13    70 - 79
14    60 - 69
15    40 - 49
16    20 - 29
17    20 - 29
18      0 - 9
19    60 - 69
Name: group, dtype: category
Categories (10, object): [0 - 9 < 10 - 19 < 20 - 29 < 30 - 39 ... 60 - 69 < 70 - 79 < 80 - 89 < 90 - 99]

Index(['0 - 9', '10 - 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59',
       '60 - 69', '70 - 79', '80 - 89', '90 - 99'],
      dtype='object')

True


### `pd.Categorical(, categories, ordered)`

In [203]:
# pd.Categorical(data, categories, ordered)
raw_cat_non_ordered = pd.Categorical(['a', 'b', 'c', 'a'], 
                                     categories = ['b', 'c', 'd'],
                                     ordered = False)
raw_cat_ordered = pd.Categorical(['a', 'b', 'c', 'd'], 
                                 categories = ['b', 'c', 'd'],
                                 ordered = True)
print(raw_cat_non_ordered)
print()
print(raw_cat_ordered)

[NaN, b, c, NaN]
Categories (3, object): [b, c, d]

[NaN, b, c, d]
Categories (3, object): [b < c < d]


In [207]:
s_non_ordered = pd.Series(raw_cat_non_ordered)
s_non_ordered

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

In [208]:
s_ordered = pd.Series(raw_cat_ordered)
s_ordered

0    NaN
1      b
2      c
3      d
dtype: category
Categories (3, object): [b < c < d]

In [209]:
df = pd.DataFrame({
    'A': ['a', 'b', 'c', 'a']
})
df['B'] = raw_cat
df.dtypes

A      object
B    category
dtype: object

### `pd.DataFrame(, dtype = 'category')`

In [210]:
df = pd.DataFrame({'A': list('abca'),
                   'B': list('bccd')}, 
                  dtype = 'category')
df.dtypes

A    category
B    category
dtype: object

### `pd.DataFrame.astype('category')`

In [39]:
df = pd.DataFrame({
    'A': list('abca'),
    'B': list('bccd')
})
df_cat = df.astype('category')
df_cat.dtypes

A    category
B    category
dtype: object

### cd  = `pd.api.types.CategoricalDtype`

* .astype(cd)
* .cat.categories
* .cat.ordered

In [40]:
from  pandas.api.types import CategoricalDtype

In [41]:
s = pd.Series(['a', 'b', 'c', 'a'])
cat_type = CategoricalDtype(categories = ['b', 'c', 'd'], 
                            ordered = True)
s_cat = s.astype(cat_type)
s_cat

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b < c < d]

In [214]:
df = pd.DataFrame({
    'A': list('abca'), 
    'B': list('bccd')
})
cat_type = CategoricalDtype(categories = list('abcd'), 
                            ordered = True)
df_cat = df.astype(cat_type)
df_cat

Unnamed: 0,A,B
0,a,b
1,b,c
2,c,c
3,a,d


### `pd.Categorical.from_codes(, categories, ordered)`

In [219]:
splitter = np.random.choice([0, 1], 5, p = [0.5, 0.5])
cat_code = pd.Categorical.from_codes(splitter, 
                                     categories = ['train', 'test'],
                                     ordered = True)
splitter_cat = pd.Series(cat_code)
splitter_cat

0    train
1     test
2     test
3     test
4     test
dtype: category
Categories (2, object): [train < test]

#### `.astype('category'), .astype(str), np.asarray()`

In [221]:
s = pd.Series(['a', 'b', 'c', 'a'])
print(s.astype('category'))
print()
print(s2.astype(str))
print()
print(np.asarray(s2))

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

0    a
1    b
2    c
3    a
dtype: object

['a' 'b' 'c' 'a']


# 2. CategoricalDtype

In [48]:
from pandas.api.types import CategoricalDtype

In [49]:
CategoricalDtype(categories = ['a', 'b', 'c'])

CategoricalDtype(categories=['a', 'b', 'c'], ordered=None)

In [50]:
CategoricalDtype(categories = ['a', 'b', 'c'], ordered = True)

CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

In [51]:
CategoricalDtype()

CategoricalDtype(categories=None, ordered=None)

In [52]:
c1 = CategoricalDtype(['a', 'b', 'c'], ordered = False)
c2 = CategoricalDtype(['b', 'c', 'a'], ordered = False)
c1 == c2

True

In [53]:
c3 = CategoricalDtype(['a', 'b', 'c'], ordered = True)
c3 == c1

False

In [54]:
c1 == 'category'

True

# 3. 描述性统计

In [228]:
cat = pd.Categorical(['a', 'c', 'c', np.nan], 
                     categories = ['b', 'a', 'c'])
df =  pd.DataFrame({
    'cat': cat,
    's': ['a', 'c', 'c', np.nan]
})
print(df.describe())
print()
print(df['cat'].describe())

       cat  s
count    3  3
unique   2  2
top      c  c
freq     2  2

count     3
unique    2
top       c
freq      2
Name: cat, dtype: object


# 4. Working with categories

* s.cat.categories
* s.cat.ordered
* 类别重命名
    - s.cat.categories = []
    - s.cat.rename_categories([])
    - s.cat.rename_categories({'':, '':, ...})
* 增加新类别
    - s.cat.add_categories([])
* 删除类别
    - s.cat.remove_categories([])
* 删除无用类别
    - s.cat.remove_unused_categories()
* 设置类别
    - s.cat.set_categories([])

### 4.1 属性

* s.cat.categories
* s.cat.ordered

In [257]:
s = pd.Series(['a', 'b', 'c', 'a'], dtype = 'category')
print(s.cat.categories)
print()
print(s.cat.ordered)

Index(['a', 'b', 'c'], dtype='object')

False


In [258]:
s = pd.Series(pd.Categorical(['a', 'b', 'c', 'a'], 
                             categories = ['c', 'b', 'a']))
print(s.cat.categories)
print()
print(s.cat.ordered)

Index(['c', 'b', 'a'], dtype='object')

False


In [259]:
cat_type = CategoricalDtype(categories = list('abcd'), 
                            ordered = False)
s = pd.Series(list('babc')).astype(cat_type)
print(s.cat.categories)
print()
print(s.cat.ordered)
print()
print(s.unique())

Index(['a', 'b', 'c', 'd'], dtype='object')

False

[b, a, c]
Categories (3, object): [b, a, c]


### 4.2 Rename categories

* s.cat.categories = []
* s.cat.rename_categories([])
* s.cat.rename_categories({'':, '':, ...})

In [260]:
s = pd.Series(['a', 'b', 'c', 'a'], dtype = 'category')
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

Index(['a', 'b', 'c'], dtype='object')

False


In [261]:
s.cat.categories = ['Group %s' % g for g in s.cat.categories]
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): [Group a, Group b, Group c]

Index(['Group a', 'Group b', 'Group c'], dtype='object')

False


In [262]:
s = s.cat.rename_categories([1, 2, 3])
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1, 2, 3]

Int64Index([1, 2, 3], dtype='int64')

False


In [263]:
s = s.cat.rename_categories({
    1: 'x',
    2: 'y',
    3: 'z'
})
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): [x, y, z]

Index(['x', 'y', 'z'], dtype='object')

False


### 4.3 增加新类别

In [264]:
s = s.cat.add_categories([4])
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    x
1    y
2    z
3    x
dtype: category
Categories (4, object): [x, y, z, 4]

Index(['x', 'y', 'z', 4], dtype='object')

False


### 4.4 删除类别

In [265]:
s = s.cat.remove_categories([4])
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): [x, y, z]

Index(['x', 'y', 'z'], dtype='object')

False


### 4.5 删除没用的类别 

In [266]:
s = pd.Series(pd.Categorical(['a', 'b', 'a'], categories = ['a', 'b', 'c', 'd']))
print(s)
print()
print(s.cat.remove_unused_categories())

0    a
1    b
2    a
dtype: category
Categories (4, object): [a, b, c, d]

0    a
1    b
2    a
dtype: category
Categories (2, object): [a, b]


### 4.6 设置类别

In [267]:
s = pd.Series(['one', 'two', 'four', '-'], dtype = 'category')
print(s)
print()
s = s.cat.set_categories(['one', 'two', 'three', 'four'])
print(s)

0     one
1     two
2    four
3       -
dtype: category
Categories (4, object): [-, four, one, two]

0     one
1     two
2    four
3     NaN
dtype: category
Categories (4, object): [one, two, three, four]


# 5. Sorting and Order

* pd.Categorical([], ordered = True)
* pd.api.types.CategoricalDtype(categories = [], ordered = True)
* pd.Series.cat.as_ordered()
* pd.Series.cat.as_unordered()
* pd.Series.cat.set_categories(categories = [], ordered = True)
* pd.Series.cat.reorder_categories(categories = [], ordered = True)

In [268]:
s = pd.Series(pd.Categorical(['a', 'b', 'c', 'a'], ordered = False))
s.sort_values(inplace = True)
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): [a, b, c]

Index(['a', 'b', 'c'], dtype='object')

False


In [270]:
s = pd.Series(['a', 'b', 'c', 'a']) \
    .astype(CategoricalDtype(ordered = True))
s.sort_values(inplace = True)
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)
print(s.min(), s.max())

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): [a < b < c]

Index(['a', 'b', 'c'], dtype='object')

True
a c


In [90]:
s.cat.as_ordered()

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): [a < b < c]

In [91]:
s.cat.as_unordered()

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): [a, b, c]

In [271]:
s = pd.Series([1, 2, 3, 1], dtype = 'category')
s = s.cat.set_categories([2, 3, 1], ordered = True)
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)
print()
print(s.max(), s.min())

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

Int64Index([2, 3, 1], dtype='int64')

True

1 2


In [272]:
s = pd.Series([1, 2, 3, 1], dtype = 'category')
s = s.cat.reorder_categories([2, 3, 1], ordered = True)
print(s)
print()
print(s.cat.categories)
print()
print(s.cat.ordered)
print()
print(s.max(), s.min())

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

Int64Index([2, 3, 1], dtype='int64')

True

1 2


In [274]:
dfs = pd.DataFrame({
    'A': pd.Categorical(list('bbeebbaa'), 
                        categories = ['e', 'a', 'b'], 
                        ordered = True),
    'B': [1, 2, 1, 2, 2, 1, 2, 1]
})
print(dfs)
print()
print(dfs.sort_values(by = ['A', 'B']))
dfs['A'] = dfs['A'].cat.reorder_categories(['a', 'b', 'e'])
print(dfs.sort_values(by = ['A', 'B']))

   A  B
0  b  1
1  b  2
2  e  1
3  e  2
4  b  2
5  b  1
6  a  2
7  a  1

   A  B
2  e  1
3  e  2
7  a  1
6  a  2
0  b  1
5  b  1
1  b  2
4  b  2
   A  B
7  a  1
6  a  2
0  b  1
5  b  1
1  b  2
4  b  2
2  e  1
3  e  2


# 6. 比较

In [122]:
cat = pd.Series([1, 2, 3]).astype(
    CategoricalDtype([3, 2, 1], 
                     ordered = True)
)
cat_base = pd.Series([2, 2, 2]).astype(
    CategoricalDtype([3, 2, 1], 
                     ordered = True)
)
cat_base2 = pd.Series([2, 2, 2]).astype(
    CategoricalDtype(ordered = True)
)

In [123]:
cat

0    1
1    2
2    3
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [124]:
cat_base

0    2
1    2
2    2
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [125]:
cat_base2

0    2
1    2
2    2
dtype: category
Categories (1, int64): [2]

In [126]:
cat > cat_base

0     True
1    False
2    False
dtype: bool

In [127]:
cat > 2

0     True
1    False
2    False
dtype: bool

In [128]:
cat == cat_base

0    False
1     True
2    False
dtype: bool

In [129]:
cat == np.array([1, 2, 3])

0    True
1    True
2    True
dtype: bool

In [130]:
cat == 2

0    False
1     True
2    False
dtype: bool

In [131]:
try:
    cat > cat_base2
except TypeError as e:
    print('TypeError: ', str(e))

TypeError:  Categoricals can only be compared if 'categories' are the same. Categories are different lengths


In [132]:
base = np.array([1, 2, 3])
try:
    cat > base
except TypeError as e:
    print("TypeError;", str(e))

TypeError; Cannot compare a Categorical for op __gt__ with type <class 'numpy.ndarray'>.
If you want to compare values, use 'np.asarray(cat) <op> other'.


In [133]:
np.asarray(cat) > base

array([False, False, False])

In [134]:
c1 = pd.Categorical(['a', 'b'], categories = ['a', 'b'], ordered = False)
c1 = pd.Categorical(['a', 'b'], categories = ['b', 'a'], ordered = False)
c1 == c2

array([False, False])

# 7. 操作

In [135]:
s = pd.Series(pd.Categorical(['a', 'b', 'c', 'c'], 
                             categories = ['c', 'a', 'b', 'd']))
s.value_counts()

c    2
b    1
a    1
d    0
dtype: int64

In [176]:
cats = pd.Categorical(['a', 'b', 'b', 'b', 'c', 'c', 'c'], 
                      categories = ['a', 'b', 'c', 'd'])
df = pd.DataFrame({
    'cats': cats, 
    'values': [1, 2, 2, 2, 3, 4, 5]
})
df

Unnamed: 0,cats,values
0,a,1
1,b,2
2,b,2
3,b,2
4,c,3
5,c,4
6,c,5


In [178]:
df.groupby('cats').mean()

Unnamed: 0_level_0,values
cats,Unnamed: 1_level_1
a,1.0
b,2.0
c,4.0
d,


In [179]:
cats2 = pd.Categorical(['a', 'a', 'b', 'b'], 
                       categories = ['a', 'b', 'c'])
df2 = pd.DataFrame({
    'cats': cats2,
    'B': ['c', 'd', 'c', 'd'],
    'values': [1, 2, 3, 4]
})
df2

Unnamed: 0,cats,B,values
0,a,c,1
1,a,d,2
2,b,c,3
3,b,d,4


In [181]:
df2.groupby(['cats', 'B']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,values
cats,B,Unnamed: 2_level_1
a,c,1.0
a,d,2.0
b,c,3.0
b,d,4.0
c,c,
c,d,


In [186]:
raw_cat = pd.Categorical(['a', 'a', 'b', 'b'], 
                         categories = ['a', 'b', 'c'])
df3 = pd.DataFrame({
    'A': raw_cat,
    'B': ['c', 'd', 'c', 'd'],
    'values': [1, 2, 3, 4]
})
df3

Unnamed: 0,A,B,values
0,a,c,1
1,a,d,2
2,b,c,3
3,b,d,4


In [189]:
pd.pivot_table(df3, values = 'values', index = ['A', 'B'])

Unnamed: 0_level_0,Unnamed: 1_level_0,values
A,B,Unnamed: 2_level_1
a,c,1
a,d,2
b,c,3
b,d,4


# 8. 数据修改(data munging)

### 8.1 Getting

In [278]:
idx = pd.Index(['h', 'i', 'j', 'k', 'l', 'm', 'n'])
cats = pd.Series(['a', 'b', 'b', 'b', 'c', 'c', 'c'],
                 dtype = 'category',
                 index = idx)
values = [1, 2, 2, 2, 3, 4, 5]
df = pd.DataFrame({
    'cats': cats,
    'values': values
}, index = idx)
df

Unnamed: 0,cats,values
h,a,1
i,b,2
j,b,2
k,b,2
l,c,3
m,c,4
n,c,5


In [298]:
print(df.iloc[2:4, :])
print()
print(df.iloc[2:4, :].dtypes)

  cats  values
j    y       2
k    y       2

cats      category
values       int64
dtype: object


In [284]:
df.loc['h':'j', 'cats']

h    a
i    b
j    b
Name: cats, dtype: category
Categories (3, object): [a, b, c]

In [286]:
df[df['cats'] == 'b']

Unnamed: 0,cats,values
i,b,2
j,b,2
k,b,2


In [288]:
df.loc['h', :]

cats      a
values    1
Name: h, dtype: object

In [290]:
df.iat[0, 0]

'a'

In [295]:
df['cats'].cat.categories = ['x', 'y', 'z']
print(df)
print()
print(df.at['h', 'cats'])

  cats  values
h    x       1
i    y       2
j    y       2
k    y       2
l    z       3
m    z       4
n    z       5

x


In [296]:
df.loc[['h'], 'cats']

h    x
Name: cats, dtype: category
Categories (3, object): [x, y, z]

#### 8.2 String and datatime accessors

In [300]:
str_s = pd.Series(list('aabb'))
str_cat = str_s.astype('category')
str_cat.str.contains('a')

0     True
1     True
2    False
3    False
dtype: bool

In [304]:
date_s = pd.Series(pd.date_range('1/1/2019', periods = 5, freq = 'D'))
date_cat = date_s.astype('category')
print(date_cat)
print()
print(date_cat.dt.day)

0   2019-01-01
1   2019-01-02
2   2019-01-03
3   2019-01-04
4   2019-01-05
dtype: category
Categories (5, datetime64[ns]): [2019-01-01, 2019-01-02, 2019-01-03, 2019-01-04, 2019-01-05]

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [306]:
ret_s = str_s.str.contains('a')
ret_cat = str_cat.str.contains('a')
ret_s.dtype == ret_cat.dtype
ret_s == ret_cat

0    True
1    True
2    True
3    True
dtype: bool

# 9. Getting Data in / out

In [169]:
import io
s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'd']))
s.cat.categories = ['very good', 'good', 'bad']
s = s.cat.set_categories(['very bad', 'bad', 'medium', 'good', 'very good'])

df = pd.DataFrame({
    'cats': s,
    'vals': [1, 2, 3, 4, 5, 6]
})
df.dtypes

cats    category
vals       int64
dtype: object

In [170]:
csv = io.StringIO()
df.to_csv(csv)
df2 = pd.read_csv(io.StringIO(csv.getvalue()))
df2.dtypes

Unnamed: 0     int64
cats          object
vals           int64
dtype: object

In [171]:
df2['cats']

0    very good
1         good
2         good
3    very good
4    very good
5          bad
Name: cats, dtype: object

In [172]:
df2['cats'] = df2['cats'].astype('category')
df2['cats'].cat.set_categories(['very bad', 'bad', 'medium', 'good', 'very good'], inplace = True)
df2.dtypes

Unnamed: 0       int64
cats          category
vals             int64
dtype: object

In [173]:
df2['cats']

0    very good
1         good
2         good
3    very good
4    very good
5          bad
Name: cats, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

# 10. Miss Data

In [146]:
s = pd.Series(['a', 'b', np.nan, 'a'], dtype = 'category')
s

0      a
1      b
2    NaN
3      a
dtype: category
Categories (2, object): [a, b]

In [148]:
s.cat.codes

0    0
1    1
2   -1
3    0
dtype: int8

In [151]:
pd.isna(s)

0    False
1    False
2     True
dtype: bool

In [153]:
s.fillna('a')

0    a
1    b
2    a
dtype: category
Categories (2, object): [a, b]

# 11. Gotchas

In [137]:
s = pd.Series(['foo', 'bar'] * 1000)
s.nbytes

16000

In [144]:
s.astype('category').nbytes

2016