In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

# Object creation

## Series creation

In [2]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")

s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [3]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})

df

Unnamed: 0,A
0,a
1,b
2,c
3,a


In [4]:
df["B"] = df["A"].astype("category")

df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [6]:
# Seed the generator so the sampled values (and everything derived from them)
# are reproducible under Restart Kernel -> Run All.
rng = np.random.default_rng(seed=0)
df = pd.DataFrame({"value": rng.integers(0, 100, size=20)})

# One label per decade-wide bin: "0 - 9", "10 - 19", ..., "90 - 99".
labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
labels

['0 - 9',
 '10 - 19',
 '20 - 29',
 '30 - 39',
 '40 - 49',
 '50 - 59',
 '60 - 69',
 '70 - 79',
 '80 - 89',
 '90 - 99']

In [11]:
# Bin every value into its decade. The edges are 0, 10, ..., 100 and
# right=False makes each interval left-closed, i.e. [lo, hi).
df["group"] = pd.cut(
    df["value"],
    bins=np.arange(0, 101, 10),
    right=False,
    labels=labels,
)
df

Unnamed: 0,value,group
0,3,0 - 9
1,51,50 - 59
2,53,50 - 59
3,73,70 - 79
4,96,90 - 99
5,89,80 - 89
6,13,10 - 19
7,61,60 - 69
8,53,50 - 59
9,77,70 - 79


In [13]:
# "a" is not one of the declared categories, so both of its occurrences become
# NaN; "d" is declared but never observed in the data.
raw_cat = pd.Categorical(list("abca"), categories=list("bcd"), ordered=False)

raw_cat

[NaN, 'b', 'c', NaN]
Categories (3, object): ['b', 'c', 'd']

In [14]:
s = pd.Series(raw_cat)

s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b', 'c', 'd']

In [15]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})

df["B"] = raw_cat

In [16]:
df

Unnamed: 0,A,B
0,a,
1,b,b
2,c,c
3,a,


In [17]:
df.dtypes

A      object
B    category
dtype: object

## DataFrame creation

In [18]:
df = pd.DataFrame({"A": list("abca"), "B": list("bccd")}, dtype="category")

df.dtypes

A    category
B    category
dtype: object

In [19]:
df = pd.DataFrame({"A": list("abca"), "B": list("bccd")})

df_cat = df.astype("category")

df_cat.dtypes

A    category
B    category
dtype: object

## Controlling behavior

In [20]:
# CategoricalDtype is imported in the notebook's top import cell.
s = pd.Series(["a", "b", "c", "a"])

# An explicit dtype fixes both the set of categories and their ordering;
# values outside that set ("a" here) are converted to NaN.
cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True)

s_cat = s.astype(cat_type)

s_cat

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b' < 'c' < 'd']

In [21]:
# Batch conversion: passing one CategoricalDtype to DataFrame.astype gives every
# column the same categories and ordering. CategoricalDtype is imported in the
# notebook's top import cell.
df = pd.DataFrame({"A": list("abca"), "B": list("bccd")})

cat_type = CategoricalDtype(categories=list("abcd"), ordered=True)

df_cat = df.astype(cat_type)

df_cat["A"]

0    a
1    b
2    c
3    a
Name: A, dtype: category
Categories (4, object): ['a' < 'b' < 'c' < 'd']

In [22]:
df_cat["B"]


0    b
1    c
2    c
3    d
Name: B, dtype: category
Categories (4, object): ['a' < 'b' < 'c' < 'd']

To perform table-wise conversion, where all labels in the entire DataFrame are used as categories for each column, the `categories` parameter can be determined programmatically with `categories = pd.unique(df.to_numpy().ravel())`.

If you already have codes and categories, you can use the `from_codes()` constructor to skip the factorize step that the normal constructor performs:

In [23]:
# Seeded generator: the random train/test assignment is identical on every run.
rng = np.random.default_rng(seed=0)
splitter = rng.choice([0, 1], size=5, p=[0.5, 0.5])

# The integer codes index directly into `categories` (0 -> "train", 1 -> "test").
s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))

## Regaining original data

In [24]:
s = pd.Series(["a", "b", "c", "a"])
s2 = s.astype("category")

# Two ways back from categorical data. Only a cell's last expression is
# displayed, so the Series round-trip is bound to a name instead of being
# evaluated and silently discarded.
s_restored = s2.astype(str)  # back to a plain, non-categorical Series
np.asarray(s2)  # back to a NumPy array of the original values


array(['a', 'b', 'c', 'a'], dtype=object)

# CategoricalDtype

In [25]:
# CategoricalDtype is imported in the notebook's top import cell. With only the
# categories supplied, the resulting dtype is unordered.
CategoricalDtype(["a", "b", "c"])

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

In [26]:
CategoricalDtype(["a", "b", "c"], ordered=True)


CategoricalDtype(categories=['a', 'b', 'c'], ordered=True)

In [27]:
CategoricalDtype()



CategoricalDtype(categories=None, ordered=False)

## Equality semantics

In [28]:
c1 = CategoricalDtype(["a", "b", "c"], ordered=False)


In [29]:
c1 == CategoricalDtype(["b", "c", "a"], ordered=False)


True

In [30]:
c1 == CategoricalDtype(["a", "b", "c"], ordered=True)


False

In [31]:
c1 == "category"


True

# Description

In [32]:
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])

df = pd.DataFrame({"cat": cat, "s": ["a", "c", "c", np.nan]})

df.describe()

Unnamed: 0,cat,s
count,3,3
unique,2,2
top,c,c
freq,2,2


In [33]:
df["cat"].describe()


count     3
unique    2
top       c
freq      2
Name: cat, dtype: object

# Working with categories

In [36]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")

s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [35]:
s.cat.ordered


False

In [38]:
s = pd.Series(pd.Categorical(["a", "b", "c", "a"], categories=["c", "b", "a"]))
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['c', 'b', 'a']

In [39]:
s.cat.categories


Index(['c', 'b', 'a'], dtype='object')

In [40]:
s.cat.ordered


False

In [41]:
s = pd.Series(list("babc")).astype(CategoricalDtype(list("abcd")))

s

0    b
1    a
2    b
3    c
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [42]:
s.cat.categories


Index(['a', 'b', 'c', 'd'], dtype='object')

In [43]:
s.unique()


['b', 'a', 'c']
Categories (4, object): ['a', 'b', 'c', 'd']

## Renaming categories

Renaming categories is done by using the rename_categories() method:



In [55]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")

s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [45]:
new_categories = ["Group %s" % g for g in s.cat.categories]


In [51]:
s = s.cat.rename_categories(new_categories)

s

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): ['Group a', 'Group b', 'Group c']

In [58]:
# Dict-like renaming maps old category -> new category. Keys that are not
# existing categories (3 here) are ignored, and categories without a mapping
# ("c") are kept unchanged.
#
# The mapping refers to the original labels, so it is applied to a fresh Series
# rather than to `s`: when the notebook runs top to bottom, `s` already carries
# the "Group ..." categories, none of the keys would match, and `s` has to keep
# those categories for the cells that follow.
s_renamed = pd.Series(["a", "b", "c", "a"], dtype="category").cat.rename_categories(
    {"a": "Group a", "b": "y", 3: "z"}
)

s_renamed

0    Group a
1          y
2          c
3    Group a
dtype: category
Categories (3, object): ['Group a', 'y', 'c']

## Appending new categories

In [48]:
s = s.cat.add_categories([4])

s.cat.categories

Index(['Group a', 'Group b', 'Group c', 4], dtype='object')

## Removing categories

In [49]:
s = s.cat.remove_categories([4])

s

0    Group a
1    Group b
2    Group c
3    Group a
dtype: category
Categories (3, object): ['Group a', 'Group b', 'Group c']

## Setting categories

In [52]:
s = pd.Series(["one", "two", "four", "-"], dtype="category")

s

0     one
1     two
2    four
3       -
dtype: category
Categories (4, object): ['-', 'four', 'one', 'two']

In [53]:
s = s.cat.set_categories(["one", "two", "three", "four"])

s

0     one
1     two
2    four
3     NaN
dtype: category
Categories (4, object): ['one', 'two', 'three', 'four']

# Sorting and order

In [59]:
# Sorting works on an unordered categorical as well; this intermediate result
# is replaced right below by the ordered variant.
s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False)).sort_values()

# Same data with an ordered dtype, so the categories have a defined order
# (a < b < c).
s = (
    pd.Series(["a", "b", "c", "a"])
    .astype(CategoricalDtype(ordered=True))
    .sort_values()
)

s

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [60]:
s.min(), s.max()

('a', 'c')

In [61]:
s = pd.Series(pd.Categorical(["a", "b", "c", "a"], ordered=False))

s.cat.as_ordered()


0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [64]:
s = s.sort_values()
s

0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [65]:
s.cat.as_ordered()


0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [66]:
s.cat.as_unordered()


0    a
3    a
1    b
2    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

Sorting will use the order defined by categories, not any lexical order present on the data type. This is even true for strings and numeric data:


In [67]:

# Declare a custom category order (2 < 3 < 1) that deliberately differs from
# the numeric order of the values.
s = (
    pd.Series([1, 2, 3, 1])
    .astype("category")
    .cat.set_categories([2, 3, 1], ordered=True)
)

s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [68]:
s = s.sort_values()

s

1    2
2    3
0    1
3    1
dtype: category
Categories (3, int64): [2 < 3 < 1]

In [69]:
s.min(), s.max()

(2, 1)

## Reordering

In [70]:
# Put the categories in the order 1 < 2 < 3 and mark the result as ordered.
s = pd.Series(pd.Categorical([1, 2, 3, 1])).cat.reorder_categories(
    [1, 2, 3], ordered=True
)

s

0    1
1    2
2    3
3    1
dtype: category
Categories (3, int64): [1 < 2 < 3]

In [72]:
s = s.sort_values()
s

0    1
3    1
1    2
2    3
dtype: category
Categories (3, int64): [1 < 2 < 3]

In [73]:
s.min(), s.max()

(1, 3)

## Multi column sorting

In [74]:
# Column "A" is an ordered categorical with the custom order e < a < b;
# column "B" holds plain integers.
dfs = pd.DataFrame(
    {
        "A": pd.Categorical(list("bbeebbaa"), categories=list("eab"), ordered=True),
        "B": [1, 2, 1, 2, 2, 1, 2, 1],
    }
)

In [75]:
dfs.sort_values(by=["A", "B"])


Unnamed: 0,A,B
2,e,1
3,e,2
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2


In [76]:
dfs.A

0    b
1    b
2    e
3    e
4    b
5    b
6    a
7    a
Name: A, dtype: category
Categories (3, object): ['e' < 'a' < 'b']

In [79]:
dfs.sort_values(by=['A','B'])

Unnamed: 0,A,B
2,e,1
3,e,2
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2


In [83]:
# Switch column "A" to the category order a, b, e; sorting by "A" follows it.
dfs["A"] = dfs["A"].cat.reorder_categories(["a", "b", "e"])

In [84]:
dfs.sort_values(by=["A", "B"])


Unnamed: 0,A,B
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2
2,e,1
3,e,2


# Comparisons

In [85]:
# `cat` and `cat_base` share one ordered dtype (3 < 2 < 1). `cat_base2` infers
# its categories from the data and therefore ends up with a different dtype.
cat = pd.Series([1, 2, 3], dtype=CategoricalDtype([3, 2, 1], ordered=True))

cat_base = pd.Series([2, 2, 2], dtype=CategoricalDtype([3, 2, 1], ordered=True))

cat_base2 = pd.Series([2, 2, 2], dtype=CategoricalDtype(ordered=True))

cat

0    1
1    2
2    3
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [86]:
cat_base

0    2
1    2
2    2
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [87]:
cat_base2

0    2
1    2
2    2
dtype: category
Categories (1, int64): [2]

In [88]:
cat>cat_base

0     True
1    False
2    False
dtype: bool

In [90]:
# Comparing categoricals whose categories differ raises TypeError. Catch it and
# print the message so this demonstration does not abort Restart & Run All.
try:
    cat > cat_base2
except TypeError as err:
    print("TypeError: {}".format(err))

TypeError: Categoricals can only be compared if 'categories' are the same.

In [91]:
c1 = pd.Categorical(["a", "b"], categories=["a", "b"], ordered=False)

c2 = pd.Categorical(["a", "b"], categories=["b", "a"], ordered=False)

c1 == c2

array([ True,  True])